
    h%F                       d Z ddlZddlmZ ddlmZmZmZmZ ddl	Z	ddl
Z	ddl	mZ ddlmZ ddlmZmZmZ dd	lmZ dd
lmZ ddlmZ ddlmZmZmZmZ ddlmZmZ ddl m!Z! ddl"m#Z#m$Z$m%Z%m&Z&m'Z'm(Z( ddl)m*Z* ddl+m,Z,m-Z-m.Z.  e'j^                  e0      Z1dSde	jd                  de	jf                  dee4   fdZ5	 dTde	jl                  de	jf                  de	jn                  de4fdZ8dTdZ9e e%d       G d de#                    Z:e e%d        G d! d"e#                    Z; G d# d$ejx                        Z=	 dUd%ejx                  d&e	jd                  d'e	jd                  d(e	jd                  d)ee	jd                     d*e>d+e>fd,Z? G d- d.ejx                        Z@ G d/ d0ejx                        ZA G d1 d2e      ZB G d3 d4ejx                        ZC G d5 d6ejx                        ZD G d7 d8ejx                        ZE G d9 d:ejx                        ZF G d; d<ejx                        ZG G d= d>e      ZH G d? d@ejx                        ZIe% G dA dBe             ZJ G dC dDeJ      ZK G dE dFeJ      ZL e%dG       G dH dIeJe             ZM G dJ dKejx                        ZN e%dL       G dM dNeJ             ZO e%dO       G dP dQeJe             ZPg dRZQy)VzPyTorch KOSMOS-2 model.    N)	dataclass)AnyCallableOptionalUnion)nn   )ACT2FN)CacheDynamicCacheEncoderDecoderCache)GenerationMixin)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutput)BaseModelOutputWithPastAndCrossAttentionsBaseModelOutputWithPooling!CausalLMOutputWithCrossAttentions)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)ModelOutputTransformersKwargsauto_docstringcan_return_tuplelogging	torch_int)deprecate_kwarg   )Kosmos2ConfigKosmos2TextConfigKosmos2VisionConfigmaskdtypetgt_lenc                 2   | j                         \  }}||n|}| ddddddf   j                  |d||      j                  |      }d|z
  }|j                  |j                  t        j
                        t	        j                  |      j                        S )z_
    Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
    Nr         ?)sizeexpandtomasked_filltorchboolfinfomin)r#   r$   r%   bszsrc_lenexpanded_maskinverted_masks          j/var/www/html/eduruby.in/venv/lib/python3.12/site-packages/transformers/models/kosmos2/modeling_kosmos2.py_expand_maskr5   .   s     99;LC ,g'GD$)*11#q'7KNNuUM-'M$$]%5%5ejj%A5;;uCUCYCYZZ    input_ids_shapedevicepast_key_values_lengthc                    | \  }}t        j                  ||ft        j                  |      j                  |      }t        j                  |j                  d      |      }|j                  ||dz   j                  |j                  d      d      k  d       |j                  |      }|dkD  r0t        j                  t        j                  ||||      |gd      }|ddddddf   j                  |d|||z         S )zB
    Make causal mask used for bi-directional self-attention.
    )r8   r   r   r$   r8   dimN)r,   fullr.   r/   aranger(   masked_fill_viewr*   catzerosr)   )r7   r$   r8   r9   r0   r%   r#   	mask_conds           r4   _make_causal_maskrF   <   s     #LC::w(%++e*<*@*@PDTYYr]6:Ii9q="6"6tyy}a"HH!L775>D!yy%++g/EU[abdhioqrdAq !((a'DZ:Z[[r6   c                     | j                  |      j                         }t        j                  |d      j	                  |      |z   |z  }|j                         |z   S )a  
    Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
    are ignored. This is modified from fairseq's `utils.make_positions`.

    Args:
        x: torch.Tensor x:

    Returns: torch.Tensor
    r   r=   )neintr,   cumsumtype_aslong)	input_idspadding_idxr9   r#   incremental_indicess        r4   "create_position_ids_from_input_idsrP   N   sW     <<$((*D <<!4<<TBE[[_cc##%33r6   ze
    Base class for text model's outputs that also contains a pooling of the last hidden states.
    )custom_introc                   @   e Zd ZU dZdZeej                     ed<   dZ	ee
e
ej                           ed<   dZee
ej                        ed<   dZee
ej                        ed<   dZeej                     ed<   dZee
ej                        ed<   dZeed	<   d
e
e   fdZy)Kosmos2ModelOutputa  
    past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
        `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and optionally if
        `config.is_encoder_decoder=True` 2 additional tensors of shape `(batch_size, num_heads,
        encoder_sequence_length, embed_size_per_head)`.

        Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
        `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`
        input) to speed up sequential decoding.
    image_embeds (`torch.FloatTensor` of shape `(batch_size, latent_query_num, hidden_size)`, *optional*):
        Sequence of hidden-states at the output of `Kosmos2ImageToTextProjection`.
    projection_attentions (`tuple(torch.FloatTensor)`, *optional*):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.

        Attentions weights given by `Kosmos2ImageToTextProjection`, after the attention softmax, used to compute
        the weighted average in the self-attention heads.
    vision_model_output (`BaseModelOutputWithPooling`, *optional*):
        The output of the [`Kosmos2VisionModel`].
    Nlast_hidden_statepast_key_valueshidden_states
attentionsimage_embedsprojection_attentionsvision_model_outputreturnc                 H     t         fd j                         D              S )Nc              3   d   K   | ]'  }|d vr|   nt        |      j                          ) yw)text_model_outputrZ   Ngetattrto_tuple.0kselfs     r4   	<genexpr>z.Kosmos2ModelOutput.to_tuple.<locals>.<genexpr>   =      
  LLDGRYZ^`aRbRkRkRmm
   -0tuplekeysrf   s   `r4   rb   zKosmos2ModelOutput.to_tuple   #     
YY[
 
 	
r6   )__name__
__module____qualname____doc__rT   r   r,   FloatTensor__annotations__rU   rk   rV   rW   rX   rY   rZ   r   r   rb    r6   r4   rS   rS   ^   s    , 6:x 1 129AEOXeE%*;*;$<=>E8<M8E%"3"345<59Ju0012904L(5,,-4@D8E%*;*;$<=D6:3:
%* 
r6   rS   zC
    Model output class for `Kosmos2ForConditionalGeneration`.
    c                   h   e Zd ZU dZdZeej                     ed<   dZ	eej                     ed<   dZ
eeeej                           ed<   dZeeej                        ed<   dZeeej                        ed<   dZeej                     ed<   dZeeej                        ed	<   dZeed
<   dee   fdZy)*Kosmos2ForConditionalGenerationModelOutputa  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction).
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
        `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and optionally if
        `config.is_encoder_decoder=True` 2 additional tensors of shape `(batch_size, num_heads,
        encoder_sequence_length, embed_size_per_head)`.

        Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
        `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`
        input) to speed up sequential decoding.
    image_embeds (`torch.FloatTensor` of shape `(batch_size, latent_query_num, hidden_size)`, *optional*):
        Sequence of hidden-states at the output of `Kosmos2ImageToTextProjection`.
    projection_attentions (`tuple(torch.FloatTensor)`, *optional*):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.

        Attentions weights given by `Kosmos2ImageToTextProjection`, after the attention softmax, used to compute
        the weighted average in the self-attention heads.
    vision_model_output (`BaseModelOutputWithPooling`, *optional*):
        The output of the [`Kosmos2VisionModel`].
    NlosslogitsrU   rV   rW   rX   rY   rZ   r[   c                 H     t         fd j                         D              S )Nc              3   d   K   | ]'  }|d vr|   nt        |      j                          ) ywr^   r`   rc   s     r4   rg   zFKosmos2ForConditionalGenerationModelOutput.to_tuple.<locals>.<genexpr>   rh   ri   rj   rm   s   `r4   rb   z3Kosmos2ForConditionalGenerationModelOutput.to_tuple   rn   r6   )ro   rp   rq   rr   rx   r   r,   rs   rt   ry   rU   rk   rV   rW   rX   rY   rZ   r   r   rb   ru   r6   r4   rw   rw      s    4 )-D(5$$
%,*.FHU&&'.AEOXeE%*;*;$<=>E8<M8E%"3"345<59Ju0012904L(5,,-4@D8E%*;*;$<=D6:3:
%* 
r6   rw   c                        e Zd Zdef fdZdej                  dededej                  fdZd
dej                  dej                  fd	Z
 xZS )Kosmos2VisionEmbeddingsconfigc                    t         |           || _        |j                  | _        |j
                  | _        |j                  | _        t        j                  t        j                  | j                              | _        t        j                  |j                  | j                  | j                  | j                  d      | _        | j
                  | j                  z  dz  | _        | j                  dz   | _        t        j"                  | j                   | j                        | _        | j'                  dt        j(                  | j                         j+                  d      d       y )NF)in_channelsout_channelskernel_sizestridebias   r   position_ids)r   r;   
persistent)super__init__r~   hidden_size	embed_dim
image_size
patch_sizer   	Parameterr,   randnclass_embeddingConv2dnum_channelspatch_embeddingnum_patchesnum_positions	Embeddingposition_embeddingregister_bufferr@   r)   rf   r~   	__class__s     r4   r   z Kosmos2VisionEmbeddings.__init__   s	   ++ ++ ++!||EKK,GH!yy++?? 
 !OOt>1D!--1"$,,t/A/A4>>"R^U\\$:L:L-M-T-TU\-]jopr6   
embeddingsheightwidthr[   c                    |j                   d   dz
  }| j                  j                  j                  d      }|j                   d   dz
  }t        j
                  j                         s%||k(  r ||k(  r| j                  | j                        S |ddddf   }|ddddf   }|j                   d   }	|| j                  z  }
|| j                  z  }t        |dz        }|j                  d|||	      }|j                  dddd      }t        j                  j                  ||
|fdd	
      }|j                  dddd      j                  dd|	      }t	        j                   ||fd      S )a   
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        r   r   Nr;   g      ?r	   r   bicubicF)r(   modealign_cornersr=   )shaper   weight	unsqueezer,   jit
is_tracingr   r   r   reshapepermuter   
functionalinterpolaterB   rC   )rf   r   r   r   r   r   r   class_pos_embedpatch_pos_embedr>   
new_height	new_widthsqrt_num_positionss                r4   interpolate_pos_encodingz0Kosmos2VisionEmbeddings.interpolate_pos_encoding   sv    !&&q)A-!44;;EEaH*003a7 yy##%+*F6UZ?**4+<+<==,QU3,QU3r"t.
T__,	&}c'9:)11!5GI[]`a)11!Q1=--33i(	 4 
 *11!Q1=BB1b#Nyy/?;CCr6   pixel_valuesc                 `   |j                   \  }}}}|sJ|| j                  k7  s|| j                  k7  r,t        d| d| d| j                   d| j                   d	      | j                  j                  j
                  }| j                  |j                  |            }|j                  d      j                  dd      }| j                  j                  |dd      }	t        j                  |	|gd	      }
|r|
| j                  |
||      z   }
|
S |
| j                  | j                        z   }
|
S )
NzInput image size (*z) doesn't match model ().r$   r   r   r;   r=   )r   r   
ValueErrorr   r   r$   r*   flatten	transposer   r)   r,   rC   r   r   r   )rf   r   r   
batch_size_r   r   target_dtypepatch_embedsclass_embedsr   s              r4   forwardzKosmos2VisionEmbeddings.forward   s6   '3'9'9$
Avu'Vt-F%SWSbSbJb$VHAeW4KDOOK\\]^b^m^m]nnpq  ++2288++LOO,O,OP#++A.88A>++22:q"EYYl;C
##d&C&CJPVX]&^^J  $d&=&=d>O>O&PPJr6   F)ro   rp   rq   r"   r   r,   TensorrI   r   rs   r   __classcell__r   s   @r4   r}   r}      se    q2 q,'D5<< 'D 'DUX 'D]b]i]i 'DRE$5$5 Z_ZfZf r6   r}   modulequerykeyvalueattention_maskscalingdropoutc                 p   t        j                  ||j                  dd            |z  }|||z   }t        j                  j                  |d      }t        j                  j                  ||| j                        }t        j                  ||      }	|	j                  dd      j                         }	|	|fS )Nr;   r=   ptrainingr   r   )	r,   matmulr   r   r   softmaxr   r   
contiguous)
r   r   r   r   r   r   r   kwargsattn_weightsattn_outputs
             r4   eager_attention_forwardr     s     <<s}}R'<=GL!#n4==((2(>L==((6??([L,,|U3K''1-88:K$$r6   c                        e Zd ZdZ fdZ	 	 	 d	dej                  deej                     deej                     dee   de	ej                  eej                     f   f
dZ
 xZS )
Kosmos2VisionAttention=Multi-headed attention from 'Attention Is All You Need' paperc                    t         |           || _        |j                  | _        |j
                  | _        | j                  | j                  z  | _        | j                  | j                  z  | j                  k7  r&t        d| j                   d| j                   d      | j                  dz  | _	        |j                  | _        d| _        t        j                  | j                  | j                        | _        t        j                  | j                  | j                        | _        t        j                  | j                  | j                        | _        t        j                  | j                  | j                        | _        y )N;embed_dim must be divisible by num_heads (got `embed_dim`:  and `num_heads`: r         F)r   r   r~   r   r   num_attention_heads	num_headshead_dimr   scaleattention_dropoutr   	is_causalr   Lineark_projv_projq_projout_projr   s     r4   r   zKosmos2VisionAttention.__init__)  s   ++33$..8==4>>)T^^;MdnnM] ^NN#2'  ]]D(
//ii?ii?ii?		$..$..Ar6   rV   r   causal_attention_maskoutput_attentionsr[   c           
         |j                   \  }}}| j                  |      }| j                  |      }	| j                  |      }
|j	                  ||| j
                  | j                        j                  dd      }|	j	                  ||| j
                  | j                        j                  dd      }	|
j	                  ||| j
                  | j                        j                  dd      }
| j                  j                  dk7  r||||z   }n||}n	|du| _
        t        }| j                  j                  dk7  rN| j                  j                  dk(  r|rt        j                  d       nt        | j                  j                     } || ||	|
|| j                  | j                  | j                   sdn| j"                  	      \  }}|j%                  |||      j'                         }| j)                  |      }|sd}||fS )
#Input shape: Batch x Time x Channelr   r   flash_attention_2Neagersdpa`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.        )r   r   r   )r   r   r   r   rB   r   r   r   r~   _attn_implementationr   r   loggerwarning_oncer   r   r   r   r   r   r   )rf   rV   r   r   r   r   
seq_lengthr   queriesrl   valuesattention_interfacer   r   s                 r4   r   zKosmos2VisionAttention.forward=  s    -:,?,?)
J	++m,{{=)]+,,z:t~~t}}U__`acdeyyZOYYZ[]^_ZT^^T]]S]]^_abc ;;++/BB).C.O!/2G!G&2!62$>DN(?;;++w6{{//69>O##L
 '>dkk>^>^&_#$7nnJJ#}}C$,,	%
!\ "))*j)LWWYmmK0 LL((r6   )NNF)ro   rp   rq   rr   r   r,   r   r   r-   rk   r   r   r   s   @r4   r   r   &  s}    GB. 268<,15)||5) !.5)  (5	5)
 $D>5) 
u||Xell33	45)r6   r   c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )Kosmos2VisionMLPc                    t         |           || _        t        |j                     | _        t        j                  |j                  |j                        | _
        t        j                  |j                  |j                        | _        y N)r   r   r~   r
   
hidden_actactivation_fnr   r   r   intermediate_sizefc1fc2r   s     r4   r   zKosmos2VisionMLP.__init__w  sd    #F$5$5699V//1I1IJ99V55v7I7IJr6   rV   r[   c                 l    | j                  |      }| j                  |      }| j                  |      }|S r   )r   r   r  rf   rV   s     r4   r   zKosmos2VisionMLP.forward~  s4    /**=9/r6   )ro   rp   rq   r   r,   r   r   r   r   s   @r4   r   r   v  s$    KU\\ ell r6   r   c                        e Zd Zdef fdZ	 d	dej                  dej                  dej                  dee   de	ej                     f
dZ xZS )
Kosmos2VisionEncoderLayerr~   c                 D   t         |           |j                  | _        t	        |      | _        t        j                  | j                  |j                        | _	        t        |      | _        t        j                  | j                  |j                        | _        y Neps)r   r   r   r   r   	self_attnr   	LayerNormlayer_norm_epslayer_norm1r   mlplayer_norm2r   s     r4   r   z"Kosmos2VisionEncoderLayer.__init__  sm    ++/7<<F<Q<QR#F+<<F<Q<QRr6   rV   r   r   r   r[   c                     |}| j                  |      }| j                  ||||      \  }}||z   }|}| j                  |      }| j                  |      }||z   }|f}|r||fz  }|S )aI  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
                `(config.encoder_attention_heads,)`.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        )rV   r   r   r   )r  r
  r  r  )rf   rV   r   r   r   residualr   outputss           r4   r   z!Kosmos2VisionEncoderLayer.forward  s    " !((7&*nn')"7/	 '5 '
#| !=0 ((7/ =0 "&Gr6   r   )ro   rp   rq   r"   r   r,   r   r   r-   rk   rs   r   r   r   s   @r4   r  r    sg    S2 S -2&||& &  %||	&
 $D>& 
u  	!&r6   r  c                        e Zd ZdZdef fdZe	 	 	 	 	 ddeej                     deej                     dee
   dee
   dee
   d	eeef   fd
       Z xZS )Kosmos2VisionEncoderz
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`Kosmos2VisionEncoderLayer`].

    Args:
        config: Kosmos2VisionConfig
    r~   c                     t         |           || _        t        j                  t        |j                        D cg c]  }t        |       c}      | _        d| _	        y c c}w )NF)
r   r   r~   r   
ModuleListrangenum_hidden_layersr  layersgradient_checkpointing)rf   r~   r   r   s      r4   r   zKosmos2VisionEncoder.__init__  sQ    mmPUV\VnVnPo$p1%>v%F$pq&+# %qs   A#r   r   r   output_hidden_statesreturn_dictr[   c                 j   ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }|rdnd}|rdnd}|}	t	        | j
                        D ]*  \  }
}|r||	fz   } ||	|||      }|d   }	|s"||d   fz   }, |r||	fz   }t        |	||      S )a  
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            causal_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Causal mask for the text model. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        Nru   )r   r   r   )rT   rV   rW   )r~   r   r  use_return_dict	enumerater  r   )rf   inputs_embedsr   r   r   r  r  encoder_statesall_attentionsrV   idxencoder_layerlayer_outputss                r4   r   zKosmos2VisionEncoder.forward  s    N 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]30d%"+DKK"8 	FC#!/=2B!B)%"3	M *!,M !/=3C2E!E	F  +}.>>N+>Vd
 	
r6   )NNNNN)ro   rp   rq   rr   r"   r   r   r   r,   r   r-   r   rk   r   r   r   r   s   @r4   r  r    s    ,2 ,  268<,0/3&*D
 !.D
  (5	D

 $D>D
 'tnD
 d^D
 
uo%	&D
 D
r6   r  c                        e Zd Zdef fdZ	 	 	 	 	 d
deej                     dee   dee   dedee   de	e
ef   fd	Z xZS )Kosmos2VisionTransformerr~   c                     t         |           || _        |j                  }t	        |      | _        t        j                  ||j                        | _	        t        |      | _        t        j                  ||j                        | _        y r  )r   r   r~   r   r}   r   r   r  r  pre_layrnormr  encoderpost_layernorm)rf   r~   r   r   s      r4   r   z!Kosmos2VisionTransformer.__init__  sj    &&	1&9LL8M8MN+F3 ll9&:O:OPr6   r   r   r  r   r  r[   c                    ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }|t	        d      | j                  ||      }| j                  |      }| j                  ||||      }|d   }|d d dd d f   }	| j                  |	      }	|s
||	f|dd  z   S t        ||	|j                  |j                        S )Nz You have to specify pixel_values)r   )r   r   r  r  r   r   )rT   pooler_outputrV   rW   )r~   r   r  r  r   r   r)  r*  r+  r   rV   rW   )
rf   r   r   r  r   r  rV   encoder_outputsrT   pooled_outputs
             r4   r   z Kosmos2VisionTransformer.forward  s    2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]?@@Ogh))-8,,'/!5#	 ' 
 ,A.)!Q'2++M:%}58KKK)/')77&11	
 	
r6   NNNFN)ro   rp   rq   r"   r   r   r,   rs   r-   r   rk   r   r   r   r   s   @r4   r'  r'    s    Q2 Q 59,0/3).&*'
u001'
 $D>'
 'tn	'

 #''
 d^'
 
u00	1'
r6   r'  c                       e Zd ZdZddededee   f fdZddededee   fdZeddededee   fd       Z	 e
j                         	 	 	 	 dd	ee
j                     d
ee
j                     dedee
j                     fd       Zd Z xZS )(Kosmos2TextSinusoidalPositionalEmbeddingzDThis module produces sinusoidal positional embeddings of any length.r   embedding_dimrN   c                     t         |           d| _        || _        || _        | j                  || j                  z   ||       y )Nr   )r   r   offsetr3  rN   make_weights)rf   r   r3  rN   r   s       r4   r   z1Kosmos2TextSinusoidalPositionalEmbedding.__init__L  s@    *&-$++5}kRr6   num_embeddingsc                     | j                  |||      }t        | d      r;|j                  | j                  j                  | j                  j
                        }| j                  d|d       y )Nweightsr<   Fr   )get_embeddinghasattrr*   r9  r$   r8   r   )rf   r7  r3  rN   emb_weightss        r4   r6  z5Kosmos2TextSinusoidalPositionalEmbedding.make_weightsT  s[    ((T4#%..t||/A/A$,,J]J].^KYFr6   c                    |dz  }t        j                  d      |dz
  z  }t        j                  t        j                  |t        j
                        j                         | z        }t        j                  | t        j
                        j                         j                  d      |j                  d      z  }t        j                  t        j                  |      t        j                  |      gd      j                  | d      }|dz  dk(  r-t        j                  |t        j                  | d      gd      }|	d||ddf<   |j                  t        j                               S )	z
        Build sinusoidal embeddings.

        This matches the implementation in tensor2tensor, but differs slightly from the description in Section 3.5 of
        "Attention Is All You Need".
        r   i'  r   r   r   r=   r;   N)mathlogr,   expr@   int64floatr   rC   sincosrB   rD   r*   get_default_dtype)r7  r3  rN   half_dimembs        r4   r:  z6Kosmos2TextSinusoidalPositionalEmbedding.get_embedding\  s    !A%hhuoA.iiXU[[AGGISDPQll>=CCEOOPQRUXUbUbcdUeeii338a@EEnVXY1!))S%++na"@AqIC""#CQvve--/00r6   rM   r   r9   r   c                 v   |F|j                         \  }}|[t        || j                  |      j                  |j                        }n*|j                         d d \  }}|| j                  ||      }| j                  dz   |z   |z   }|| j                  j                  d      kD  r4| j                  || j                  z   | j                  | j                         | j                  j                  d|j                  d            j                  ||| j                  j                  d         j                         S )Nr;   r   r   )r(   rP   rN   r*   r8   &create_position_ids_from_inputs_embedsr9  r6  r5  r3  index_selectrB   r   detach)rf   rM   r   r9   r   r0   seq_lenmax_poss           r4   r   z0Kosmos2TextSinusoidalPositionalEmbedding.forwardr  s'     $>>+LC#At//1G "Y%%&  )--/4LC##JJ=Zpq ""Q&03IIT\\&&q))g3T5G5GIYIYZ||((L,=,=b,ABGGWVZVbVbVhVhikVlmttvvr6   c                 0   |j                         dd }|d   }t        j                  | j                  dz   || j                  z   dz   t        j                  |j
                        }|j                  d      j                  |      j                         |z   S )z
        We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.

        Args:
            inputs_embeds: torch.Tensor

        Returns: torch.Tensor
        Nr;   r   r<   r   )	r(   r,   r@   rN   rL   r8   r   r)   r   )rf   r   r9   input_shapesequence_lengthr   s         r4   rI  zOKosmos2TextSinusoidalPositionalEmbedding.create_position_ids_from_inputs_embeds  s     $((*3B/%a.||q /D4D4D"Dq"HPUPZPZcpcwcw
 %%a(//<GGILbbbr6   r   )NNr   N)ro   rp   rq   rr   rI   r   r   r6  staticmethodr:  r,   no_gradr   r   rI  r   r   s   @r4   r2  r2  H  s    NSc S# SHUXM SG3 Gs GQYZ]Q^ G 1c 1# 1HUXM 1 1( U]]_ -104&'/3wELL)w  -w !$	w
 u||,w w6cr6   r2  c                   x    e Zd ZdZ	 	 	 	 	 ddedededee   dee   dee   dee   f fd	Z e	d
dd      	 	 	 	 	 	 dde
j                  dee
j                     dee   dee
j                     dee
j                     dedee
j                     dee
j                  ee
j                     ee   f   fd       Z xZS )KosmosTextAttentionr   r   r   r   
is_decoderadd_inner_attn_layernormr   	layer_idxc	                 j   t         	|           || _        || _        || _        || _        ||z  | _        | j                  |z  | j                  k7  rt        d| j                   d| d      | j                  dz  | _        || _	        || _
        t        j                  |||      | _        t        j                  |||      | _        t        j                  |||      | _        t        j                  |||      | _        d | _        |r't        j$                  ||j&                        | _        y y )Nr   r   r   r   )r   r  )r   r   r~   r   r   r   r   r   r   rU  rW  r   r   r   r   r   r   inner_attn_lnr  r  )
rf   r~   r   r   r   rU  rV  r   rW  r   s
            r4   r   zKosmosTextAttention.__init__  s    	""!Y.MMI%$..8MdnnM]$YKr3  }}d*$"ii	94@ii	94@ii	94@		)YTB "#!#iV=R=R!SD $r6   past_key_valuerU   4.58new_nameversionrV   encoder_hidden_statesr   layer_head_maskr   cache_positionr[   c                    |du}	|j                   dd \  }
}| j                  |      }|j                  |
|| j                  | j                        j                  dd      }|St        |t              rA|j                  j                  | j                        }|	r|j                  }n|j                  }n|}|	r|n|}|	rK|IrGj                  | j                     j                  }|j                  | j                     j                  }n| j!                  |      }| j#                  |      }|j                  |
d| j                  | j                        j                  dd      }|j                  |
d| j                  | j                        j                  dd      }|D|	s|nd}j%                  ||| j                  d|i      \  }}|	rd|j                  | j                  <   t&        }| j(                  j*                  dk7  rN| j(                  j*                  dk(  r|rt,        j/                  d	       nt0        | j(                  j*                     } || ||||f| j2                  sd
n| j4                  | j6                  d|\  }}|j9                  |
|d      j;                         }| j<                  | j=                  |      }| j?                  |      }||fS )r   Nr   r   r;   ra  Tr   r   r   r   )r   r   ) r   r   rB   r   r   r   
isinstancer   
is_updatedgetrW  cross_attention_cacheself_attention_cacher  rl   r   r   r   updater   r~   r   r   r   r   r   r   r   r   r   rY  r   )rf   rV   r_  rU   r   r`  r   ra  r   is_cross_attentionr   r   query_statesrd  curr_past_key_valuecurrent_states
key_statesvalue_statesr   r   r   s                        r4   r   zKosmosTextAttention.forward  s     3$>!.!4!4Ra!8
J{{=1#((ZQUQ^Q^_iijkmno&/+>?,77;;DNNK
%*9*O*O'*9*N*N'&5#2D.-/"=*,33DNNCHHJ.55dnnELLL^4J;;~6L#RWaabcefgJ',,ZT^^T]][eefgijkL*7It+>+E+Ednn?OQ_>`,(
L &AEO..t~~>(?;;++w6{{//69>O##L
 '>dkk>^>^&_#$7	%
  $}}C$,,LL	%
 	%
!\ "))*j"EPPR),,[9KmmK0L((r6   )r   FFTN)NNNNFN)ro   rp   rq   rr   rI   rB  r   r-   r   r   r,   r   r   rk   r   r   r   s   @r4   rT  rT    sP   G %*38#$(#T #T 	#T
 #T TN#T #+4.#T tn#T D>#TJ %0A6R 9=+/1526"'15Q)||Q)  (5Q) "%	Q)
 !.Q) "%,,/Q)  Q) !.Q) 
u||Xell3Xe_D	EQ) SQ)r6   rT  c                   *     e Zd Zdef fdZd Z xZS )Kosmos2TextFFNr~   c                    t         |           |j                  | _        t        |j                     | _        |j                  | _        t        j                  |j                  |j                        | _        t        j                  |j                  |j                        | _        t        j                  |j                  |j                        | _        y r  )r   r   r   r
   activation_functionr   activation_dropoutr   r   r   ffn_dimr   r  r  r  ffn_layernormr   s     r4   r   zKosmos2TextFFN.__init__  s    ~~#F$>$>?"(";";99V--v~~>99V^^V-=-=>\\&..f>S>STr6   c                 b   | j                  | j                  |            }t        j                  j	                  || j
                  | j                        }| j                  |      }| j                  |      }t        j                  j	                  || j                  | j                        }|S )Nr   )	r   r   r   r   r   rs  r   ru  r  r  s     r4   r   zKosmos2TextFFN.forward+  s    **488M+BC--mt?V?Vaeanan-o**=9/--mt||VZVcVc-dr6   )ro   rp   rq   r!   r   r   r   r   s   @r4   rp  rp    s    
U0 
Ur6   rp  c                       e Zd Zddef fdZ eddd      	 	 	 	 	 	 	 	 	 ddej                  deej                     d	eej                     d
eej                     deej                     deej                     dee	   dee
   dee
   deej                     deej                  eeej                  ej                  f      f   fd       Z xZS )Kosmos2TextBlockr~   c           	         t         |           |j                  | _        t        || j                  |j                  |j
                  dd|      | _        |j                  | _        t        j                  | j                  |j                        | _        |j                  ret        || j                  |j                  |j
                  dd|      | _        t        j                  | j                  |j                        | _        t        |      | _        t        j                  | j                  |j                        | _        y )NT)r   r   r   rU  rV  rW  r  F)r   r   r   rT  attention_headsr   r
  r   r   r  r  self_attn_layer_normadd_cross_attentionencoder_attnencoder_attn_layer_normrp  ffnfinal_layer_norm)rf   r~   rW  r   s      r4   r   zKosmos2TextBlock.__init__6  s    )),nn,,,,%)
 ~~$&LLVEZEZ$[!%% 3.. 0000).#!D ,.<<FLaLa+bD(!&) "T^^AVAV Wr6   rZ  rU   r[  r\  rV   r   r_  encoder_attention_maskr`  cross_attn_layer_head_maskr   	use_cachera  r[   c                 X   |}| j                  |      } | j                  d||||||
d|\  }}t        j                  j	                  || j                  | j
                        }||z   }d }|t        | d      st        d|  d      |}| j                  |      } | j                  d|||||||
d|\  }}t        j                  j	                  || j                  | j
                        }||z   }|}| j                  |      }| j                  |      }||z   }|f}|r|||fz  }|S )N)rV   rU   r   r`  r   ra  r   r}  z'If `encoder_hidden_states` are passed, z` has to be instantiated with cross-attention layers by setting `config.add_cross_attention=True`)rV   r_  r   r`  rU   r   ra  ru   )r{  r
  r   r   r   r   r;  r   r~  r}  r  r  )rf   rV   r   r_  r  r`  r  rU   r   r  ra  r   r  self_attn_weightscross_attn_weightsr  s                   r4   r   zKosmos2TextBlock.forwardU  s    !11-@+94>> ,
'+)+/),
 ,
(( --mt||VZVcVc-d =0 " ,40 =dV DD D 
 %H 88GM0A0A0A 	1+&;5 : /"3-	1 	1-M- MM11-4<<Z^ZgZg1hM$}4M !--m< / =0 ")+=>>Gr6   r   )	NNNNNNFTN)ro   rp   rq   r!   r   r   r,   r   r   r   r-   rk   rs   r   r   r   s   @r4   rx  rx  5  s9   X0 X> %0A6R 268<9=26=A+/,1$(15C||C !.C  (5	C
 !) 6C "%,,/C %-U\\$:C "%C $D>C D>C !.C 
u  (51B1BEDUDU1U+V"WW	XC SCr6   rx  c            '           e Zd ZdZdef fdZd Z	 	 	 	 	 ddeej                     deej                     deej                     de
d	eej                     f
d
Z	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     deej                     deej                     deej                     deej                     deej                     deeej                        deej                     d	eej                     dee   dee   dee   dee   deej                     dee   deeef   f$dZ xZS )Kosmos2TextTransformerz
    Transformer decoder consisting of `config.layers` layers. Each layer is a [`Kosmos2TextBlock`].

    Args:
        config: Kosmos2TextConfig
    r~   c           	         t         |           || _        |j                  | _        |j                  | _        |j
                  rt        j                  |j                        nd| _	        t        j                  |j                  |j                  |j                        | _        t        |j                   |j                  |j                        | _        t        j$                  t'        |j(                        D cg c]  }t+        ||       c}      | _        t        j,                  |j                  |j.                        | _        d| _        y c c}w )Nr'   )rN   )r   r3  rN   )rW  F)r   r   r~   r   	layerdropscale_embeddingr>  sqrtr   embed_scaler   r   
vocab_sizepad_token_idembed_tokensr2  max_position_embeddingsembed_positionsr  r  r  rx  r  r  
layer_normr  )rf   r~   ir   s      r4   r   zKosmos2TextTransformer.__init__  s    ~~)):@:P:P499V%5%56VYLL):):F<L<LZ`ZmZmnG 88 **++ 
 mmTYZ`ZgZgTh$iq%5f%J$ij,,v'7'79N9NO&+# %js   =Ec                     d }|d   dkD  r#t        ||j                  |j                  |      }|=t        ||j                  |d         j	                  |j                        }||n||z   }|S )Nr;   r   )r8   r9   r%   )rF   r$   r8   r5   r*   )rf   r   rO  r   r9   combined_attention_maskexpanded_attn_masks          r4   _prepare_decoder_attention_maskz6Kosmos2TextTransformer._prepare_decoder_attention_mask  s     #'r?Q&7##$++'=	'# %!-nm>Q>Q[fgi[j!k!n!n$$" '>&E"K]`wKw $ '&r6   r   rX   img_input_maskr9   r   c                    || j                  |      }|[|j                  |j                        j                  d|j	                  d            ||j                  t
        j                        <   || j                  z  }| j                  ||||      }|j                  |j                        }||z   }t        j                  j                  || j                  | j                        }|S )Nr;   r   )rM   r   r9   r   r   )r  r*   r8   rB   r(   r,   r-   r  r  r   r   r   r   )	rf   rM   r   rX   r  r9   r   	positionsrV   s	            r4   forward_embeddingz(Kosmos2TextTransformer.forward_embedding  s       --i8M#AMQ^QeQeAfAkAkL%%b)BM.++%**+=> &(8(88 (('#9%	 ) 
	 LL!5!56	%	1--mt||VZVcVc-dr6   rM   r   image_embeds_position_maskr_  r  	head_maskcross_attn_head_maskrU   r  r   r  r  ra  r   r[   c                    ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }||
t	        d      |"|j
                  }|j                  d|d         }n!|
|
j                         d d }nt	        d      | j                  r%| j                  r|rt        j                  d       d}|rN|	L|4t        t        | j                         t        | j                               nt        | j                         }	|r:t        |	t              r*t        j                  d       t        j                   |	      }	|	|	j#                         nd}|dkD  rd }d }| j%                  ||
||||	      }| j'                  ||||      }||t)        ||
j*                  |d   
      }t,        j.                  j1                  || j0                  | j                        }|rdnd }|rdnd }|r|dnd }t3        ||gddg      D ]j  \  }}|	|j                         d   t5        | j6                        k7  s3t	        d| dt5        | j6                         d|j                         d    d       t9        | j6                        D ]|  \  }}|r||fz  }| j                  r%t;        j<                  g       }|| j>                  k  r? ||||f||||   nd |||   nd |	|||d|}|d   }|sh||d   fz  }|t||d   fz  }~ | jA                  |      }|r||fz  }tC        ||	|||      S )NzDYou cannot specify both input_ids and inputs_embeds at the same timer;   z5You have to specify either input_ids or inputs_embedszZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...F)r~   zPassing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.58.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.r   )rM   r   rX   r  r9   r   r  r   ru   r  r  zThe `z` should be specified for z layers, but it is for .)r  r`  r  rU   r   r  ra  r   r   )rT   rU   rV   rW   cross_attentions)"r~   r   r  r  r   r   rB   r(   r  r   r   r   r   r   rc  rk   from_legacy_cacheget_seq_lengthr  r  r5   r$   r   r   r   ziplenr  r  r,   randr  r  r   )rf   rM   r   rX   r  r_  r  r  r  rU   r   r   r  r   r  r  ra  r   rO  r9   rV   all_hidden_statesall_self_attnsall_cross_attentions	attn_mask	mask_namer#  decoder_layerdropout_probabilityr%  s                                 r4   r   zKosmos2TextTransformer.forward  s   ( 2C1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	 ]%>cdd"#//K!r;r?;I&',,.s3KTUU&&4==##p "	0 )4 $L$DlZ^ZeZeFfg!5 
 OU;\
 2CCOTOETE`!?!?!Afg "A%L)-&..'%5#9% / 
 ==K8N

 !,1G1S%12H-J]J]grsugv%w"--mt||VZVcVc-d #7BD0d&7<Q<]rdh %(4H(IKYoKp$q 	 Iy$>>#A&3t{{+;<$	{*DSEUDV W%NN,Q/03 	 #,DKK"8 	@C#!m%55!}}&+jjn#&7)% (>3<3H3dI]Ii,@,Eos /"3#- M *!,M =#3"55(4(]1-=,??(9	@> 6  -!118+++%1
 	
r6   )NNNr   NNNNNNNNNNNNNNNNN)ro   rp   rq   rr   r!   r   r  r   r,   r   rI   r  listrs   r-   r   r   r   rk   r   r   r   r   s   @r4   r  r    s   ,0 ,('4 15/315&'/3!  -! u||,	!
 !.! !$! u||,!J -115/3=A8<9=,07;=A04/3$(,0/3&*15#M
ELL)M
 !.M
 u||,	M

 %-U\\$:M
  (5M
 !) 6M
 ELL)M
 'u||4M
 "$u'8'8"9:M
  -M
 u||,M
 D>M
 $D>M
 'tnM
  d^!M
" !.#M
$ -.%M
& 
u??	@'M
r6   r  c                   P    e Zd ZU eed<   dZddgZdZdZdZ	de
j                  fdZy)Kosmos2PreTrainedModelr~   Tr  rx  r   c                    t        | t              r| j                  j                  }n6t        | t        t
        f      r | j                  j                  j                  }t        | t        t        f      r| j                  j                  }n6t        | t        t
        f      r | j                  j                  j                  }t        |t              rt        j                  j                  |j                  d|j                   dz  z         t        j                  j                  |j"                  j$                  |j                  j&                  |z         t        j                  j                  |j(                  j$                  |j                  j&                  |z         nt        |t*              r|j                   dz  d|j                  j,                  z  dz  z  z  }|j                   dz  |z  }t        j                  j                  |j.                  j$                  |       t        j                  j                  |j0                  j$                  |       t        j                  j                  |j2                  j$                  |       t        j                  j                  |j4                  j$                  |       nt        |t6              r|j                  j8                  dz  d|j                  j,                  z  dz  z  z  }d|j                  j8                  z  dz  |z  }t        j                  j                  |j:                  j$                  |       t        j                  j                  |j<                  j$                  |       nt        |t>              rt        j                  j                  |j.                  j$                         t        j                  j                  |j0                  j$                  |       t        j                  j                  |j2                  j$                  |       t        j                  j                  |j4                  j$                  |       n3t        |t@              rlt        j                  j                  |j:                  j$                         t        j                  j                  |j<                  j$                  |       nt        |t              r7t        j                  j                  |jB                  j$                         npt        |tD              r`t        j                  j                  |jF                  j$                         t        j                  j                  |jH                         n t        |tJ              r|jL                  j$                  jN                  j                  d       |jL                  jP                  |jL                  j$                  jN                  |jL                  jP                     jS                          nct        |t        jT                        rI|j$                  jN                  jW                  d       |jX                  jN                  jS                          t        |t        jZ                        r2|jX                  %|jX                  jN                  jS                          yyy)zInitialize the weightsr   r   )meanstd)r  r   Nr'   ).rc  Kosmos2VisionModelr~   initializer_factorKosmos2ModelKosmos2ForConditionalGenerationvision_configKosmos2TextModelKosmos2TextForCausalLMinit_stdtext_configr}   r   initnormal_r   r   r   r   initializer_ranger   r   r  r   r   r   r   r   r   r   r  rT  rp  lm_headKosmos2ImageToTextProjectiondenselatent_queryr  r  datarN   zero_r  fill_r   r   )rf   r   factorr  in_proj_stdout_proj_stdfc_stds          r4   _init_weightsz$Kosmos2PreTrainedModel._init_weights  s   d./[[33F|-LMN[[..AAFd-/EFG++&&C|-LMN++))22Cf56GGOOF22&BRBRTXBX[aBaObGGOOF2299v}}?^?^ag?gOhGGOOF55<<&--BaBadjBjOk 67!++T1q6==;Z;Z7Z_c6cdgmmK",,d2f<LGGOOFMM00kOBGGOOFMM00kOBGGOOFMM00kOBGGOOFOO22OE 01!==44d:FMMDcDc@chl?lmpvvK&--333<vEFGGOOFJJ--6O:GGOOFJJ--;O? 34GGOOFMM00cO:GGOOFMM00cO:GGOOFMM00cO:GGOOFOO22O</GGOOFJJ--3O7GGOOFJJ--3O7 67GGOOFNN11sO; <=GGOOFLL//SO9GGOOF//0 67&&++33#3F""..:##**//0C0C0O0OPVVX-MM$$S)KK""$fbii(V[[-DKK""$ .E(r6   N)ro   rp   rq   r    rt   supports_gradient_checkpointing_no_split_modules_supports_attention_backend_supports_flash_attn_supports_sdpar   Moduler  ru   r6   r4   r  r    s;    &*#46HI"&N2%BII 2%r6   r  c                        e Zd ZU eed<   dZdef fdZdej                  fdZ	e
	 	 	 	 	 ddeej                     dee   dee   ded	ee   deeef   fd
       Z xZS )r  r~   r   c                 d    t         |   |       t        |      | _        | j	                          y r   )r   r   r'  model	post_initr   s     r4   r   zKosmos2VisionModel.__init__  s&     -f5
r6   r[   c                 B    | j                   j                  j                  S r   )r  r   r   rm   s    r4   get_input_embeddingsz'Kosmos2VisionModel.get_input_embeddings  s    zz$$444r6   r   r  r   r  c                 .    | j                  |||||      S )N)r   r   r  r   r  r  )rf   r   r   r  r   r  s         r4   r   zKosmos2VisionModel.forward  s)     zz%/!5%=#  
 	
r6   r0  )ro   rp   rq   r"   rt   main_input_namer   r   r  r  r   r   r,   rs   r-   r   rk   r   r   r   r   s   @r4   r  r    s    $O2 5bii 5  59,0/3).&*
u001
 $D>
 'tn	

 #'
 d^
 
u00	1
 
r6   r  c            )       <    e Zd ZU eed<   def fdZdej                  fdZe	e
	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     deej                     d	eej                     d
eej                     deej                     deej                     deeej                        deej                     deej                     dee   dee   dee   dee   deej                     dee   deeef   f$d              Z xZS )r  r~   c                 d    t         |   |       t        |      | _        | j	                          y r   )r   r   r  r  r  r   s     r4   r   zKosmos2TextModel.__init__  s&     +F3
r6   r[   c                 .    | j                   j                  S r   r  r  rm   s    r4   r  z%Kosmos2TextModel.get_input_embeddings      zz&&&r6   rM   r   rX   r  r_  r  r  r  rU   r   r   r  r   r  r  ra  r   c                      | j                   di d|d|d|d|d|d|d|d|d	|	d
|
d|d|d|d|d|d||S )a  
        image_embeds (`torch.FloatTensor` of shape `(batch_size, latent_query_num, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of `Kosmos2ImageToTextProjection`.
        image_embeds_position_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to indicate the location in a sequence to insert the image features . Mask values selected in `[0,
            1]`:

            - 1 for places where to put the image features,
            - 0 for places that are not for image features (i.e. for text tokens).
        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        rM   r   rX   r  r_  r  r  r  rU   r   r   r  r   r  r  ra  ru   r  )rf   rM   r   rX   r  r_  r  r  r  rU   r   r   r  r   r  r  ra  r   s                     r4   r   zKosmos2TextModel.forward  s    J tzz 

)
 &
 (B	

 #8
 $:
  
 "6
 ,
 (
 &
  
 0
 "6
 $
  *#
 	
r6   r  )ro   rp   rq   r!   rt   r   r   r  r  r   r   r   r,   r   r  rs   r-   r   r   r   rk   r   r   r   r   s   @r4   r  r    s   0 'bii '  -115/3=A8<9=,07;=A04/3$(,0/3&*15#5
ELL)5
 !.5
 u||,	5

 %-U\\$:5
  (55
 !) 65
 ELL)5
 'u||45
 "$u'8'8"9:5
  -5
 u||,5
 D>5
 $D>5
 'tn5
  d^!5
" !.#5
$ -.%5
& 
u??	@'5
  5
r6   r  z
    The text model from KOSMOS-2 with a language modeling head on top (linear layer with weights tied to the input
    embeddings).
    c            +           e Zd ZU eed<   dgZdef fdZdej                  fdZ	dej                  fdZ
ee	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddeej                     deej                     d	eej                     d
eej                     deej                     deej                     deej                     deej                     deeej"                        deej                     deej                     deej$                     dee   dee   dee   dee   deej                     dee   deeef   f&d              Z	 	 	 	 	 	 	 d fd	Z xZS )r  r~   zlm_head.weightc                     t         |   |       t        |      | _        t	        j
                  |j                  |j                  d      | _        | j                          y )NF)in_featuresout_featuresr   )
r   r   r  r  r   r   r   r  r  r  r   s     r4   r   zKosmos2TextForCausalLM.__init__1  sI     +F3
yyV-=-=FL]L]dij 	r6   r[   c                 .    | j                   j                  S r   r  rm   s    r4   r  z+Kosmos2TextForCausalLM.get_input_embeddings:  r  r6   c                     | j                   S r   )r  rm   s    r4   get_output_embeddingsz,Kosmos2TextForCausalLM.get_output_embeddings=  s    ||r6   rM   r   rX   r  r_  r  r  r  rU   r   r   labelsr  r   r  r  ra  r   c                    ||n| j                   j                  }||rt        j                  d       d} | j                  di d|d|d|d|d|d	|d
|d|d|	d|
d|d|d|d|ddd||}| j                  |d         }d}|* | j                  d||| j                   j                  d|}t        |||j                  |j                  |j                  |j                        S )aK  
        image_embeds (`torch.FloatTensor` of shape `(batch_size, latent_query_num, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of `Kosmos2ImageToTextProjection`.
        image_embeds_position_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to indicate the location in a sequence to insert the image features . Mask values selected in `[0,
            1]`:

            - 1 for places where to put the image features,
            - 0 for places that are not for image features (i.e. for text tokens).
        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
            `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are
            ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        NzJThe `use_cache` argument is changed to `False` since `labels` is provided.FrM   r   rX   r  r_  r  r  r  rU   r   r   r  r   r  r  Tra  r   )ry   r  r  )rx   ry   rU   rV   rW   r  ru   )r~   r  r   warningr  r  loss_functionr  r   rU   rV   rW   r  )rf   rM   r   rX   r  r_  r  r  r  rU   r   r   r  r  r   r  r  ra  r   r  	lm_logitsrx   s                         r4   r   zKosmos2TextForCausalLM.forward@  sh   T &1%<k$++B]B]klI$** 

)
 &
 (B	

 #8
 $:
  
 "6
 ,
 (
 &
  
 0
 "6
 
  *#
& LL,	%4%%sYvRVR]R]RhRhslrsD0#33!//))$55
 	
r6   c	                    |d   dk7  rd }d }n|||j                         d d n|j                         \  }
}|j                         d   }t        j                  |t        j                  |
||z
  ft        j                  |j
                        fd      }t        |   |f|||||||d|	}|j                  dd        |S )Nr   r;   )r(   r$   r8   r   r=   )rU   r   rX   r  r   r  ra  r   )	r(   r,   rC   rD   r-   r8   r   prepare_inputs_for_generationpop)rf   rM   rX   r  rU   r   r   r  ra  model_kwargsr   rL  mask_lenmodel_inputsr   s                 r4   r  z4Kosmos2TextForCausalLM.prepare_inputs_for_generation  s     !!L)-& (3?L?X-"4"4"6s";^g^l^l^nJ1668<H)..KKj'H2D%EUZZ`i`p`pq *& w<

+)%'A')

 

 	.r6   )NNNNNNNNNNNNNNNNN)NNNNNNN)ro   rp   rq   r!   rt   _tied_weights_keysr   r   r  r  r  r   r   r   r,   r   r  rs   
LongTensorr-   r   r   r   rk   r   r   r  r   r   s   @r4   r  r  '  s$    *+0 'bii 'ryy   -115/3=A8<9=,07;=A04/3-1$(,0/3&*15%O
ELL)O
 !.O
 u||,	O

 %-U\\$:O
  (5O
 !) 6O
 ELL)O
 'u||4O
 "$u'8'8"9:O
  -O
 u||,O
 ))*O
 D>O
 $D>O
  'tn!O
" d^#O
$ !.%O
& +,'O
( 
u77	8)O
  O
h #'- -r6   r  c                   .     e Zd ZdZdef fdZd Z xZS )r  zmThe layer that transforms the image model's output to part of the text model's input (namely, image features)r~   c                    t         |           t        j                  |j                  j
                  |j                  j                        | _        t        j                  t        j                  |j                  |j                  j                              | _        t        |j                  |j                  j                  |j                  j                  |j                  j                   dd      | _        y )NF)r   rU  rV  )r   r   r   r   r  r   r  r   r  r   r,   r   latent_query_numr  rT  rz  r   x_attnr   s     r4   r   z%Kosmos2ImageToTextProjection.__init__  s    YYv33??ASASA]A]^
LLV5L5LfN`N`NjNj)kl)((..&&88%*
r6   c                    | j                  |      }| j                  j                  d      j                  |j	                  d      dd      }t        j                  ||gd      }| j                  ||d d d       \  }}||fS )Nr   r;   r   r=   )rV   r_  rU   r   r   )r  r  r   r)   r(   r,   rC   r  )rf   featuresrV   r  key_value_statesr   s         r4   r   z$Kosmos2ImageToTextProjection.forward  s    

8, ((2215<<]=O=OPQ=RTVXZ[ 99m\%BJ&*kk&"2 " '2 '
#| l**r6   )ro   rp   rq   rr   r    r   r   r   r   s   @r4   r  r    s    w
} 
+r6   r  z}
    KOSMOS-2 Model for generating text and image features. The model consists of a vision encoder and a language model.
    c            %       &    e Zd ZU eed<   dZdef fdZdej                  fdZ	d Z
	 	 ddej                  dee   dee   fd	Zee	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddeej$                     d
eej$                     deej$                     deej$                     deej$                     deeej                        deej$                     deej$                     deej$                     dee   dee   dee   dedee   dee   deeef   f d              Z xZS )r  r~   r   c                     t         |   |       t        |j                        | _        t        |j                        | _        t        |      | _	        | j                          y r   )r   r   r  r  
text_modelr  r  vision_modelr  image_to_text_projectionr  r   s     r4   r   zKosmos2Model.__init__  sN     *6+=+=>.v/C/CD(DV(L% 	r6   r[   c                 B    | j                   j                  j                  S r   r  r  r  rm   s    r4   r  z!Kosmos2Model.get_input_embeddings      $$111r6   c                 :    || j                   j                  _        y r   r  rf   r   s     r4   set_input_embeddingsz!Kosmos2Model.set_input_embeddings      -2*r6   return_attentionsr   c                     | j                  ||      }| j                   j                  j                  |d         }t        j                  j                  |d      }| j                  |      \  }}|r||fS |S )aD  
        Encodes images into continuous embeddings that can be forwarded to the language model.

        Args:
            pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
                The tensors corresponding to the input images.
            return_attentions (`bool`, *optional*, defaults to `False`):
                Whether to return `projection_attentions` or not.
            interpolate_pos_encoding (`bool`, *optional*, defaults to `False`):
                Whether to interpolate positional embeddings or not.
        )r   r   r   r;   r=   )r  r  r+  r   r   	normalizer  )rf   r   r  r   rZ   rX   rY   s          r4   get_image_featureszKosmos2Model.get_image_features  s    " #//%%= 0 

 ((..==>QRS>TU}}..|.D.2.K.KL.Y++!666r6   rM   r  r   r  rU   rX   r   r   r  r   r  r  r   c                    ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }d}d}|$|t	        d      | j                  |d|      \  }} | j                  d||||||||	|
||dd|}t        |j                  |j                  |j                  |j                  |||      S )aE  
        image_embeds_position_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to indicate the location in a sequence to insert the image features . Mask values selected in `[0,
            1]`:

            - 1 for places where to put the image features,
            - 0 for places that are not for image features (i.e. for text tokens).
        image_embeds (`torch.FloatTensor` of shape `(batch_size, latent_query_num, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of `Kosmos2ImageToTextProjection`.

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, Kosmos2Model

        >>> model = Kosmos2Model.from_pretrained("microsoft/kosmos-2-patch14-224")
        >>> processor = AutoProcessor.from_pretrained("microsoft/kosmos-2-patch14-224")

        >>> url = "https://huggingface.co/microsoft/kosmos-2-patch14-224/resolve/main/snowman.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> text = (
        ...     "<grounding> An image of<phrase> a snowman</phrase><object><patch_index_0044><patch_index_0863>"
        ...     "</object> warming himself by<phrase> a fire</phrase><object><patch_index_0005><patch_index_0911>"
        ...     "</object>"
        ... )

        >>> inputs = processor(text=text, images=image, return_tensors="pt", add_eos_token=True)

        >>> last_hidden_state = model(
        ...     pixel_values=inputs["pixel_values"],
        ...     input_ids=inputs["input_ids"],
        ...     attention_mask=inputs["attention_mask"],
        ...     image_embeds_position_mask=inputs["image_embeds_position_mask"],
        ... ).last_hidden_state
        >>> list(last_hidden_state.shape)
        [1, 91, 2048]
        ```N<You have to specify either `pixel_values` or `image_embeds`.T)r  r   )rM   r   rX   r  r  rU   r   r   r  r   r  r  )rT   rU   rV   rW   rX   rY   rZ   ru   )r~   r   r  r  r   r  r  rS   rT   rU   rV   rW   )rf   r   rM   r  r   r  rU   rX   r   r   r  r   r  r   r  r   rZ   rY   r  s                      r4   r   zKosmos2Model.forward  s   x 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]" $# !_``262I2IOg 3J 3/L/ "$// 
)%'A+'%/!5
 
  "%77#33!//))%"7 3
 	
r6   )FF)NNNNNNNNNNNNFN)ro   rp   rq   r    rt   r  r   r   r  r  r	  r,   rs   r   r-   r  r   r   r   r  r   r   r   rk   rS   r   r   r   s   @r4   r  r    s    $O} 2bii 23 -238	'' $D> #+4.	>  04,0=A15,0=A/304/3$(,0/3).&*a
u||,a
 ELL)a
 %-U\\$:	a

 !.a
 ELL)a
 "$u'8'8"9:a
 u||,a
  -a
 u||,a
 D>a
 $D>a
 'tna
 #'a
 d^a
  -.!a
" 
u((	)#a
  a
r6   r  z
    KOSMOS-2 Model for generating text and bounding boxes given an image. The model consists of a vision encoder and a
    language model.
    c            #           e Zd ZU eed<   dZdgZdef fdZdej                  fdZ
d Zdej                  fdZd	 Zee	 	 	 	 	 	 	 	 	 	 	 	 	 ddeej$                     d
eej$                     deej$                     deej$                     deej$                     deeej(                        deej$                     deej$                     deej$                     deej*                     dee   dee   dee   dee   deeef   fd              Z ej:                         	 	 	 	 	 	 ddeej$                     deej$                     d
eej$                     deej$                     deej$                     deej$                     fd       Z xZS )r  r~   r   ztext_model.lm_head.weightc                     t         |   |       t        |j                        | _        t        |j                        | _        t        |      | _	        | j                          y r   )r   r   r  r  r  r  r  r  r  r  r  r   s     r4   r   z(Kosmos2ForConditionalGeneration.__init__  sN     01C1CD.v/C/CD(DV(L% 	r6   r[   c                 B    | j                   j                  j                  S r   r  rm   s    r4   r  z4Kosmos2ForConditionalGeneration.get_input_embeddings  r  r6   c                 :    || j                   j                  _        y r   r  r  s     r4   r	  z4Kosmos2ForConditionalGeneration.set_input_embeddings  r
  r6   c                 6    | j                   j                         S r   )r  r  rm   s    r4   r  z5Kosmos2ForConditionalGeneration.get_output_embeddings  s    4466r6   c                 :    | j                   j                  |       y r   )r  set_output_embeddings)rf   new_embeddingss     r4   r  z5Kosmos2ForConditionalGeneration.set_output_embeddings  s    --n=r6   rM   r  r   r  rU   rX   r   r   r  r  r   r  r   c                 <   ||n| j                   j                  }||n| j                   j                  }d}d}|~|t        d      | j	                  |||      }| j                  j
                  j                  |d         }t        j                  j                  |d      }| j                  |      \  }} | j                  d
||||||||	|
|||dd|}t        |j                  |j                  |j                  |j                   |j"                  |||	      S )a5  
        image_embeds_position_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to indicate the location in a sequence to insert the image features . Mask values selected in `[0,
            1]`:

            - 1 for places where to put the image features,
            - 0 for places that are not for image features (i.e. for text tokens).
        image_embeds (`torch.FloatTensor` of shape `(batch_size, latent_query_num, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of `Kosmos2ImageToTextProjection`.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
            `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are
            ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, Kosmos2ForConditionalGeneration

        >>> model = Kosmos2ForConditionalGeneration.from_pretrained("microsoft/kosmos-2-patch14-224")
        >>> processor = AutoProcessor.from_pretrained("microsoft/kosmos-2-patch14-224")

        >>> url = "https://huggingface.co/microsoft/kosmos-2-patch14-224/resolve/main/snowman.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> prompt = "<grounding> An image of"

        >>> inputs = processor(text=prompt, images=image, return_tensors="pt")

        >>> generated_ids = model.generate(
        ...     pixel_values=inputs["pixel_values"],
        ...     input_ids=inputs["input_ids"],
        ...     attention_mask=inputs["attention_mask"],
        ...     image_embeds=None,
        ...     image_embeds_position_mask=inputs["image_embeds_position_mask"],
        ...     use_cache=True,
        ...     max_new_tokens=64,
        ... )
        >>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        >>> processed_text = processor.post_process_generation(generated_text, cleanup_and_extract=False)
        >>> processed_text
        '<grounding> An image of<phrase> a snowman</phrase><object><patch_index_0044><patch_index_0863></object> warming himself by<phrase> a fire</phrase><object><patch_index_0005><patch_index_0911></object>.'

        >>> caption, entities = processor.post_process_generation(generated_text)
        >>> caption
        'An image of a snowman warming himself by a fire.'

        >>> entities
        [('a snowman', (12, 21), [(0.390625, 0.046875, 0.984375, 0.828125)]), ('a fire', (41, 47), [(0.171875, 0.015625, 0.484375, 0.890625)])]
        ```Nr  )r   r   r  r   r;   r=   T)rM   r   rX   r  r  rU   r   r   r  r  r   r  r  )rx   ry   rU   rV   rW   rX   rY   rZ   ru   )r~   r   r  r   r  r  r+  r   r   r  r  r  rw   rx   ry   rU   rV   rW   )rf   r   rM   r  r   r  rU   rX   r   r   r  r  r   r  r   rZ   rY   
lm_outputss                     r4   r   z'Kosmos2ForConditionalGeneration.forward  s^   N 2C1N-TXT_T_TqTq$8$D $++JjJj 	 # $# !_``"&"3"3)"3%9 #4 #  ,,22AABUVWBXYL==22<R2HL262O2OP\2]/L/$T__ 
)%'A+'%/!5
 

" :$$&66$22!,,%"7 3	
 		
r6   c           	         |j                  dd       }||t        d| d      |||}|n| j                  |      }	| j                  j                  j	                  |	d         }t
        j                  j                  |d      }| j                  |      \  }}
 | j                  j                  d|||||d|}|S )	Ninputsz
`inputs`: zp were passed alongside `pixel_values` which is not allowed.Make sure to either pass `inputs` or pixel_values=...r   r;   r=   )rM   r   rX   r  r   ru   )r  r   r  r  r+  r   r   r  r  r  generate)rf   r   r  rM   r   rX   r   r   r  rZ   rY   outputs               r4   r  z(Kosmos2ForConditionalGeneration.generate  s     Hd+#(:VH %H I  F$6!L"&"3"3L"A,,22AABUVWBXYL==22<R2HL262O2OP\2]/L/))) 
)%'A'
 
 r6   )NNNNNNNNNNNNN)NNNNNN) ro   rp   rq   r    rt   r  r  r   r   r  r  r	  r  r  r   r   r   r,   r   r  rs   r  r-   r   r   r   rk   rw   r   rR  r  r   r   s   @r4   r  r    sZ    $O56	} 	2bii 237ryy 7>  04,0=A15,0=A/304/3-1$(,0/3u
u||,u
 ELL)u
 %-U\\$:	u

 !.u
 ELL)u
 "$u'8'8"9:u
 u||,u
  -u
 u||,u
 ))*u
 D>u
 $D>u
 'tnu
 +,u
  
u@@	A!u
  u
n U]]_ 04=A,015/304%u||,% %-U\\$:% ELL)	%
 !.% u||,%  -% %r6   r  )r  r  r  r   )r   )r   )Rrr   r>  dataclassesr   typingr   r   r   r   r,   torch.utils.checkpointr   activationsr
   cache_utilsr   r   r   
generationr   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   r   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   r   r   utils.deprecationr   configuration_kosmos2r    r!   r"   
get_loggerro   r   r   r$   rI   r5   Sizer8   rF   rP   rS   rw   r  r}   rB  r   r   r   r  r  r'  r2  rT  rp  rx  r  r  r  r  r  r  r  r  __all__ru   r6   r4   <module>r0     s,     ! 1 1    ! C C ) B 9  G & j j 0 X X 
		H	%[u|| [EKK [(3- [ jk\ZZ\(-\=B\\\cf\$4  
#
 #
 #
L 
(
 (
 (
XPbii Pv %II%<<% 
% <<	%
 U\\*% % %,L)RYY L)`ryy  / : /fT
299 T
p3
ryy 3
nUcryy Ucp{)")) {)|RYY .d1 dNc
RYY c
L :%_ :% :%z
/ 
BC
- C
L S3_ SSl +299  +F 
V
) V

V
r {&<o {{| Xr6   