
import math
from dataclasses import dataclass
from typing import Callable, Optional, Union

import torch
from torch import nn

from ...activations import ACT2FN
from ...generation import GenerationMixin
from ...integrations import use_kernel_forward_from_hub
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPast
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...utils import ModelOutput, auto_docstring, can_return_tuple
from ..auto import AutoModel
from .configuration_ovis2 import Ovis2Config, Ovis2VisionConfig


@dataclass
@auto_docstring(
    custom_intro="""
    Base class for Llava outputs, with hidden states and attentions.
    """
)
class Ovis2ModelOutputWithPast(BaseModelOutputWithPast):
    r"""
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
        `(batch_size, num_heads, sequence_length, embed_size_per_head)`.

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    image_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
        image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
    """

    image_hidden_states: Optional[torch.FloatTensor] = None


@dataclass
@auto_docstring(
    custom_intro="""
    Base class for Ovis2 causal language model (or autoregressive) outputs.
    """
)
class Ovis2CausalLMOutputWithPast(ModelOutput):
    r"""
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction).
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
        `(batch_size, num_heads, sequence_length, embed_size_per_head)`.

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    image_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size * num_patches, num_images, sequence_length, hidden_size)`.
        image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
    """

    loss: Optional[torch.FloatTensor] = None
    logits: Optional[torch.FloatTensor] = None
    past_key_values: Optional[list[torch.FloatTensor]] = None
    hidden_states: Optional[tuple[torch.FloatTensor]] = None
    attentions: Optional[tuple[torch.FloatTensor]] = None
    image_hidden_states: Optional[torch.FloatTensor] = None


@use_kernel_forward_from_hub("RMSNorm")
class Ovis2RMSNorm(nn.Module):
    def __init__(self, hidden_size, eps=1e-6):
        """
        Ovis2RMSNorm is equivalent to T5LayerNorm
        """
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, hidden_states):
        input_dtype = hidden_states.dtype
        hidden_states = hidden_states.to(torch.float32)
        variance = hidden_states.pow(2).mean(-1, keepdim=True)
        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
        return self.weight * hidden_states.to(input_dtype)

    def extra_repr(self):
        return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"


class Ovis2VisionMLP(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.hidden_size = config.hidden_size
        self.intermediate_size = config.intermediate_size
        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias)
        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias)
        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=config.mlp_bias)
        self.act_fn = ACT2FN[config.hidden_act]

    def forward(self, x):
        down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
        return down_proj


class Ovis2VisionEmbeddings(nn.Module):
    def __init__(self, config: Ovis2VisionConfig):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.image_size = config.image_size
        self.patch_size = config.patch_size

        self.patch_embedding = nn.Conv2d(
            in_channels=config.num_channels,
            out_channels=self.embed_dim,
            kernel_size=self.patch_size,
            stride=self.patch_size,
            padding="valid",
        )

        self.num_patches = (self.image_size // self.patch_size) ** 2
        self.num_positions = self.num_patches
        self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
        self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False)
        self.rms_norm = Ovis2RMSNorm(config.hidden_size, config.rms_norm_eps)

    def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
        target_dtype = self.patch_embedding.weight.dtype
        patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype))
        patch_embeds = patch_embeds.flatten(2).transpose(1, 2)
        embeddings = self.rms_norm(patch_embeds)
        embeddings = embeddings + self.position_embedding(self.position_ids)
        return embeddings


def eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: Optional[torch.Tensor],
    scaling: float,
    dropout: float = 0.0,
    **kwargs,
):
    attn_weights = torch.matmul(query, key.transpose(-1, -2)) * scaling
    if attention_mask is not None:
        attn_weights = attn_weights + attention_mask

    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
    attn_output = torch.matmul(attn_weights, value)
    attn_output = attn_output.transpose(1, 2).contiguous()

    return attn_output, attn_weights


class Ovis2VisionAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.embed_dim // self.num_heads
        if self.head_dim * self.num_heads != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
                f" {self.num_heads})."
            )
        self.scale = self.head_dim**-0.5
        self.attention_dropout = config.attention_dropout
        self.is_causal = False

        self.k_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.qkv_bias)
        self.v_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.qkv_bias)
        self.q_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.qkv_bias)
        self.out_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.qkv_bias)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        **kwargs,
    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
        """Input shape: Batch x Time x Channel"""
        batch_size, seq_length, embed_dim = hidden_states.shape

        queries = self.q_proj(hidden_states)
        keys = self.k_proj(hidden_states)
        values = self.v_proj(hidden_states)

        queries = queries.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)
        keys = keys.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)
        values = values.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        attn_output, attn_weights = attention_interface(
            self,
            queries,
            keys,
            values,
            attention_mask,
            is_causal=self.is_causal,
            scaling=self.scale,
            dropout=0.0 if not self.training else self.attention_dropout,
        )

        attn_output = attn_output.reshape(batch_size, seq_length, embed_dim).contiguous()
        attn_output = self.out_proj(attn_output)
        return attn_output, attn_weights


class Ovis2MLP(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.hidden_size = config.hidden_size
        self.intermediate_size = config.intermediate_size
        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias)
        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias)
        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=config.mlp_bias)
        self.act_fn = ACT2FN[config.hidden_act]

    def forward(self, x):
        down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
        return down_proj


class Ovis2Attention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.embed_dim // self.num_heads
        if self.head_dim * self.num_heads != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
                f" {self.num_heads})."
            )
        self.scale = self.head_dim**-0.5
        self.attention_dropout = config.attention_dropout
        self.is_causal = False

        self.k_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.qkv_bias)
        self.v_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.qkv_bias)
        self.q_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.qkv_bias)
        self.out_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.qkv_bias)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        **kwargs,
    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
        """Input shape: Batch x Time x Channel"""
        batch_size, seq_length, embed_dim = hidden_states.shape

        queries = self.q_proj(hidden_states)
        keys = self.k_proj(hidden_states)
        values = self.v_proj(hidden_states)

        queries = queries.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)
        keys = keys.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)
        values = values.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        attn_output, attn_weights = attention_interface(
            self,
            queries,
            keys,
            values,
            attention_mask,
            is_causal=self.is_causal,
            scaling=self.scale,
            dropout=0.0 if not self.training else self.attention_dropout,
        )

        attn_output = attn_output.reshape(batch_size, seq_length, embed_dim).contiguous()
        attn_output = self.out_proj(attn_output)
        return attn_output, attn_weights


class Ovis2VisionEncoderLayer(GradientCheckpointingLayer):
    def __init__(self, config: Ovis2VisionConfig):
        super().__init__()
        self.attention = Ovis2Attention(config)
        self.ffn = Ovis2MLP(config)
        self.rms_norm1 = Ovis2RMSNorm(config.hidden_size, config.rms_norm_eps)
        self.rms_norm2 = Ovis2RMSNorm(config.hidden_size, config.rms_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = False,
    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
        norm_hidden_states = self.rms_norm1(hidden_states)
        attn_output, attn_weights = self.attention(hidden_states=norm_hidden_states, attention_mask=attention_mask)
        hidden_states = hidden_states + attn_output
        norm_hidden_states = self.rms_norm2(hidden_states)
        mlp_output = self.ffn(norm_hidden_states)
        hidden_states = hidden_states + mlp_output
        return (hidden_states, attn_weights) if output_attentions else (hidden_states, None)


class Ovis2VisionEncoder(nn.Module):
    """
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`Ovis2VisionEncoderLayer`].

    Args:
        config: Ovis2VisionConfig
    """

    def __init__(self, config: Ovis2VisionConfig):
        super().__init__()
        self.config = config
        self.layers = nn.ModuleList([Ovis2VisionEncoderLayer(config) for _ in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False

    @can_return_tuple
    def forward(
        self,
        inputs_embeds,
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
    ) -> BaseModelOutput:
        r"""
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        encoder_states = () if output_hidden_states else None
        all_attentions = () if output_attentions else None

        hidden_states = inputs_embeds
        for encoder_layer in self.layers:
            if output_hidden_states:
                encoder_states = encoder_states + (hidden_states,)
            layer_outputs = encoder_layer(
                hidden_states,
                attention_mask,
                output_attentions=output_attentions,
            )
            hidden_states = layer_outputs[0]
            if output_attentions:
                all_attentions = all_attentions + (layer_outputs[1],)

        if output_hidden_states:
            encoder_states = encoder_states + (hidden_states,)

        return BaseModelOutput(
            last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
        )


class Ovis2VisionTransformer(nn.Module):
    def __init__(self, config: Ovis2VisionConfig):
        super().__init__()
        self.config = config
        self.embeddings = Ovis2VisionEmbeddings(config)
        self.encoder = Ovis2VisionEncoder(config)
        self.rms_norm = Ovis2RMSNorm(config.hidden_size, config.rms_norm_eps)
        self.gradient_checkpointing = False

    @can_return_tuple
    def forward(
        self,
        pixel_values,
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
    ) -> BaseModelOutput:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        hidden_states = self.embeddings(pixel_values)
        encoder_outputs = self.encoder(
            inputs_embeds=hidden_states,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=True,
        )
        last_hidden_state = encoder_outputs[0]
        last_hidden_state = self.rms_norm(last_hidden_state)

        return BaseModelOutput(
            last_hidden_state=last_hidden_state,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )


class Ovis2VisualEmbeddingTable(nn.Embedding):
    def forward(self, visual_tokens: torch.Tensor) -> torch.Tensor:
        if visual_tokens.dtype in [torch.int8, torch.int16, torch.int32, torch.int64, torch.long]:
            return super().forward(visual_tokens)
        return torch.matmul(visual_tokens, self.weight)


@auto_docstring
class Ovis2PreTrainedModel(PreTrainedModel):
    config: Ovis2Config
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
    _no_split_modules = ["Ovis2VisionEncoderLayer"]
    _skip_keys_device_placement = "past_key_values"

    _supports_cache_class = True
    _supports_flash_attn = True
    _supports_flex_attn = True
    _supports_sdpa = True
    _can_compile_fullgraph = True
    _supports_attention_backend = True


def hard_softmax(logits: torch.Tensor, dim: int):
    y_soft = logits.softmax(dim)
    # Straight-through trick: one-hot in the forward pass, softmax gradients in the backward pass.
    index = y_soft.max(dim, keepdim=True)[1]
    y_hard = torch.zeros_like(logits, memory_format=torch.legacy_contiguous_format).scatter_(dim, index, 1.0)
    ret = y_hard - y_soft.detach() + y_soft

    return ret


class Ovis2VisionModel(Ovis2PreTrainedModel):
    config: Ovis2VisionConfig

    def __init__(self, config: Ovis2VisionConfig):
        super().__init__(config)
        self.config = config
        self.transformer = Ovis2VisionTransformer(config)
        self.num_visual_indicator_tokens = config.num_visual_indicator_tokens
        self.vocab_size = config.vocab_size
        self.head_linear = nn.Linear(
            config.hidden_size * config.hidden_stride * config.hidden_stride,
            self.vocab_size - self.num_visual_indicator_tokens,
            bias=False,
        )
        self.head_norm = nn.LayerNorm(self.vocab_size - self.num_visual_indicator_tokens)

    def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
        outputs = self.transformer(pixel_values)
        last_hidden_state = outputs.last_hidden_state
        if self.config.hidden_stride > 1:
            num_images, seq_len, hidden_dim = last_hidden_state.shape
            hidden_stride = self.config.hidden_stride
            sqrt_l = int(math.sqrt(seq_len))
            if sqrt_l * sqrt_l != seq_len:
                raise ValueError("Token sequence length must be a perfect square")
            # Regroup the (sqrt_l x sqrt_l) patch grid into hidden_stride x hidden_stride blocks.
            last_hidden_state = last_hidden_state.reshape(num_images, sqrt_l, sqrt_l, hidden_dim)
            pad_size = (hidden_stride - sqrt_l % hidden_stride) % hidden_stride
            last_hidden_state = nn.functional.pad(last_hidden_state, (0, 0, 0, pad_size, 0, pad_size), "constant", 0)
            sqrt_l += pad_size
            last_hidden_state = last_hidden_state.reshape(
                num_images, sqrt_l // hidden_stride, hidden_stride, sqrt_l // hidden_stride, hidden_stride, hidden_dim
            )
            last_hidden_state = last_hidden_state.permute(0, 1, 3, 2, 4, 5)
            last_hidden_state = last_hidden_state.reshape(num_images, -1, hidden_stride * hidden_stride * hidden_dim)

        logits = self.head_linear(last_hidden_state)
        logits = self.head_norm(logits)

        if self.config.tokenize_function == "gumbel_argmax":
            prob_token = nn.functional.gumbel_softmax(logits, dim=-1, hard=True)
        elif self.config.tokenize_function == "st_argmax":
            prob_token = hard_softmax(logits, dim=-1)
        elif self.config.tokenize_function == "softmax":
            prob_token = nn.functional.softmax(logits, dim=-1)

        return prob_token


@auto_docstring(
    custom_intro="""
    The Ovis2 model which consists of a vision backbone and a language model, without a language modeling head.
    """
)
class Ovis2Model(Ovis2PreTrainedModel):
    _checkpoint_conversion_mapping = {}

    def __init__(self, config: Ovis2Config):
        super().__init__(config)
        self.vision_tower = Ovis2VisionModel(config.vision_config)
        self.language_model = AutoModel.from_config(config.text_config)
        self.visual_embeddings_table = Ovis2VisualEmbeddingTable(
            config.vision_config.vocab_size, config.text_config.hidden_size
        )
        self.visual_vocab_size = config.vision_config.vocab_size
        self.visual_indicator_token_ids = config.visual_indicator_token_ids

        self.post_init()

    def get_input_embeddings(self):
        return self.language_model.get_input_embeddings()

    def set_input_embeddings(self, value):
        self.language_model.set_input_embeddings(value)

    def set_decoder(self, decoder):
        self.language_model = decoder

    def get_decoder(self):
        return self.language_model

    def get_image_features(self, pixel_values: torch.FloatTensor):
        """
        Obtains image last hidden states from the vision tower and applies the multimodal projection.

        Args:
            pixel_values (`torch.FloatTensor` of shape `(batch_size, channels, height, width)`):
                The tensors corresponding to the input images.

        Returns:
            image_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`.
            visual_indicator_features (`torch.Tensor`): Embeddings of the visual indicator tokens.
        """
        image_features = self.vision_tower(pixel_values)
        num_images, img_seq_len, _ = image_features.shape
        padding_tensor = torch.zeros(
            (num_images, img_seq_len, self.vision_tower.num_visual_indicator_tokens),
            dtype=image_features.dtype,
            device=image_features.device,
            requires_grad=False,
            layout=image_features.layout,
        )
        image_features = torch.cat([image_features, padding_tensor], dim=2)
        image_features = self.visual_embeddings_table(image_features)

        visual_indicator = torch.arange(
            self.visual_vocab_size - self.vision_tower.num_visual_indicator_tokens,
            self.visual_vocab_size,
            dtype=torch.long,
        ).to(image_features.device)
        visual_indicator_features = self.visual_embeddings_table(visual_indicator)

        return image_features, visual_indicator_features

    def get_placeholder_mask(
        self, input_ids: torch.LongTensor, inputs_embeds: torch.FloatTensor, image_features: torch.FloatTensor
    ):
        """
        Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
        equal to the length of multimodal features. If the lengths are different, an error is raised.
        """
        if input_ids is None:
            special_image_mask = inputs_embeds == self.get_input_embeddings()(
                torch.tensor(self.config.image_token_id, dtype=torch.long, device=inputs_embeds.device)
            )
            special_image_mask = special_image_mask.all(-1)
        else:
            special_image_mask = input_ids == self.config.image_token_id

        n_image_tokens = special_image_mask.sum()
        special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
        n_image_features = image_features.shape[0] * image_features.shape[1]
        if inputs_embeds[special_image_mask].numel() != image_features.numel():
            raise ValueError(
                f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
            )
        return special_image_mask

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        pixel_values: torch.FloatTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[list[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        logits_to_keep: Union[int, torch.Tensor] = 0,
        **kwargs,
    ) -> Union[tuple, Ovis2ModelOutputWithPast]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

        if inputs_embeds is None:
            inputs_embeds = self.get_input_embeddings()(input_ids)

        if pixel_values is not None:
            image_features, visual_indicator_features = self.get_image_features(pixel_values=pixel_values)
            special_image_mask = self.get_placeholder_mask(
                input_ids, inputs_embeds=inputs_embeds, image_features=image_features
            )
            inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)

            # Replace each visual indicator placeholder with its dedicated embedding.
            for i, visual_indicator_id in enumerate(self.visual_indicator_token_ids):
                if input_ids is None:
                    mask = inputs_embeds == self.get_input_embeddings()(
                        torch.tensor(visual_indicator_id, dtype=torch.long, device=inputs_embeds.device)
                    )
                    mask = mask.all(-1)
                else:
                    mask = (input_ids == visual_indicator_id).to(inputs_embeds.device)
                if mask.any():
                    inputs_embeds[mask] = (
                        visual_indicator_features[i]
                        .expand_as(inputs_embeds[mask])
                        .to(inputs_embeds.device, inputs_embeds.dtype)
                    )

        outputs = self.language_model(
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=True,
            cache_position=cache_position,
            **kwargs,
        )

        return Ovis2ModelOutputWithPast(
            last_hidden_state=outputs.last_hidden_state,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            image_hidden_states=image_features if pixel_values is not None else None,
        )


@auto_docstring
class Ovis2ForConditionalGeneration(Ovis2PreTrainedModel, GenerationMixin):
    _checkpoint_conversion_mapping = {}
    _tied_weights_keys = ["lm_head.weight"]

    def __init__(self, config: Ovis2Config):
        super().__init__(config)
        self.model = Ovis2Model(config)
        self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False)

        self.post_init()

    def get_input_embeddings(self):
        return self.model.get_input_embeddings()

    def set_input_embeddings(self, value):
        self.model.set_input_embeddings(value)

    def get_output_embeddings(self) -> nn.Module:
        return self.lm_head

    def set_decoder(self, decoder):
        self.model.set_decoder(decoder)

    def get_decoder(self):
        return self.model.get_decoder()

    def get_image_features(self, pixel_values: torch.FloatTensor):
        return self.model.get_image_features(pixel_values=pixel_values)

    @property
    def language_model(self):
        return self.model.language_model

    @property
    def vision_tower(self):
        return self.model.vision_tower

    @property
    def multi_modal_projector(self):
        raise AttributeError("Not needed for Ovis2")

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        pixel_values: torch.FloatTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[list[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        logits_to_keep: Union[int, torch.Tensor] = 0,
        **kwargs,
    ) -> Union[tuple, Ovis2CausalLMOutputWithPast]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, Ovis2ForConditionalGeneration

        >>> model = Ovis2ForConditionalGeneration.from_pretrained("thisisiron/Ovis2-2B-hf")
        >>> processor = AutoProcessor.from_pretrained("thisisiron/Ovis2-2B-hf")

        >>> prompt = "<|im_start|>user\n<image>\nDescribe the image.<|im_end|>\n<|im_start|>assistant\n"
        >>> url = "http://images.cocodataset.org/val2014/COCO_val2014_000000537955.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, text=prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(**inputs, max_new_tokens=15)
        >>> processor.batch_decode(generate_ids, skip_special_tokens=True)[0]
        "user\n\nDescribe the image.\nassistant\nThe image features a brown dog standing on a wooden floor, looking up with"
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        outputs = self.model(
            input_ids=input_ids,
            pixel_values=pixel_values,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=True,
            cache_position=cache_position,
            **kwargs,
        )

        hidden_states = outputs[0]
        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
        logits = self.lm_head(hidden_states[:, slice_indices, :])

        loss = None
        if labels is not None:
            loss = self.loss_function(
                logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size, **kwargs
            )

        return Ovis2CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            image_hidden_states=outputs.image_hidden_states,
        )

    def prepare_inputs_for_generation(
        self,
        input_ids,
        past_key_values=None,
        inputs_embeds=None,
        pixel_values=None,
        attention_mask=None,
        cache_position=None,
        logits_to_keep=None,
        **kwargs,
    ):
        # Overwritten -- in specific circumstances we don't want to forward image inputs to the model

        model_inputs = super().prepare_inputs_for_generation(
            input_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            attention_mask=attention_mask,
            cache_position=cache_position,
            logits_to_keep=logits_to_keep,
            **kwargs,
        )

        if cache_position[0] == 0:
            # If we're in the cached decoding stage, pixel values should be None because the input ids
            # do not contain the special image token anymore.
            model_inputs["pixel_values"] = pixel_values

        return model_inputs


__all__ = ["Ovis2PreTrainedModel", "Ovis2Model", "Ovis2ForConditionalGeneration"]