
    h              	       v   d dl Z d dlZd dlmZ d dlmZmZmZ d dlZd dl	m
Z
 d dlZddlmZmZ ddlmZmZ ddlmZmZmZ ddlmZ dd	lmZ dd
lmZmZ ddlmZ ddlm Z m!Z! ddl"m#Z# ddl$m%Z%m&Z&m'Z'm(Z( ddl)m*Z* ddl+m,Z, ddl-m.Z.m/Z/m0Z0m1Z1m2Z2m3Z3m4Z4m5Z5m6Z6 ddl7m8Z8m9Z9m:Z:m;Z; ddl<m=Z=  e(j|                  e?      Z@ G d de,e      ZA G d de      ZB G d de;      ZC G d de8      ZD G d de
j                        ZF G d d e0      ZG G d! d"e3      ZH G d# d$e4      ZI G d% d&e.      ZJ G d' d(e      ZKdZL G d) d*e2      ZM G d+ d,e1      ZN G d- d.e/      ZO G d/ d0e
j                        ZQd1eej                     d2eej                     d3eSd4ee   fd5ZT G d6 d7e:      ZU G d8 d9e9      ZV G d: d;eM      ZWg d<ZXy)=    N)Callable)AnyOptionalUnion   )CacheDynamicCache)PretrainedConfiglayer_type_validation)create_causal_maskcreate_masks_for_generate!create_sliding_window_causal_mask)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutputWithPast SequenceClassifierOutputWithPast)rope_config_validation)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tuplelogging)deprecate_kwarg   )Gemma2Config)	Gemma2AttentionGemma2ForCausalLM	Gemma2MLPGemma2ModelGemma2PreTrainedModelGemma2RMSNormGemma2RotaryEmbeddingapply_rotary_pos_embeager_attention_forward)PaligemmaCausalLMOutputWithPast!PaliGemmaForConditionalGenerationPaliGemmaModelPaligemmaModelOutputWithPast)SiglipVisionConfigc                       e Zd ZdZdZ	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddZed        Zej                  d        Zy)Gemma3TextConfigaN   
    This is the configuration class to store the configuration of a [`Gemma3TextModel`]. It is used to instantiate an Gemma3Text
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
    defaults will yield a similar configuration to that of the Gemma3Text-7B.
    e.g. [google/gemma3_text-7b](https://huggingface.co/google/gemma3_text-7b)
    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.
    Args:
        vocab_size (`int`, *optional*, defaults to 262208):
            Vocabulary size of the Gemma3Text model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`Gemma3TextModel`]
        hidden_size (`int`, *optional*, defaults to 2304):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 9216):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 26):
            Number of hidden layers in the Transformer decoder.
        num_attention_heads (`int`, *optional*, defaults to 8):
            Number of attention heads for each attention layer in the Transformer decoder.
        num_key_value_heads (`int`, *optional*, defaults to 4):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
            by meanpooling all the original heads within that group. For more details, check out [this
            paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to
            `num_attention_heads`.
        head_dim (`int`, *optional*, defaults to 256):
            The attention head dimension.
        hidden_activation (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`):
            The non-linear activation function (function or string) in the decoder. Will default to `"gelu_pytorch_tanh"`
            if not specified. `"gelu_pytorch_tanh"` uses an approximation of the `"gelu"` activation function.
        max_position_embeddings (`int`, *optional*, defaults to 131072):
            The maximum sequence length that this model might ever be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        pad_token_id (`int`, *optional*, defaults to 0):
            Padding token id.
        eos_token_id (`int`, *optional*, defaults to 1):
            End of stream token id.
        bos_token_id (`int`, *optional*, defaults to 2):
            Beginning of stream token id.
        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
            Whether to tie weight embeddings
        rope_theta (`float`, *optional*, defaults to 1000000.0):
            The base period of the RoPE embeddings.
        attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`):
            Whether to use a bias in the query, key, value and output projection layers during self-attention.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        query_pre_attn_scalar (`float`, *optional*, defaults to 256):
            Scaling factor used on the attention scores
        sliding_window (`int`, *optional*, defaults to 4096):
            In Gemma3Text, every other layer uses sliding window attention. This is the size of the sliding window.
        layer_types (`list`, *optional*):
            Attention pattern for each layer.
        final_logit_softcapping (`float`, *optional*):
            Scaling factor when applying tanh softcapping on the logits.
        attn_logit_softcapping (`float`, *optional*):
            Scaling factor when applying tanh softcapping on the attention scores.
        rope_scaling (`Dict`, *optional*):
            Dictionary containing the scaling configuration for the RoPE embeddings used in global attention. NOTE: if you apply new rope type
            and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
            accordingly.
            Expected contents:
                `rope_type` (`str`):
                    The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
                    'llama3'], with 'default' being the original RoPE implementation.
                `factor` (`float`, *optional*):
                    Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
                    most scaling types, a `factor` of x will enable the model to handle sequences of length x *
                    original maximum pre-trained length.
                `original_max_position_embeddings` (`int`, *optional*):
                    Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
                    pretraining.
                `attention_factor` (`float`, *optional*):
                    Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
                    computation. If unspecified, it defaults to value recommended by the implementation, using the
                    `factor` field to infer the suggested value.
                `beta_fast` (`float`, *optional*):
                    Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
                    ramp function. If unspecified, it defaults to 32.
                `beta_slow` (`float`, *optional*):
                    Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
                    ramp function. If unspecified, it defaults to 1.
                `short_factor` (`list[float]`, *optional*):
                    Only used with 'longrope'. The scaling factor to be applied to short contexts (<
                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
                    size divided by the number of attention heads divided by 2
                `long_factor` (`list[float]`, *optional*):
                    Only used with 'longrope'. The scaling factor to be applied to long contexts (<
                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
                    size divided by the number of attention heads divided by 2
                `low_freq_factor` (`float`, *optional*):
                    Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
                `high_freq_factor` (`float`, *optional*):
                    Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
        rope_local_base_freq (float, *optional*, defaults to 10000.0):
            The base period of the RoPE embeddings for local attention.

    ```python
    >>> from transformers import Gemma3TextModel, Gemma3TextConfig
    >>> # Initializing a Gemma3Text gemma3_text-7b style configuration
    >>> configuration = Gemma3TextConfig()
    >>> # Initializing a model from the gemma3_text-7b style configuration
    >>> model = Gemma3TextModel(configuration)
    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```
    gemma3_textNc                    t        j                  d||||d| || _        |	| _        || _        || _        || _        || _        || _        || _	        |
| _
        || _        || _        || _        || _        || _        || _        || _        || _        || _        || _        || _        || _        || _        t1        |        |j3                  dd      | _        | j*                  Et7        | j                        D cg c]!  }t9        |dz   | j4                  z        rdnd# c}| _        t;        | j*                         y c c}w )N)pad_token_idbos_token_ideos_token_idtie_word_embeddingssliding_window_pattern      sliding_attentionfull_attention )r
   __init__
vocab_sizemax_position_embeddingshidden_sizeintermediate_sizenum_hidden_layersnum_attention_headshead_dimnum_key_value_headsinitializer_rangerms_norm_eps	use_cache
rope_thetaattention_biasattention_dropouthidden_activationquery_pre_attn_scalarsliding_windowfinal_logit_softcappingattn_logit_softcappinglayer_typesrope_local_base_freqrope_scalingr   get_sliding_window_patternrangeboolr   )selfr;   r=   r>   r?   r@   rB   rA   rI   r<   rC   rD   rE   r0   r2   r1   r3   rF   rG   rH   rJ   rK   rN   rL   rM   rP   rO   kwargsis                                g/var/www/html/eduruby.in/venv/lib/python3.12/site-packages/transformers/models/gemma3/modular_gemma3.pyr:   zGemma3TextConfig.__init__   s_   < 	!! 	
%%% 3		

 	
 %'>$&!2!2#6  #6 !2("$,!2!2%:",'>$&<#&$8!(t$ (.zz2JA'N$# t556  (,QUd6R6R,R'S#Yii D 	d../	 s   :&D>c                 N    t        j                  dt               | j                  S )NzTThe `sliding_window_pattern` attribute is deprecated and will be removed in v4.55.0.)warningswarnFutureWarningrR   rU   s    rX   r4   z'Gemma3TextConfig.sliding_window_pattern   s"    b	
 +++    c                     || _         y N)rR   rU   values     rX   r4   z'Gemma3TextConfig.sliding_window_pattern  s
    ',$r^   )i@  i 	  i $              gelu_pytorch_tanhi   {Gz?ư>Tr   r6   r   Tg    .AF        rf   i   NNNNg     @)	__name__
__module____qualname____doc__
model_typer:   propertyr4   setterr9   r^   rX   r-   r-   <   s    rh J - ' ! $#%7F0P , , ""- #-r^   r-   c                        e Zd ZdZdZddddZeedZ	 	 	 	 	 	 	 dde	e
eeeef   f      d	e	e
eeeef   f      d
ededededef fdZ xZS )Gemma3Configa  
    This is the configuration class to store the configuration of a [`Gemma3ForConditionalGeneration`]. It is used to instantiate an
    Gemma3ForConditionalGeneration according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the PaliGemma-2B.

    e.g. [google/gemma-3-4b](https://huggingface.co/google/gemma-3-4b)

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        text_config (`Union[Gemma3TextConfig, dict]`, *optional*):
            The config object of the text backbone.
        vision_config (`Union[AutoConfig, dict]`,  *optional*):
            Custom vision config or dict.
        mm_tokens_per_image (`int`, *optional*, defaults to 256):
            The number of tokens per image embedding.
        boi_token_index (`int`, *optional*, defaults to 255999):
            The begin-of-image token index to wrap the image prompt.
        eoi_token_index (`int`, *optional*, defaults to 256000):
            The end-of-image token index to wrap the image prompt.
        image_token_index (`int`, *optional*, defaults to 262144):
            The image token index to encode the image prompt.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.


    Example:

    ```python
    >>> from transformers import Gemma3ForConditionalGeneration, Gemma3Config, SiglipVisionConfig, Gemma3TextConfig

    >>> # Initializing a Siglip-like vision config
    >>> vision_config = SiglipVisionConfig()

    >>> # Initializing a Gemma3 Text config
    >>> text_config = Gemma3TextConfig()

    >>> # Initializing a Gemma3 gemma-3-4b style configuration
    >>> configuration = Gemma3Config(vision_config, text_config)

    >>> # Initializing a model from the gemma-3-4b style configuration
    >>> model = Gemma3TextConfig(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```gemma3image_token_indexboi_token_indexeoi_token_index)image_token_idboi_token_ideoi_token_id)text_configvision_configr{   r|   mm_tokens_per_imagerC   c                 z   | t               }t        j                  d       nt        |t              rt        di |}t        |t              rt        di |}n!|t               }t        j                  d       || _        || _        || _        || _	        || _
        || _        || _        t        	| 8  di | y )Nz@text_config is None, using default Gemma3TextConfig text config.zFvision_config is None, using default SiglipVisionConfig vision config.r9   )r-   loggerinfo
isinstancedictr+   r{   r|   r}   rv   rw   ru   rC   superr:   )
rU   r{   r|   r}   rv   rw   ru   rC   rV   	__class__s
            rX   r:   zGemma3Config.__init__D  s     *,KKKZ[T**9[9KmT*.??M".0MKK`a&*#6 ..!2!2"6"r^   )NNrf   i i  i   rh   )rk   rl   rm   rn   ro   attribute_mapr-   r+   sub_configsr   r   r   strr   intfloatr:   __classcell__r   s   @rX   rs   rs     s    .` J-))M (+K JNMQ#&&&!(#'#e$4d38n$DEF#  &8$sCx.&H IJ# !	#
 # # # !# #r^   rs   c                       e Zd Zy)Gemma3ModelOutputWithPastNrk   rl   rm   r9   r^   rX   r   r   f      r^   r   c                       e Zd Zy)Gemma3CausalLMOutputWithPastNr   r9   r^   rX   r   r   j  r   r^   r   c            	       Z     e Zd ZdZd	dedededef fdZdej                  f fdZ	 xZ
S )
Gemma3TextScaledWordEmbeddingz\
    This module overrides nn.Embeddings' forward by multiplying with embeddings scale.
    num_embeddingsembedding_dimpadding_idxembed_scalec                 v    t         |   |||       | j                  dt        j                  |      d       y )Nr   F)
persistent)r   r:   register_buffertorchtensor)rU   r   r   r   r   r   s        rX   r:   z&Gemma3TextScaledWordEmbedding.__init__s  s3    D]ELL,ERWXr^   	input_idsc                     t         |   |      | j                  j                  | j                  j
                        z  S r`   )r   forwardr   toweightdtype)rU   r   r   s     rX   r   z%Gemma3TextScaledWordEmbedding.forwardw  s2    wy)D,<,<,?,?@Q@Q,RRRr^   )g      ?)rk   rl   rm   rn   r   r   r:   r   Tensorr   r   r   s   @rX   r   r   n  sG    Ys Y3 YS Y_d YS S Sr^   r   c                   $     e Zd Zdef fdZ xZS )	Gemma3MLPconfigc                 $    t         |   |       y r`   r   r:   rU   r   r   s     rX   r:   zGemma3MLP.__init__|       r^   rk   rl   rm   r-   r:   r   r   s   @rX   r   r   {  s    !/ ! !r^   r   c                   *     e Zd Zddedef fdZ xZS )Gemma3RMSNormdimepsc                 (    t         |   ||       y )Nr   r   r   )rU   r   r   r   s      rX   r:   zGemma3RMSNorm.__init__  s    Sc*r^   )ri   )rk   rl   rm   r   r   r:   r   r   s   @rX   r   r     s    +C +e + +r^   r   c                   &     e Zd Zddef fdZ xZS )Gemma3RotaryEmbeddingr   c                 $    t         |   |       y r`   r   )rU   r   devicer   s      rX   r:   zGemma3RotaryEmbedding.__init__  r   r^   r`   r   r   s   @rX   r   r     s    !/ ! !r^   r   c                   0    e Zd Zdedef fdZ eddd      	 	 ddej                  d	ej                  d
e	ej                     de	e
   de	ej                     dee   deej                  e	ej                     e	eej                        f   fd       Z xZS )Gemma3Attentionr   	layer_idxc                 *   |j                   |   dk(  | _        t        |   ||       | j                  r|j                  nd | _        t        |j                  |j                        | _        t        |j                  |j                        | _	        y )Nr7   r   )
rN   
is_slidingr   r:   rK   r   rA   rD   q_normk_normrU   r   r   r   s      rX   r:   zGemma3Attention.__init__  sp     ,,Y7;NN+7;f33D#V=P=PQ#V=P=PQr^   past_key_valuepast_key_values4.58new_nameversionhidden_statesposition_embeddingsattention_maskcache_positionrV   returnc                    |j                   d d }g |d| j                  }| j                  |      j                  |      j	                  dd      }	| j                  |      j                  |      j	                  dd      }
| j                  |      j                  |      j	                  dd      }| j                  |	      }	| j                  |
      }
|\  }}t        |	|
||      \  }	}
|'|||d}|j                  |
|| j                  |      \  }
}t        }| j                  j                  dk7  rt        | j                  j                     } || |	|
||f| j                   r| j"                  nd| j$                  | j&                  d|\  }} |j(                  g |d j+                         }| j-                  |      }||fS )Nr6   r   )sincosr   eagerrj   )dropoutscalingrK   )shaperA   q_projview	transposek_projv_projr   r   r%   updater   r&   r   _attn_implementationr   trainingrH   r   rK   reshape
contiguouso_proj)rU   r   r   r   r   r   rV   input_shapehidden_shapequery_states
key_statesvalue_statesr   r   cache_kwargsattention_interfaceattn_outputattn_weightss                     rX   r   zGemma3Attention.forward  s    $))#2.88b8$--8{{=166|DNNqRST[[/44\BLLQPQR
{{=166|DNNqRST{{<0[[,
&S#7jRUWZ#[ j&#&snUL'6'='=j,X\XfXfht'u$J(?;;++w6"9$++:Z:Z"[$7
%
 /3mmD**LL..
%
 
%
!\ *k));;;;FFHkk+.L((r^   )NN)rk   rl   rm   r-   r   r:   r   r   r   r   r   
LongTensorr   r   tupler   r   r   s   @rX   r   r     s    R/ RC R %0A6R ,059-)||-) #\\-) !.	-)
 "%-) !!1!12-) -.-) 
u||Xell3XeELL>Q5RR	S-) S-)r^   r   c                   t    e Zd Zdedef fdZ eddd      	 	 	 	 	 	 ddej                  d	ej                  d
ej                  de	ej                     de	ej                     de	e   de	e   de	e   de	ej                     deej                  e	eej                  ej                  f      f   fd       Z xZS )Gemma3DecoderLayerr   r   c                    t         |           || _        |j                  | _        || _        |j
                  |   | _        t        ||      | _        t        |      | _
        t        | j                  |j                        | _        t        | j                  |j                        | _        t        | j                  |j                        | _        t        | j                  |j                        | _        y )N)r   r   r   )r   r:   r   r=   r   rN   attention_typer   	self_attnr   mlpr   rD   input_layernormpost_attention_layernormpre_feedforward_layernormpost_feedforward_layernormr   s      rX   r:   zGemma3DecoderLayer.__init__  s    !--"$00;()LV$,T-=-=6CVCVW(5d6F6FFL_L_(`%)6t7G7GVM`M`)a&*78H8HfNaNa*b'r^   r   r   r   r   r   position_embeddings_globalposition_embeddings_localr   position_idsoutput_attentionsrE   r   r   c
                 T   |}| j                  |      }| j                  j                  r|}n|} | j                  d||||||||	d|
\  }}| j                  |      }||z   }|}| j	                  |      }| j                  |      }| j                  |      }||z   }|f}|r||fz  }|S )N)r   r   r   r   r   r   rE   r   r9   )r   r   r   r   r   r   r   )rU   r   r   r   r   r   r   r   rE   r   rV   residualr   self_attn_weightsoutputss                  rX   r   zGemma3DecoderLayer.forward  s     !,,]; >>$$";"<+94>> 
,
' 3)%+/)
,
 
,
(( 55mD =0 66}E/77F =0 ")++Gr^   )NNNFFN)rk   rl   rm   r-   r   r:   r   r   r   r   r   r   rT   r   FloatTensorr   r   r   s   @rX   r   r     s   c/ cC c %0A6R 2637+/,1$)590||0 %*LL0 $)<<	0
 !.0 u//00 "%0 $D>0 D>0 !!1!120 
u  (51B1BEDUDU1U+V"WW	X0 S0r^   r   c                       e Zd ZdZg dZd Zy)Gemma3PreTrainedModel )r   SiglipVisionEmbeddingsSiglipEncoderLayer#SiglipMultiheadAttentionPoolingHeadc                     t        j                  | |       t        |t              r%|j                  j
                  j                          y y r`   )r   _init_weightsr   Gemma3MultiModalProjectormm_input_projection_weightdatazero_)rU   modules     rX   r  z#Gemma3PreTrainedModel._init_weights  s:    %%dF3f78--2288: 9r^   N)rk   rl   rm   base_model_prefix_no_split_modulesr  r9   r^   rX   r   r     s    ;r^   r   c                       e Zd ZU eed<   def fdZ	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     dee
   deej                     dee   d	ee   d
ee   deej                     dee   defdZ xZS )Gemma3TextModelr   c                 6   t         |   |       t        |j                  |j                  | j
                  | j                  j                  dz        | _        t        j                  |      }|j                  |_        ddi|_        t        |      | _        y )N      ?)r   	rope_typedefaultr   )r   r:   r   r;   r=   r   r   embed_tokenscopydeepcopyrO   rF   rP   r   rotary_emb_localr   s     rX   r:   zGemma3TextModel.__init__  s      :v1143C3CQUQ\Q\QhQhjmQm
 v&"77*I6 5V Dr^   r   r   r   r   inputs_embedsrE   r   output_hidden_statesr   rV   r   c
                 <   ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }|d u |d uz  rt	        d      | j
                  r%| j                  r|rt        j                  d       d}|| j                  |      }|r$|"| j                  st        | j                         }|	F||j                         nd}t        j                  |||j                  d   z   |j                        }	||	j!                  d      }t#        |x}t$              s*| j                   |||	||d}t'        d
i |t)        d
i |d	}|}| j+                  ||      }| j-                  ||      }|rd
nd }|rd
nd }| j.                  d | j                   j0                   D ]:  }|r||fz  } ||f||||j2                     |||||	d|
}|d   }|s2||d   fz  }< | j5                  |      }|r||fz  }t7        ||||      S )N:You must specify exactly one of input_ids or inputs_embedszX`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.Fr  r   r6   r   r   input_embedsr   r   r   r   r8   r7   r9   )r   r   r   r   r   r   rE   r   )last_hidden_stater   r   
attentions)r   r   r  rE   
ValueErrorgradient_checkpointingr   r   warning_oncer  r	   get_seq_lengthr   aranger   r   	unsqueezer   r   r   r   
rotary_embr  layersr?   r   normr   )rU   r   r   r   r   r  rE   r   r  r   rV   past_seen_tokenscausal_mask_mappingmask_kwargsr   r   r   all_hidden_statesall_self_attnsdecoder_layerlayer_outputss                        rX   r   zGemma3TextModel.forward,  s    2C1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	-t";<YZZ&&4==Yj I  --i8M0*$++>O!CRC^==?de"\\  =#6#6q#99$++N )33A6L ?-F ++ -"0"0#2 ,K #5"C{"C%F%U%U# & &*__]L%Q"$($9$9-$V! #7BD0d![[)H4;;+H+HI 	6M#!m%55!)+E*C2=3O3OP) /"3#- M *!,M =#3"55)	6, 		-0-!11&+++%	
 	
r^   	NNNNNNNNN)rk   rl   rm   r-   __annotations__r:   r   r   r   r   r   r   rT   r   r   r   r   r   r   s   @rX   r  r    s    E/ E" 151537+/59$(,0/359i
E,,-i
 !.i
 u//0	i

 "%i
   1 12i
 D>i
 $D>i
 'tni
 !!1!12i
 +,i
 
!i
r^   r  c                   4     e Zd ZU eed<   dZdef fdZ xZS )Gemma3ForCausalLMr   language_modelc                 D    t         |   |       t        |      | _        y r`   )r   r:   r  modelr   s     rX   r:   zGemma3ForCausalLM.__init__  s     $V,
r^   )rk   rl   rm   r-   r0  r  r:   r   r   s   @rX   r2  r2    s!    (-/ - -r^   r2  c                   D     e Zd Zdef fdZdej                  fdZ xZS )r  r   c                    t         |           t        j                  t	        j
                  |j                  j                  |j                  j                              | _	        t        |j                  j                  |j                  j                        | _        t        |j                  j                  |j                  j                  z        | _        t        |j"                  dz        | _        | j                   | j$                  z  | _        t        j(                  | j&                  | j&                        | _        y )Nr   r  )kernel_sizestride)r   r:   nn	Parameterr   zerosr|   r=   r{   r  r   layer_norm_epsmm_soft_emb_normr   
image_size
patch_sizepatches_per_imager}   tokens_per_sider8  	AvgPool2davg_poolr   s     rX   r:   z"Gemma3MultiModalProjector.__init__  s    *,,,KK,,88&:L:L:X:XY+
' !.  ,,&2F2F2U2U!
 "%V%9%9%D%DH\H\HgHg%g!h"6#=#=s#BC11T5I5II1A1A$JZJZ[r^   vision_outputsc                    |j                   \  }}}|j                  dd      }|j                  ||| j                  | j                        }|j	                         }| j                  |      }|j                  d      }|j                  dd      }| j                  |      }t        j                  || j                        }|j                  |      S )Nr6   r   )r   r   r   rA  r   rD  flattenr>  r   matmulr  type_as)	rU   rE  
batch_size_
seq_lengthreshaped_vision_outputspooled_vision_outputsnormed_vision_outputsprojected_vision_outputss	            rX   r   z!Gemma3MultiModalProjector.forward  s    $2$8$8!
Az"0":":1a"@"9"A"A
D$:$:D<R<R#
 #:"D"D"F $.E F 5 = =a @ 5 ? ?1 E $ 5 56K L#(<<0EtGfGf#g '//??r^   )	rk   rl   rm   rs   r:   r   r   r   r   r   s   @rX   r  r    s#    \| \ @ell @r^   r  token_type_idsimage_group_idstokens_per_imager   c           
      Z      ydt         dt         dt         dt         dt        f
 fd}|S )z
    This function adds the correct offsets to the `q_idx` and `kv_idx` as the torch API can only accept lengths,
    not start and end indices.
    N	batch_idxhead_idxq_idxkv_idxr   c                 H   t        j                  |
j                  d   k  |d      }
| |f   }t        j                  |
j                  d   k  |d      }	| |f   }t        j                  |	j                  d   k  |d      }
| |f   dk(  |dk(  z  }	| |f   |k(  }||z  S )Nr6   r   r   )r   wherer   )rU  rV  rW  rX  safe_idxtoken_type_ids_at_kv_idximage_group_ids_at_kv_idxis_image_blocksame_image_blockrR  rQ  s            rX   
inner_maskz0token_type_ids_mask_function.<locals>.inner_mask  s     ;;v(<(<Q(??K#1)X2E#F #(;;v8L8LQ8O/OQikl#m $3Ix4G$H!$)KK9N9Nq9Q0QSlnp$q!(E)9:a?D\`aDab*9e+;<@YY  000r^   )r   rT   )rQ  rR  rS  r`  s   ``  rX   token_type_ids_mask_functionra    s>     1c 1S 1 1c 1d 1" r^   c            !          e Zd ZdZdej
                  dej
                  fdZd Zee		 	 	 	 	 	 	 	 	 	 	 	 	 ddej                  dej                  deej
                     d	eej                     d
eeeej                     ef      deej                     deej                     deej                     deej                     dee   dee   dee   dee   deeef   fd              Zy)Gemma3ModelFpixel_valuesr   c                 `    | j                  |      j                  }| j                  |      }|S )a  
        Projects the last hidden state from the vision model into language model space.

        Args:
            pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`)
               The tensors corresponding to the input images.
        Returns:
            image_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`).
        )rd  )vision_towerr  multi_modal_projector)rU   rd  rE  image_featuress       rX   get_image_featureszGemma3Model.get_image_features  s3     ***EWW33NCr^   c                     t        d      NzWe don't want to inherit itAttributeErrorrU   super_kwargss     rX   _update_causal_maskzGemma3Model._update_causal_mask      :;;r^   Nr   r   r   r   rQ  r   r  labelsrE   r   r  return_dictc                 B   |d u |d uz  rt        d      ||n| j                  j                  }||n| j                  j                  }||n| j                  j                  }|R| j                  j
                  | j                  k\  r/|| j                  j
                  k(  }|j                         }d||<   n|}| | j                         |      }|F||j                         nd}t        j                  |||j                  d   z   |j                        }|]| j                  |      }|j                  |j                  |j                         }| j#                  |||      }|j%                  ||      }t'        |x}t(              s5| j                  j+                         |||||d}||j                  d   dk7  r|dk(  j                  |j                        }|t,        j.                  j1                  |dd      d d d d	f    z  }t        j2                  |j5                         d
      dz
  }t        j6                  ||t        j8                  |d	|j                              }t;        |j                  |j                        || j                  j<                        |d<   t?        di |tA        di |d} | jB                  d|||||
||d|d	|}tE        |jF                  |
r|jH                  nd |jJ                  |jL                  |      S d       S )Nr  r   r6   r  )r  rh  r  r6   r   rb   r   r   or_mask_functionr  T)	r   r   r   r  rE   r   r  rs  r   )r  r   r   r  image_hidden_statesr9   )'r  r   r   r  use_return_dictrx   r;   cloneget_input_embeddingsr"  r   r#  r   r   ri  r   r   get_placeholder_maskmasked_scatterr   r   get_text_configr:  
functionalpadcumsumr   rZ  	full_likera  r}   r   r   r3  r   r  r   r   r  )rU   r   rd  r   r   r   rQ  r   r  rr  rE   r   r  rs  	lm_kwargsspecial_image_maskllm_input_idsr(  rh  r)  r*  is_imagenew_image_startrR  r   s                            rX   r   zGemma3Model.forward  s[   & -t";<YZZ1B1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]  T[[%?%?4??%R!*dkk.H.H!H%OO-M01M,-%M 7D557FM!CRC^==?de"\\ "2]5H5H5K"KTaThThN
 #!44\BN+..}/C/C]EXEXYN!%!:!:~ "; " *889K^\M ?-F ++557 -"0"0#2 ,K )m.A.A!.D.I
 +a/33N4I4IJ"*bmm.?.?&XY.?.Z[\^a_a^a[a.b-b"b"',,/B/B/D!"Lq"P"'++ou~rZbZiZi/j# 3O"%%n&;&;<ot{{OnOn3./ #5"C{"C%F%U%U#
 &$%% 
.%+'/!5)
 
 )%777@G33d!//))2>2J
 	

 QU
 	
r^   )NNNNNNNNNNNNN)rk   rl   rm   accepts_loss_kwargsr   r   ri  rp  r   r   r   r   r   r   listr   rT   r   r   r   r9   r^   rX   rc  rc    s   u||  <  '+*.1537KO595959-1$(,0/3&*g
##g
 ''g
 !.	g

 u//0g
 "%U->->(?(F"GHg
 !!1!12g
 !!1!12g
   1 12g
 ))*g
 D>g
 $D>g
 'tng
 d^g
  
u//	0!g
  g
r^   rc  c            "           e Zd Ze	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddej
                  dej                  deej                     deej
                     dee	e
ej                     ef      deej
                     deej
                     deej                     d	eej
                     d
ee   dee   dee   dee   de	eej                  f   de	eef   fd       Z	 	 	 	 	 	 	 	 	 	 d fd	Zd Ze	 ddedej                  deej                     dej                  dee   deej                     deej                     defd       Z xZS )Gemma3ForConditionalGenerationr   rd  r   r   r   rQ  r   r  rr  rE   r   r  rs  logits_to_keepr   c                    ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  } | j                  d||||||||
|	||||d|}|d   }t        |t              rt        | d      n|}| j                  |dd|ddf         }d}|	O|j                         }|dddddf   }|	dddf   }||dd|j                  d    df   j                  |j                        }||j                  |j                        dk7     j                         }||j                  |j                        dk7     j                         }n |j                         }|j                         }t        j                         }|j!                  d| j                   j"                  j$                        }|j!                  d      j                  |j                        } |||      }|s|f|dd z   }||f|z   S |S t'        |||j(                  |j*                  |j,                  |j.                        S )	a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.text_config.vocab_size]`.

        Example:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, Gemma3ForConditionalGeneration

        >>> model = Gemma3ForConditionalGeneration.from_pretrained("google/gemma-3-4b-it")
        >>> processor = AutoProcessor.from_pretrained("google/gemma-3-4b-it")

        >>> messages = [
        ...     {
        ...         "role": "system",
        ...         "content": [
        ...             {"type": "text", "text": "You are a helpful assistant."}
        ...         ]
        ...     },
        ...     {
        ...         "role": "user", "content": [
        ...             {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"},
        ...             {"type": "text", "text": "Where is the cat standing?"},
        ...         ]
        ...     },
        ... ]

        >>> inputs = processor.apply_chat_template(
        ...     messages,
        ...     tokenize=True,
        ...     return_dict=True,
        ...     return_tensors="pt",
        ...     add_generation_prompt=True
        ... )
        >>> # Generate
        >>> generate_ids = model.generate(**inputs)
        >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "user\nYou are a helpful assistant.\n\n\n\n\n\nWhere is the cat standing?\nmodel\nBased on the image, the cat is standing in a snowy area, likely outdoors. It appears to"
        ```
        N)r   rd  rQ  r   r   r   r  rE   rr  r   r  rs  r   r   .r   r6   )losslogitsr   r   r  ry  r9   )r   r   r  rz  r5  r   r   slicelm_headr   r   r   r   r   r:  CrossEntropyLossr   r{   r;   r   r   r   r  ry  )rU   r   rd  r   r   r   rQ  r   r  rr  rE   r   r  rs  r  r  r   r   slice_indicesr  r  shift_logitsshift_labelsshift_attention_maskloss_fctflat_logitsflat_labelsoutputs                               rX   r   z&Gemma3ForConditionalGeneration.forwardh  s}   @ 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]$** 
%))%+'/!5#)
 
"  
8B>SV8W~ot4]kmA}a,?@A\\^F!#ssA+.L!#qr'?L) (6a,:L:LQ:O9O9Q6Q'R'U'UV\VcVc'd$+,@,C,CFMM,RVW,WXcce+,@,C,CLDWDW,X\],]^iik+668+668**,H&++B0G0G0R0RSK&++B/22<3F3FGKK5DY,F'+'7D7V#CVC+#33!//)) ' ; ;
 	
r^   c                 T    t        |   |f||||||	|
|d|}|d   dk(  r||d<   |S )N)r   r  r   r   r   rE   r  rQ  r   rd  )r   prepare_inputs_for_generation)rU   r   r   r  r   r   rd  r   rQ  rE   r  rr  rV   model_inputsr   s                 rX   r  z<Gemma3ForConditionalGeneration.prepare_inputs_for_generation  s]      w<
+')%)))
 
 !!+7L(r^   c                     t        d      rk  rl  rn  s     rX   5_prepare_4d_causal_attention_mask_with_cache_positionzTGemma3ForConditionalGeneration._prepare_4d_causal_attention_mask_with_cache_position  rq  r^   r   r  c                    | j                         |||||d}||j                  d   dk7  r|dk(  j                  |j                        }	|	t        j
                  j                  |	dd      d d d df    z  }
t        j                  |
j                         d      dz
  }t        j                  |	|t        j                  |d            }t        |j                  |j                        || j                        |d<   t        d	i |S )
Nr  r6   ru  r   rv  r   rw  rx  r9   )r  r   r   r   r:  r  r  r   r  r   rZ  r  ra  r}   r   )r   r  r   r   r   r   rQ  rV   r*  r  r  rR  s               rX   r   z8Gemma3ForConditionalGeneration.create_masks_for_generate  s    ,,.(,,.(
 %,*<*<Q*?1*D
 '!+//0E0EFH&"--*;*;HfTU*;*VWXZ][]Z]W]*^)^^O#ll?+>+>+@aH1LO#kk(OU__UcegEhiO.J!!."7"78/6KeKe/K*+ )7;77r^   )NNNNNNNNNNNNNr   )
NNNNNNNTNNr`   )rk   rl   rm   r   r   r   r   r   r   r   r  r   rT   r   r   r   r   r  r  staticmethodr
   r   r   r   r   s   @rX   r  r  g  s-    '+*.1537KO595959-1$(,0/3&*34|
##|
 ''|
 !.	|

 u//0|
 "%U->->(?(F"GH|
 !!1!12|
 !!1!12|
   1 12|
 ))*|
 D>|
 $D>|
 'tn|
 d^|
 c5<</0|
" 
u22	3#|
 |
B "H<  26!8 !8ll!8 !.!8 	!8
 "%!8 u||,!8 !.!8 
!8 !8r^   r  c                   T    e Zd ZddddZ fdZd Zd Zee	 	 	 	 	 	 	 	 	 dde	j                  d	ee	j                     d
ee	j                     dee	j                     dee   dee	j                     dee	j                     dee	j                     dee   dee   defd              Z xZS )Gemma3ForSequenceClassificationzmodel.language_modelzmodel.vision_towerzmodel.multi_modal_projector)z^language_model.modelz^vision_towerz^multi_modal_projectorc                     t         |   |       |j                  | _        t        |      | _        t        j                  |j                  j                  | j                  d      | _	        | j                          y )NF)bias)r   r:   
num_labelsrc  r5  r:  Linearr{   r=   score	post_initr   s     rX   r:   z(Gemma3ForSequenceClassification.__init__:  sZ      ++ (
YYv11==tUZ[
 	r^   c                 6    | j                   j                         S r`   )r5  r|  r]   s    rX   r|  z4Gemma3ForSequenceClassification.get_input_embeddingsC  s    zz..00r^   c                 :    | j                   j                  |       y r`   )r5  set_input_embeddingsra   s     rX   r  z4Gemma3ForSequenceClassification.set_input_embeddingsF  s    

''.r^   r   rd  r   r   r   r  rQ  rr  rE   rV   r   c
                     | j                   |f|||||||	d|
}|j                  }| j                  |      }||j                  d   }n|j                  d   }| j                  j
                  j                  |dk7  rt        d      | j                  j
                  j                  d}n||| j                  j
                  j                  k7  j                  |j                  t        j                        }t        j                  |j                  d   |j                  t        j                        }||z  j                  d      }n.d}t        j                  | j                   j"                   d       |t        j                  ||j                  	      |f   }d}|| j%                  |||| j                  
      }t'        |||j(                  |j*                  |j,                        S )a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        )r   rd  r   r   r  rQ  rE   Nr   r6   z=Cannot handle batch sizes > 1 if no padding token is defined.r   )r   r   z will not detect padding tokens in `inputs_embeds`. Results may be unexpected if using padding tokens in conjunction with `inputs_embeds.`r  )r  rr  pooled_logitsr   )r  r  r   r   r  )r5  r  r  r   r   r{   r0   r  r   r   r   int32r#  argmaxr   r!  r   rk   loss_functionr   r   r   r  )rU   r   rd  r   r   r   r  rQ  rr  rE   rV   transformer_outputsr   r  rJ  last_non_pad_tokennon_pad_masktoken_indicesr  r  s                       rX   r   z'Gemma3ForSequenceClassification.forwardI  s   , )djj

)%%+')

 

 ,==M* "+J&,,Q/J;;""//7J!O\]];;""//7!#"%)@)@)M)MMQQRXR_R_afalalmL!LL)<V]]Z_ZeZefM"/,">!F!Fr!J!#>>**+ ,Z Z
 u||Jv}}MOaab%%VFR_hlhshs%tD/ /??-;;*55
 	
r^   r/  )rk   rl   rm   _checkpoint_conversion_mappingr:   r|  r  r   r   r   r   r   r   r   r   rT   r   r   r   r   r   r   s   @rX   r  r  3  s)   !7-"?&"1/  '+481537+/5959-1$(C
##C
 u001C
 !.	C

 u//0C
 "%C
   1 12C
 !!1!12C
 ))*C
 D>C
 +,C
 
*C
  C
r^   r  )rs   r-   r   r  r2  r  rc  r  )Yr  rZ   collections.abcr   typingr   r   r   r   torch.nnr:  torch.utils.checkpointcache_utilsr   r	   configuration_utilsr
   r   masking_utilsr   r   r   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   modeling_rope_utilsr   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   utils.deprecationr   gemma2.configuration_gemma2r   gemma2.modeling_gemma2r   r   r    r!   r"   r#   r$   r%   r&   paligemma.modeling_paligemmar'   r(   r)   r*   siglipr+   
get_loggerrk   r   r-   rs   r   r   	Embeddingr   r   r   r   r   r   GEMMA3_START_DOCSTRINGr   r  r2  Moduler  r   r   ra  rc  r  r  __all__r9   r^   rX   <module>r     s      $ ' '    . J m m B 9 Y 9 F & R R 0 6
 
 
  ( 
		H	%I-|%5 I-X[## [#|	 < 		#B 	
SBLL 
S!	 !
+M +
!1 !8)o 8)v?3 ?D  ;1 ;{
k {
|-) -!@		 !@HU\\*ell+  h	B~
. ~
BI8%F I8X[
&; [
|	r^   