
    hV                        d dl Z d dlmZ d dlmZmZmZ d dlZd dlm	Z	 d dl
m	c mZ d dlmZ ddlmZ ddlmZmZ ddlmZ dd	lmZ dd
lmZmZ ddlmZ ddlmZ ddlm Z m!Z!m"Z"m#Z# ddl$m%Z%m&Z& ddl'm(Z(m)Z) ddl*m+Z+ ddl,m-Z-m.Z.m/Z/m0Z0 ddl1m2Z2 ddl3m4Z4 ddl5m6Z6m7Z7  e0jp                  e9      Z: G d de	jv                        Z< G d de	jv                        Z= G d dej                  jv                        Z> G d de	jv                        Z? G d de	j                        ZA ed        G d! d e	jv                               ZB G d" d#e	jv                        ZCd$ej                  d%ej                  d&ej                  d'eEej                  ej                  f   fd(ZFd)ej                  d*eGd'ej                  fd+ZH	 d^d,e	jv                  d-ej                  d.ej                  d/ej                  d0eej                     d1eId2eIfd3ZJ	 d^d,e	jv                  d-ej                  d.ej                  d/ej                  d0eej                     d1eId2eIfd4ZK G d5 d6e	jv                        ZL G d7 d8e      ZMe. G d9 d:e)             ZNe. G d; d<eN             ZO G d= d>eNe      ZPe e.d?@       G dA dBe#                    ZQ G dC dDej                  jv                        ZR G dE dFe	jv                        ZSdG ZT G dH dIe	jv                        ZUdJej                  d-ej                  fdKZVd-ej                  d.ej                  dJej                  d'eEej                  ej                  f   fdLZW G dM dNe	jv                        ZX G dO dPe	jv                        ZY G dQ dRe      ZZ G dS dTe	jv                        Z[ G dU dVe	jv                        Z\ G dW dXe	jv                        Z] G dY dZeN      Z^ G d[ d\eNe      Z_g d]Z`y)_    N)	dataclass)CallableOptionalUnion)Llama4VisionConfig   )ACT2FN)CacheDynamicCache)GenerationMixin)use_kernel_forward_from_hub)create_causal_maskcreate_chunked_causal_mask)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPastCausalLMOutputWithPastModelOutput)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tuplelogging)deprecate_kwarg)check_model_inputs   )Llama4ConfigLlama4TextConfigc                   \     e Zd Zdef fdZdej                  dej                  fdZ xZS )Llama4TextExpertsconfigc                    t         |           |j                  | _        |j                  | _        |j
                  | _        | j                  | _        t        j                  t        j                  | j                  | j
                  d| j                  z              | _        t        j                  t        j                  | j                  | j                  | j
                  f            | _        t        |j                     | _        y N   )super__init__num_local_expertsnum_expertsintermediate_sizehidden_size
expert_dimnn	Parametertorchemptygate_up_proj	down_projr	   
hidden_actact_fnselfr&   	__class__s     h/var/www/html/eduruby.in/venv/lib/python3.12/site-packages/transformers/models/llama4/modeling_llama4.pyr+   zLlama4TextExperts.__init__/   s    !33!'!9!9!--00LLT5E5EtGWGWYZ]a]l]lYl)mnekk43C3CT__VZVfVf2g&hiV../    hidden_statesreturnc                 v   |j                  | j                  j                  d   d| j                        }t	        j
                  || j                        }|j                  dd      \  }}t	        j
                  || j                  |      z  | j                        }|j                  d| j                        }|S )a2  
        This should really not be run on a single machine, as we are reaching compute bound:
        - the inputs are expected to be "sorted" per expert already.
        - the weights are viewed with another dim, to match num_expert, 1, shape * num_tokens, shape

        Args:
            hidden_states (torch.Tensor): (batch_size * token_num, hidden_size)
            selected_experts (torch.Tensor): (batch_size * token_num, top_k)
            routing_weights (torch.Tensor): (batch_size * token_num, top_k)
        Returns:
            torch.Tensor
        r   r)   dim)	viewr5   shaper/   r3   bmmchunkr8   r6   )r:   r>   gate_upgateupnext_statess         r<   forwardzLlama4TextExperts.forward9   s     &**4+<+<+B+B1+Er4K[K[\))M4+<+<====+biidkk$&7!7$..I!&&r4+;+;<r=   )	__name__
__module____qualname__r#   r+   r3   TensorrL   __classcell__r;   s   @r<   r%   r%   .   s+    0/ 0U\\ ell r=   r%   c                   &     e Zd Zd fd	Zd Z xZS )Llama4TextMLPc                 f   t         |           ||j                  }|| _        t	        j
                  |j                  |d      | _        t	        j
                  |j                  |d      | _        t	        j
                  ||j                  d      | _	        t        |j                     | _        y NFbias)r*   r+   r.   r&   r1   Linearr/   	gate_projup_projr6   r	   r7   activation_fn)r:   r&   r.   r;   s      r<   r+   zLlama4TextMLP.__init__P   s    $ & 8 86#5#57HuUyy!3!35FUS#4f6H6HuU#F$5$56r=   c                     | j                  | j                  |            | j                  |      z  }| j                  |      S N)r\   rZ   r[   r6   )r:   xr6   s      r<   rL   zLlama4TextMLP.forward\   s7    &&t~~a'89DLLOK	~~i((r=   r^   rM   rN   rO   r+   rL   rQ   rR   s   @r<   rT   rT   O   s    
7)r=   rT   c                   8     e Zd Zddef fdZd Zd Zd Z xZS )Llama4TextL2Normepsc                 0    t         |           || _        y r^   )r*   r+   rc   )r:   rc   r;   s     r<   r+   zLlama4TextL2Norm.__init__b   s    r=   c                     |t        j                  |j                  d      j                  dd      | j                  z         z  S Nr)   rA   T)keepdimr3   rsqrtpowmeanrc   r:   r_   s     r<   _normzLlama4TextL2Norm._normf   4    5;;quuQx}}R}>IJJJr=   c                 ^    | j                  |j                               j                  |      S r^   )rm   floattype_asrl   s     r<   rL   zLlama4TextL2Norm.forwardi   s"    zz!'')$,,Q//r=   c                      d| j                    S )Nzeps=rc   r:   s    r<   
extra_reprzLlama4TextL2Norm.extra_reprl   s    dhhZ  r=   )gư>)	rM   rN   rO   rp   r+   rm   rL   ru   rQ   rR   s   @r<   rb   rb   a   s    E K0!r=   rb   c                   2     e Zd Zd fd	Zd Zd Zd Z xZS )Llama4TextRMSNormc                     t         |           || _        t        j                  t        j                  |            | _        y)z<
        Llama4RMSNorm is equivalent to T5LayerNorm
        N)r*   r+   rc   r1   r2   r3   onesweight)r:   r/   rc   r;   s      r<   r+   zLlama4TextRMSNorm.__init__q   s0     	ll5::k#:;r=   c                     |t        j                  |j                  d      j                  dd      | j                  z         z  S rf   rh   rl   s     r<   rm   zLlama4TextRMSNorm._normy   rn   r=   c                 |    | j                  |j                               j                  |      }|| j                  z  S r^   )rm   rp   rq   rz   )r:   r_   outputs      r<   rL   zLlama4TextRMSNorm.forward|   s0    AGGI&..q1##r=   c                 ^    t        | j                  j                         d| j                   S )Nz, eps=)tuplerz   rE   rc   rt   s    r<   ru   zLlama4TextRMSNorm.extra_repr   s'    ))*+6$((<<r=   )gh㈵>)rM   rN   rO   r+   rm   rL   ru   rQ   rR   s   @r<   rw   rw   p   s    <K$=r=   rw   c                   (     e Zd Z fdZ fdZ xZS )Llama4Routerc                     t         |   |j                  |j                  d       |j                  | _        |j
                  | _        y rV   )r*   r+   r/   r,   r-   num_experts_per_toktop_kr9   s     r<   r+   zLlama4Router.__init__   s>    ++V-E-EER!33//
r=   c                 t   t         |   |      }t        j                  || j                  d      \  }}t        j
                  |t        d            j                  d||      }t        j                  j                  j                  |j                               j                  |j                        }||fS )Nr!   rB   z-inf)r*   rL   r3   topkr   	full_likerp   scatter_r1   
functionalsigmoidtodtype)r:   r>   router_logitsrouter_top_valuerouter_indicesrouter_scoresr;   s         r<   rL   zLlama4Router.forward   s    6+0::mTZZUV+W(.uV}ENNqR`brs++33M4G4G4IJMMmNaNabm++r=   r`   rR   s   @r<   r   r      s    0
, ,r=   r   Llama4TextMoec                   $     e Zd Z fdZd Z xZS )r   c                     t         |           |j                  | _        |j                  | _        |j                  | _        t        |      | _	        t        |      | _        t        |      | _        y r^   )r*   r+   r   r   r/   
hidden_dimr,   r-   r%   expertsr   routerrT   shared_expertr9   s     r<   r+   zLlama4TextMoe.__init__   s[    //
 ,,!33(0"6**62r=   c                    |j                  d| j                        }| j                  |      \  }}|j                  |j                  d   d      }||j                  dd      j                  dd      z  }| j                  |      }| j                  |      }|j                  |j                  |j                  d   d|j                  d         j                  d             ||fS )NrA   r!   r   rB   )
reshaper   r   repeatrE   	transposer   r   add_sum)r:   r>   r   r   	routed_in
routed_outouts          r<   rL   zLlama4TextMoe.forward   s    %--b$//B'+{{='A$}!(()<)<Q)?C	 7 71 = E Eb! LL	\\),
  /##M$7$7$:B
@P@PQS@TUYY^_Y`aM!!r=   r`   rR   s   @r<   r   r      s    3"r=   c                   ~     e Zd ZU ej                  ed<   ddef fdZ ej                         e	d               Z
 xZS )Llama4TextRotaryEmbeddinginv_freqr&   c                 `   t         |           |j                  dnd| _        |j                  | _        |j                  | _        || _        t        | j                     | _	        | j                  | j                  |      \  }| _
        | j                  d|d       | j                  | _        y )Nllama3defaultr   F)
persistent)r*   r+   rope_scaling	rope_typemax_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenr&   r   rope_init_fnattention_scalingregister_bufferr   original_inv_freq)r:   r&   devicer   r;   s       r<   r+   z"Llama4TextRotaryEmbedding.__init__   s    %+%8%8%D)"("@"@$*$B$B!/?+/+<+<T[[&+Q($(ZeD!%r=   c                    | j                   d d d d f   j                         j                  |j                  d   dd      }|d d d d d f   j                         }t	        |j
                  j                  t              r/|j
                  j                  dk7  r|j
                  j                  nd}t        j                  |d      5  |j                  |j
                        |z  j                  dd      }t        j                  t        j                  |      |      }|| j                  z  }d d d        |S # 1 sw Y   S xY w)	Nr   rA   r!   mpscpuF)device_typeenabledr)   )r   rp   expandrE   
isinstancer   typestrr3   autocastr   r   polar	ones_liker   )r:   r_   position_idsinv_freq_expandedposition_ids_expandedr   freqs	freqs_ciss           r<   rL   z!Llama4TextRotaryEmbedding.forward   s
    !MM$4-8>>@GGHZHZ[\H]_acde ,QaZ 8 > > @'1!((--'E!((--[`J`ahhmmfk^^UC 	;&))!((36KKVVWXZ[\EEOOE$:EBI!D$:$::I	;
 	;
 s   A'D88Er^   )rM   rN   rO   r3   rP   __annotations__r#   r+   no_gradr   rL   rQ   rR   s   @r<   r   r      s>    ll// / U]]_
  
r=   r   xqxkr   r?   c           	      &   t        j                   | j                         j                  g | j                  d d dd       }t        j                   |j                         j                  g |j                  d d dd       }t        j
                  ||d d d d d d d f   z        j                  d      }t        j
                  ||d d d d d d d f   z        j                  d      }|j                  |       |j                  |      fS )NrA   r)   r   )r3   view_as_complexrp   r   rE   view_as_realflattenrq   )r   r   r   xq_xk_xq_outxk_outs          r<   apply_rotary_embr      s    
 

 2
 2 2 IBHHSbM I2 Iq I
JC


 2
 2 2 IBHHSbM I2 Iq I
JCi1dA&> >?GGJFi1dA&> >?GGJF>>"v~~b111r=   r>   n_repc                     | j                   \  }}}}|dk(  r| S | dddddddddf   j                  |||||      } | j                  |||z  ||      S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    r!   N)rE   r   r   )r>   r   batchnum_key_value_headsslenhead_dims         r<   	repeat_kvr      so    
 2?1D1D.Ehz!!Qa"23::5BUW\^bdlmM  (;e(CT8TTr=   modulequerykeyvalueattention_maskscalingdropoutc                    t        || j                        }t        || j                        }	t        j                  ||j	                  dd            |z  }
|#|d d d d d d d |j
                  d   f   }|
|z   }
t        j                  j                  |
d      }
t        j                  j                  |
|| j                        }
t        j                  |
|	      }|j	                  dd      j                         }||
fS )Nr)   r   rA   rB   ptrainingr!   )r   num_key_value_groupsr3   matmulr   rE   r1   r   softmaxr   r   
contiguousr   r   r   r   r   r   r   kwargs
key_statesvalue_statesattn_weightscausal_maskattn_outputs                r<   eager_attention_forwardr      s     3 ; ;<JUF$?$?@L<<z';';Aq'ABWLL!$Q1.D
0@0@0D.D%DE#k1==((2(>L==((6??([L,,|\:K''1-88:K$$r=   c                    t        || j                        }t        || j                        }	t        j                  ||j	                  dd            | j
                  dz  z  }
|#|d d d d d d d |j                  d   f   }|
|z   }
t        j                  j                  |
d      }
t        j                  j                  |
|| j                        }
t        j                  |
|	      }|j	                  dd      j                         }||
fS )	Nr)   r         r   rA   rB   r   r!   )r   r   r3   r   r   r   rE   r1   r   r   r   r   r   r   s                r<   vision_eager_attention_forwardr      s     3 ; ;<JUF$?$?@L<<z';';Aq'ABV__VZEZZL!$Q1.D
0@0@0D.D%DE#k1==((2(>L==((6??([L,,|\:K''1-88:K$$r=   c                   N    e Zd ZdZdef fdZ eddd      	 	 ddej                  d	e	ej                  ej                  f   d
e
ej                     de
e   de
ej                     dee   de	ej                  e
ej                     e
e	ej                        f   fd       Z xZS )Llama4TextAttentionz=Multi-headed attention from 'Attention Is All You Need' paperr&   c                    t         |           || _        || _        t	        |d|j
                  |j                  z        | _        |j                  | _        |j                  |j                  z  | _	        |j                  | _        | j                  dz  | _
        |j                  | _        |j                  | _        |j                  | _        |j                  | _        d| _        |j                   |   | _        t%        j&                  |j
                  |j                  | j                  z  |j(                        | _        t%        j&                  |j
                  |j                  | j                  z  |j(                        | _        t%        j&                  |j
                  |j                  | j                  z  |j(                        | _        t%        j&                  |j                  | j                  z  |j
                  |j(                        | _        | j                  j2                  r(| j"                  rt5        |j6                        | _        y y y )Nr   r   TrW   )r*   r+   r&   	layer_idxgetattrr/   num_attention_headsr   r   r   r   
attn_scalefloor_scaleattn_temperature_tuningattention_dropout	is_causalno_rope_layersuse_roper1   rY   attention_biasq_projk_projv_projo_projuse_qk_normrb   rms_norm_epsqk_normr:   r&   r   r;   s      r<   r+   zLlama4TextAttention.__init__  s   "
F4F4F&JdJd4de#)#=#= $*$>$>&B\B\$\!#)#=#= }}d* ++!--'-'E'E$!'!9!9--i8ii : :T]] JQWQfQf
 ii : :T]] JQWQfQf
 ii : :T]] JQWQfQf
 ii&&68J8JQWQfQf
 ;;""t}}+F,?,?@DL (5"r=   past_key_valuepast_key_values4.58new_nameversionr>   position_embeddingsr   cache_positionr   r?   c                 j   |j                   d d }g |d| j                  }| j                  |      j                  |      }	 | j	                  |      j                  g |d| j                   }
| j                  |      j                  |      j                  dd      }| j                  r)t        |	|
|j                  |	j                              \  }	}
t        | d      r"| j                  |	      }	| j                  |
      }
| j                  r| j                  st        j                  t        j                   |j#                         dz   | j$                  z        dz         | j&                  z  dz   }|j                  d|d   ddf      j)                  g |dd      }|	|z  j                  |	j*                        }	|	j                  dd      }	|
j                  dd      }
|%d|i}|j-                  |
|| j.                  |      \  }
}t0        }| j2                  j4                  dk7  rt6        | j2                  j4                     } || |	|
||f| j8                  sdn| j:                  | j<                  d	|\  }} |j>                  g |d jA                         }| jC                  |      }||fS )
NrA   r!   r)   r        ?r  eager        )r   r   )"rE   r   r   rD   r  r  r   r   r   r   r   hasattrr  r   r3   logfloorrp   r   r   r   r   updater   r   r&   _attn_implementationr   r   r   r   r   r   r  )r:   r>   r  r   r	  r  r   input_shapehidden_shapequery_statesr   r   attn_scalescache_kwargsattention_interfacer   r   s                    r<   rL   zLlama4TextAttention.forward8  s    $))#2.88b8$--8{{=166|D4T[[/44UkU2Ut}}U
{{=166|DNNqRST=='7j*=*@*@ATAT*U($L* 4#<<5Lj1J ''		%++~';';'='CtGWGW&WX[^^_bfbqbqqtww  &**A{21+EFMMNbP[Nb]^Nb`aNbcK(;6::<;M;MNL#--a3))!Q/
&,n=L'6'='=j,X\XfXfht'u$J(?;;++w6"9$++:Z:Z"[$7	%
  $}}C$2H2HLL	%
 	%
!\ *k));;;;FFHkk+.L((r=   NN)rM   rN   rO   __doc__r#   r+   r   r3   rP   r   r   r
   
LongTensorr   r   rL   rQ   rR   s   @r<   r   r     s    GA/ A< %0A6R ,0599)||9) #5<<#=>9) !.	9)
 "%9) !!1!129) -.9) 
u||Xell3XeELL>Q5RR	S9) S9)r=   r   c                       e Zd Z fdZ eddd      	 	 	 	 	 	 ddej                  deej                     deej                     dee	ej                        d	ee
   d
eej                     dee	ej                  ej                  f      dee   de	ej                  ee	ej                  ej                  f      f   fd       Z xZS )Llama4TextDecoderLayerc                    t         |           |j                  | _        || _        |j                  |   | _        t        ||      | _        ||j                  v | _	        | j                  rt        |      | _        nt        ||j                        | _        t        |j                  |j                        | _        t        |j                  |j                        | _        y )N)r.   rs   )r*   r+   r/   r   layer_typesattention_typer   	self_attn
moe_layersis_moe_layerr   feed_forwardrT   intermediate_size_mlprw   r  input_layernormpost_attention_layernormr  s      r<   r+   zLlama4TextDecoderLayer.__init__v  s    !--"$00;,VY?%):):: -f 5D -fHdHd eD01C1CI\I\](9&:L:LRXReRe(f%r=   r  r	  r
  r  r>   r   r   	use_cacher  r  r   r?   c           
         |}	| j                  |      } | j                  d||||||d|\  }
}|	|
z   }|}	| j                  |      }| j                  |      }| j                  r|\  }}|	|j                  |	j                        z   }|S )N)r>   r  r   r	  r.  r   )r,  r'  r-  r*  r)  rD   rE   )r:   r>   r   r   r	  r.  r  r  r   residualattention_states_s               r<   rL   zLlama4TextDecoderLayer.forward  s     !,,]; -dnn 
' 3)+)
 
! !#33 !55mD))-8,M1 =#5#5hnn#EEr=   )NNNFNN)rM   rN   rO   r+   r   r3   rP   r   r!  r   boolr   r   FloatTensorrL   rQ   rR   s   @r<   r#  r#  u  s   g %0A6R 26379=$)59KO"||" !." u//0	"
 "%"56" D>" !!1!12" &eELL%,,,F&GH" -." 
u  (51B1BEDUDU1U+V"WW	X" S"r=   r#  c                   <    e Zd ZU eed<   dZdgZdZdZdZ	dZ
dZd Zy)Llama4PreTrainedModelr&   Tr	  Fc                 V   t        | j                  d      r| j                  j                  n| j                  j                  j                  }t	        |t
        j                        rY|j                  j                  j                  d|       |j                  %|j                  j                  j                          y y t	        |t
        j                        rf|j                  j                  j                  d|       |j                  2|j                  j                  |j                     j                          y y t	        |t
        j                        rJ|j                  j                  j                  d       |j                  j                  j                          y t	        |t               r&|j                  j                  j                  d       y t	        |t"              rO|j$                  j                  j                  d|       |j&                  j                  j                  d|       y t	        |t(              ra|j*                  j                  j                  |j,                         |j.                  j                  j                  |j,                         y y )Ninitializer_ranger  )rk   stdr  )r:  )r  r&   r9  text_configr   r1   rY   rz   datanormal_rX   zero_	Embeddingpadding_idx	LayerNormfill_rw   r%   r5   r6   Llama4VisionModelclass_embeddingscalepositional_embedding_vlm)r:   r   r:  s      r<   _init_weightsz#Llama4PreTrainedModel._init_weights  s    t{{$78 KK))((:: 	
 fbii(MM&&CS&9{{&  &&( '-MM&&CS&9!!-""6#5#56<<> .-MM$$S)KK""$ 12MM$$S) 12$$,,#3,?!!))s)< 12""''//FLL/A++0088V\\8J 3r=   N)rM   rN   rO   r"   r   supports_gradient_checkpointing_skip_keys_device_placement_supports_flash_attn_supports_sdpa_supports_flex_attn_can_compile_fullgraph_supports_attention_backendrG  r0  r=   r<   r7  r7    s:    &*##4"5 N!"&Kr=   r7  c                   8    e Zd ZU dgZdZeed<   eee	dZ
def fdZeee	 	 	 	 	 	 	 ddej                   deej$                     deej                      d	ee   d
eej(                     dee   deej                      dee   deeef   fd                     Z xZS )Llama4TextModelr#  modelr&   )
attentionsr>   r   c           	         t         |   |       |j                  | _        |j                  | _        t        j                  |j                  |j                  | j                        | _        t        j                  t        |j                        D cg c]  }t        ||       c}      | _        t        |j                  |j                        | _        t#        |      | _        d| _        | j)                          y c c}w )Nrs   r&   F)r*   r+   pad_token_idr@  
vocab_sizer1   r?  r/   embed_tokens
ModuleListrangenum_hidden_layersr#  layersrw   r  normr   
rotary_embgradient_checkpointing	post_initr  s      r<   r+   zLlama4TextModel.__init__  s     !.. ++LL):):F<N<NPTP`P`ammHMfNfNfHgh9#FI6h
 &f&8&8f>Q>QR	36B&+# 	 is   D	input_idsr   r   r	  inputs_embedsr.  r  r   r?   c                    |d u |d uz  rt        d      |>| j                  |j                  | j                  j                  j                              }|r|t        | j                        }|F||j                         nd}	t        j                  |	|	|j                  d   z   |j                        }||j                  d      }t        |x}
t              s*| j                  |||||d}t        d
i |t        d
i |d}
|}| j!                  ||      }| j"                  d | j                  j$                   D ]  } ||f|
|j&                     |||||d|}! | j)                  |      }t+        ||r|	      S d 	      S )N:You must specify exactly one of input_ids or inputs_embedsrT  r   r!   )r   )r&   input_embedsr   r  r	  r   )full_attentionchunked_attention)r   r   r	  r.  r  r  )last_hidden_stater	  r0  )
ValueErrorrW  r   rz   r   r   r&   get_seq_lengthr3   arangerE   	unsqueezer   dictr   r   r]  r[  rZ  r&  r\  r   )r:   r`  r   r   r	  ra  r.  r  r   past_seen_tokenscausal_mask_mappingmask_kwargsr>   freq_cisdecoder_layers                  r<   rL   zLlama4TextModel.forward  s    -t";<YZZ  --ill4;L;L;S;S;Z;Z.[\M0*$++>O!CRC^==?de"\\ "2]5H5H5K"KTaThThN )33A6L ?-F ++ -"0"0#2 ,K #5"C{"C%?%N+%N#
 & ??=,?![[)H4;;+H+HI 
	M)	2=3O3OP) /#-$,	 	M
	 		-0&+/8O
 	
>B
 	
r=   )NNNNNNN)rM   rN   rO   _no_split_modulesbase_model_prefixr#   r   r   r#  r   _can_record_outputsr+   r   r    r   r3   r!  r   rP   r
   r5  r4  r   r   r   r   r   rL   rQ   rR   s   @r<   rP  rP    s   12)/&/    '+1537+/59$(59C
##C
 !.C
 u//0	C

 "%C
   1 12C
 D>C
 !!1!12C
 +,C
 
u--	.C
   C
r=   rP  c                       e Zd ZU dgZdZdgZddiZeed<   def fdZ	e
e	 	 	 	 	 	 	 	 	 ddej                  d	eej                     d
eej                     deeeeej&                     f      deej&                     deej                     dee   deej                     deeej                  f   dee   deeef   fd              Z xZS )Llama4ForCausalLMr#  language_modelzlm_head.weightlm_headcolwise_repr&   c                     t         |   |       t        |      | _        |j                  | _        t        j                  |j                  |j                  d      | _        | j                          y rV   )
r*   r+   rP  rQ  rV  r1   rY   r/   rx  r_  r9   s     r<   r+   zLlama4ForCausalLM.__init__=  sU     $V,
 ++yy!3!3V5F5FUS 	r=   r`  r   r   r	  ra  labelsr.  r  logits_to_keepr   r?   c
                 l    | j                   d|||||||d|
}|d   }t        |	t              rt        |	 d      n|	}| j	                  |dd|ddf         }d}|* | j
                  d||| j                  j                  d|
}t        |||j                  |j                  |j                        S )az  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, Llama4ForCausalLM

        >>> model = Llama4ForCausalLM.from_pretrained("meta-llama4/Llama4-2-7b-hf")
        >>> tokenizer = AutoTokenizer.from_pretrained("meta-llama4/Llama4-2-7b-hf")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```)r`  r   r   r	  ra  r.  r  r   N)logitsr{  rV  )lossr~  r	  r>   rR  r0  )rQ  r   intslicerx  loss_functionr&   rV  r   r	  r>   rR  )r:   r`  r   r   r	  ra  r{  r.  r  r|  r   outputsr>   slice_indicesr~  r  s                   r<   rL   zLlama4ForCausalLM.forwardF  s    J $** 	
)%+')	
 	
  
8B>SV8W~ot4]kmA}a,?@A%4%%pVFt{{OeOepiopD%#33!//))
 	
r=   )	NNNNNNNNr   )rM   rN   rO   rr  rs  _tied_weights_keys_tp_planr#   r   r+   r   r   r3   r!  r   rP   r   r
   listr5  r4  r  r   r   r   r   rL   rQ   rR   s   @r<   rv  rv  6  sU   12(*+=)H/   '+1537KO59-1$(5934<
##<
 !.<
 u//0	<

 "%tE4E4E/F(F"GH<
   1 12<
 ))*<
 D><
 !!1!12<
 c5<</0<
 +,<
 
u,,	-<
  <
r=   rv  zQ
    Base class for Llava causal language model (or autoregressive) outputs.
    )custom_introc                      e Zd ZU dZdZeej                     ed<   dZ	ej                  ed<   dZ
eeej                        ed<   dZeeej                        ed<   dZeeej                        ed<   dZeej                     ed<   y)	Llama4CausalLMOutputWithPasta\  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction).
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
        `(batch_size, num_heads, sequence_length, embed_size_per_head)`)

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    image_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size (batch_size, num_images, sequence_length, hidden_size)`.
        image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
    Nr  r~  r	  r>   rR  image_hidden_states)rM   rN   rO   r   r  r   r3   r5  r   r~  r	  r  r>   r   rR  r  r0  r=   r<   r  r    s      )-D(5$$
%, $FE$9=OXd5#4#456=8<M8E%"3"345<59Ju001297;%"3"34;r=   r  c                   $     e Zd Z fdZd Z xZS )Llama4VisionMLP2c                 ~   t         |           |j                  | _        |j                  | _        t	        j
                  | j                  |j                  d      | _        t	        j
                  |j                  |j                  d      | _	        t	        j                         | _        |j                  | _        y rV   )r*   r+   r/   r.   r1   rY   projector_input_dimfc1projector_output_dimfc2GELUr\   projector_dropoutr   r9   s     r<   r+   zLlama4VisionMLP2.__init__  s    !--!'!9!999T33V5O5OV[\99V88&:U:U\abWWY//r=   c                     | j                  |      }| j                  |      }t        j                  || j                  | j                        }| j                  | j                  |            S )Nr   )r  r\   Fr   r   r  r:   r>   s     r<   rL   zLlama4VisionMLP2.forward  sT    /**=9		-4<<$--X!!$((="9::r=   r`   rR   s   @r<   r  r    s    0;r=   r  c                   $     e Zd Z fdZd Z xZS )Llama4MultiModalProjectorc                     t         |           t        j                  |j                  j
                  |j                  j                  d      | _        y rV   )	r*   r+   r1   rY   vision_configvision_output_dimr;  r/   linear_1r9   s     r<   r+   z"Llama4MultiModalProjector.__init__  s?    		  22**
r=   c                 (    | j                  |      }|S r^   )r  )r:   image_featuresr>   s      r<   rL   z!Llama4MultiModalProjector.forward  s    n5r=   r`   rR   s   @r<   r  r    s    
r=   r  c           
      J   | j                   \  }}}t        t        j                  |            }| j	                  |||d      } | j                         \  }}}}| j	                  ||t        ||z        t        ||z              }|j                  dddd      j                         }|j	                  |t        ||z        t        ||z        t        ||dz  z              }|j                  dddd      j                         }|j	                  |d|j                   d         }	|	S )NrA   r   r)   r!   r   )rE   r  mathsqrtrD   sizepermuter   )
input_tensorshuffle_ratio
batch_sizenum_patcheschannels
patch_sizeheightwidthreshaped_tensoroutput_tensors
             r<   pixel_shuffler    s%   (4(:(:%JXTYY{+,J$$ZZLL*6*;*;*='Jx"''
FC@U<VX[\dgt\tXuvO%--aAq9DDFO%**C./U]5J1KSQY]jlm]mQnMoO &--aAq9DDFO#((R9N9Nr9RSMr=   c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )Llama4VisionPixelShuffleMLPc                     t         |           |j                  | _        t        |j                  | j                  dz  z        | _        |j                  | _        t        |      | _	        y r(   )
r*   r+   pixel_shuffle_ratior  r  	inner_dimr  
output_dimr  mlpr9   s     r<   r+   z$Llama4VisionPixelShuffleMLP.__init__  sX    #)#=#= V77D<T<TVW<WXY 55#F+r=   encoded_patchesr?   c                 P    t        || j                        }| j                  |      S r^   )r  r  r  )r:   r  s     r<   rL   z#Llama4VisionPixelShuffleMLP.forward  s#    '9Q9QRxx((r=   rM   rN   rO   r+   r3   rP   rL   rQ   rR   s   @r<   r  r    s#    ,)u|| ) )r=   r  freqs_cic                     |j                   }t        |j                        D cg c]  \  }}|dk(  s||dz
  k(  r|nd }}} | j                  | S c c}}w )Nr!   )ndim	enumeraterE   rD   )r  r   r  idrE   s         r<   reshape_for_broadcastr    sW    ::D=Fu{{=STTQ!q&AMQq0TET8==%   Us   Ac                 B   t        j                   | j                         j                  g | j                  d d dd       }t        j                   |j                         j                  g |j                  d d dd       }t        ||      }|j                  |j                        }t        j                  ||z        j                  d      }t        j                  ||z        j                  d      }|j                  |       |j                  |      fS )NrA   r)   )r  r   r   )r3   r   rp   r   rE   r  r   r   r   r   rq   )r   r   r  query_key_	query_outkey_outs          r<   vision_apply_rotary_embr    s    
 ""#85;;=#8#8#R%++cr:J#RB#RPQ#RSF  !4!4!4!Lciin!Lb!L!!LMD$hfEH{{6==)H""6H#45==a@I  199!<GU#W__S%999r=   c                        e Zd Zdef fdZ	 	 d
dej                  dej                  deej                     dee   de	e
   deej                  eej                     eeej                        f   fd	Z xZS )Llama4VisionAttentionr&   c                    t         |           || _        |j                  | _        |j
                  | _        |j                  |j
                  z  | _        d| _        |j                  | _	        | j                  dz  | _
        t        j                  | j                  | j                  | j                  z  d      | _        t        j                  | j                  | j                  | j                  z  d      | _        t        j                  | j                  | j                  | j                  z  d      | _        t        j                  | j                  | j                  z  | j                  d      | _        y )Nr!   r   TrW   )r*   r+   r&   r/   	embed_dimr   	num_headsr   r   r   r   r1   rY   r   r  r  r  r9   s     r<   r+   zLlama4VisionAttention.__init__  s   ++33**f.H.HH$%!!'!9!9}}d*ii0NUYZii0NUYZii0NUYZii >UYZr=   r>   r  r   r	  r   r?   c                    |j                   d d }g |d| j                  }| j                  |      j                  |      }| j	                  |      j                  |      }	| j                  |      j                  |      }
t        ||	|      \  }}	|j                  dd      }|	j                  dd      }	|
j                  dd      }
t        }| j                  j                  dvrt        | j                  j                     } || ||	|
d f| j                  sdn| j                  d dd|\  }} |j                  g |d j                         }| j!                  |      }||fS )	NrA   )r  r!   r)   )r  flex_attentionr  F)r   r   r   )rE   r   r   rD   r  r  r  r   r   r&   r  r   r   r   r   r   r  )r:   r>   r  r   r	  r   r  r  r  r   r   r  r   r   s                 r<   rL   zLlama4VisionAttention.forward  su    $))#2.88b8$--8{{=166|D[[/44\B
{{=166|D#:<^f#g j#--a3))!Q/
#--a3(F;;++3NN"9$++:Z:Z"[$7
%
  $}}C$2H2H
%
 
%
!\ *k));;;;FFHkk+.L((r=   r  )rM   rN   rO   r   r+   r3   rP   r   r
   r   r   r   rL   rQ   rR   s   @r<   r  r    s    [1 [& 26+/()||() ,,() !.	()
 "%() -.() 
u||Xell3XeELL>Q5RR	S()r=   r  c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )Llama4VisionMLPc                 &   t         |           || _        t        j                         | _        t        j                  |j                  |j                  d      | _	        t        j                  |j                  |j                  d      | _
        y )NTrW   )r*   r+   r&   r1   r  r\   rY   r/   r.   r  r  r9   s     r<   r+   zLlama4VisionMLP.__init__7  se    WWY99V//1I1IPTU99V55v7I7IPTUr=   r>   r?   c                 l    | j                  |      }| j                  |      }| j                  |      }|S r^   )r  r\   r  r  s     r<   rL   zLlama4VisionMLP.forward>  s4    /**=9/r=   r  rR   s   @r<   r  r  6  s$    VU\\ ell r=   r  c            
            e Zd Zdef fdZ	 	 ddej                  dej                  deej                     dee   fdZ	 xZ
S )	Llama4VisionEncoderLayerr&   c                    t         |           |j                  | _        t        |      | _        t        |      | _        t        j                  |j                        | _	        t        j                  |j                        | _
        y r^   )r*   r+   r/   r  r'  r  r  r1   rA  r,  r-  r9   s     r<   r+   z!Llama4VisionEncoderLayer.__init__F  sb    !--.v6"6*!||F,>,>?(*V5G5G(H%r=   hidden_stater  r   output_attentionsc                     |}| j                  |      }| j                  |||      \  }}||z   }|}| j                  |      }| j                  |      }||z   }|f}|r||fz  }|S )N)r  r   )r,  r'  r-  r  )r:   r  r  r   r  r1  r   r  s           r<   rL   z Llama4VisionEncoderLayer.forwardP  s      ++L9%)^^) &4 &
"l
  ,.  44\Bxx-,./&Gr=   r  )rM   rN   rO   r   r+   r3   rP   r   r4  rL   rQ   rR   s   @r<   r  r  E  sZ    I1 I 26,0ll ,, !.	
 $D>r=   r  c                        e Zd ZdZdef fdZ	 	 	 	 ddej                  dej                  deej                     dee	   dee	   d	ee	   d
e
eef   fdZ xZS )Llama4VisionEncoderz
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`Llama4VisionEncoderLayer`].

    Args:
        config: Llama4VisionConfig
    r&   c                     t         |           || _        t        j                  t        |j                        D cg c]  }t        |       c}      | _        d| _	        || _        y c c}w )NF)
r*   r+   r&   r1   rX  rY  rZ  r  r[  r^  )r:   r&   r3  r;   s      r<   r+   zLlama4VisionEncoder.__init__z  sW    mmuU[UmUmOn$o!%=f%E$op&+# %ps   A*r>   r  r   r  output_hidden_statesreturn_dictr?   c                 z   ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }|rdnd}|rdnd}| j                  D ]&  }	|r||fz   } |	||||      }
|r	||
d   fz   }|
d   }( |r||fz   }|st        d |||fD              S t        |||      S )ad  
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        Nr0  )r  r   r  r  r!   r   c              3   &   K   | ]	  }||  y wr^   r0  .0vs     r<   	<genexpr>z.Llama4VisionEncoder.forward.<locals>.<genexpr>  s     eqWXWde   rg  r>   rR  )r&   r  r  use_return_dictr[  r   r   )r:   r>   r  r   r  r  r  encoder_statesall_attentionsencoder_layerlayer_outputss              r<   rL   zLlama4VisionEncoder.forward  s    > 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]30d![[ 	-M#!/=2B!B)*-"3!	M !!/=3C2E!E)!,M	-   +}.>>Ne]NN$Seee+>Vd
 	
r=   NNNN)rM   rN   rO   r   r   r+   r3   rP   r   r4  r   r   r   rL   rQ   rR   s   @r<   r  r  q  s    1  26,0/3&*?
||?
 ,,?
 !.	?

 $D>?
 'tn?
 d^?
 
uo%	&?
r=   r  c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )Llama4UnfoldConvolutionc                 <   t         |           |j                  }t        |t              r||f}t
        j                  j                  ||j                        | _        t        j                  |j                  |d   z  |d   z  |j                  d      | _        y )N)kernel_sizestrider   r!   FrW   )r*   r+   r  r   r  r3   r1   UnfoldunfoldrY   num_channelsr/   linear)r:   r&   r  r;   s      r<   r+   z Llama4UnfoldConvolution.__init__  s    ''k3'&4Khhoo+fFWFWoXii+a.0;q>A
r=   r>   r?   c                 p    | j                  |      }|j                  ddd      }| j                  |      }|S )Nr   r)   r!   )r  r  r  r  s     r<   rL   zLlama4UnfoldConvolution.forward  s8    M2%--aA6M2r=   r  rR   s   @r<   r  r    s#    

U\\ ell r=   r  c                   $     e Zd Z fdZd Z xZS )Llama4VisionRotaryEmbeddingc                    t         |           |j                  |j                  z  }t	        j
                  |dz  t        j                        j                  |dz  d      }t	        j                  ||d d gd      }d|d<   ||z  }||z  }|j                  |j                  z  dz  }d|j                  t	        j
                  d|d      d |dz   j                         |z  z  z  }|dz   d	   |d d d d f   z  j                  dd
      }|dz   d	   |d d d d f   z  j                  dd
      }	t	        j                  ||	gd
      j                         j                         dd d df   }
|
j                  |j                  d
dd      dk  d      }
t	        j                   t	        j"                  t	        j$                  |
      t	        j&                  |
      gd
            }|| _        y )Nr)   )r   r!   r   rB   r   )rA   rA   r  ).NrA   .)r*   r+   
image_sizer  r3   rj  int32r   catr/   r   
rope_thetarp   repeat_interleaver   masked_fillr   stackcossinr  )r:   r&   idximg_idxfrequencies_xfrequencies_yfreq_dim	rope_freqfreqs_xfreqs_yr   rp  r;   s               r<   r+   z$Llama4VisionRotaryEmbedding.__init__  s   6#4#44,,sAvU[[9AA#q&!L))Wgbqk2:#3%%)C)CCqH6,,a11MN_QY]^Q^1`1f1f1hks1stu	!A%y1IdD!m4LL__`agi_j!A%y1IdD!m4LL__`agi_j		7G,"5;;=HHJ3PSRSPS8T!!'//"a";a"?C((eii6F		RWHX5Y_a)bc r=   c                 L    | j                   j                  |j                        S r^   )r  r   r   r  s     r<   rL   z#Llama4VisionRotaryEmbedding.forward  s    }} 4 455r=   r`   rR   s   @r<   r  r    s    !"6r=   r  c                        e Zd ZU dZdgZeed<   def fdZd Z	 	 	 	 dde	j                  dee	j                     dee   d	ee   d
ee   deeee	j                  df   f   fdZ xZS )rC  vision_modelr  r&   c                 r   t         |   |       |j                  | _        |j                  | _        |j                  | _        |j
                  | _        | j                  | j                  z  dz  dz   | _        |j                  dz  | _        t        |      | _	        t        j                  | j                  t        j                  | j                        z        | _        t        j                  | j                  t        j                  | j                  | j                        z        | _        t!        |      | _        t        j$                  | j                        | _        t        j$                  | j                        | _        t+        |      | _        t/        |      | _        | j3                          y )Nr)   r!   r   )r*   r+   r  r  r/   r  r  rE  r  patch_embeddingr1   r2   r3   randnrD  rF  r  rotary_embeddingrA  layernorm_prelayernorm_postr  rQ  r  vision_adapterr_  r9   s     r<   r+   zLlama4VisionModel.__init__  sA     ++ ++!--"// OOt>1DqH''-
6v>!||DJJTEUEU9V,VW(*TZZ%++dN^N^`d`p`pBq5q(r% ;F C  \\$*:*:; ll4+;+;< )0
9&Ar=   c                     | j                   S )zg
        This function is used to fetch the first embedding layer to activate grads on inputs.
        )r  rt   s    r<   get_input_embeddingsz&Llama4VisionModel.get_input_embeddings  s     ###r=   pixel_valuesr   r  r  r  r?   .c                    ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }|j                  \  }}}}	d}
d}| j                  |      }|j                  \  }}}|j                  ||
z  |z  ||      }| j                  j                  |j                  d   d|j                  d         }t        j                  ||gd      }|dz  }|j                  ||
z  |||      }| j                  j                  |j                  |j                        }||z   }| j                  |      }|j!                  |d|      }| j#                  |      }| j%                  |d|||      }|j&                  }| j)                  |      }|ddddddf   }| j+                  |      }|r|j,                  nd}|r|d   }nd}|st/        d	 |||fD              S t1        |||
      S )a  

        Example:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, MllamaVisionModel

        >>> checkpoint = "meta-llama/Llama-3.2-11B-Vision"
        >>> model = MllamaVisionModel.from_pretrained(checkpoint)
        >>> processor = AutoProcessor.from_pretrained(checkpoint)

        >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)
        >>> inputs = processor(images=image, return_tensors="pt")

        >>> output = model(**inputs)

        >>> print(output.last_hidden_state.shape)
        torch.Size([1, 1, 4, 1025, 7680])
        ```
        Nr!   r   rA   rB   r   r   )r   r  r  r  r)   c              3   &   K   | ]	  }||  y wr^   r0  r  s     r<   r  z,Llama4VisionModel.forward.<locals>.<genexpr>j  s     _qQRQ^_r  r  )r&   r  r  r  rE   r  r   rD  r   r3   r  rF  r   r   r   r  rD   r  rQ  rg  r  r  r>   r   r   )r:   r  r   r  r  r  batch_size_times_num_tilesr  r  r  num_concurrent_media
num_chunksr  r3  r  r   rD  positional_embeddingr  r}   r>   rR  s                         r<   rL   zLlama4VisionModel.forward  sS   > 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B] COBTBT?"L&% 
++L9%1%7%7";
 $++&)==
JKYc
 ..55l6H6H6KQP\PbPbcePfgyy,!@aHq $++&)==z;Xb
  $<<??lFXFXamatat?u#&::)),7#(()CRT((6!5/  
 //**<8#AssAI. **<80D,,$JJ_\=*$M___*'!
 	
r=   r  )rM   rN   rO   rs  rr  r   r   r+   r  r3   rP   r   r4  r   r   r   rL   rQ   rR   s   @r<   rC  rC    s    &341 2$ 26,0/3&*_
ll_
 !._
 $D>	_

 'tn_
 d^_
 
ellC&7 88	9_
r=   rC  c            (           e Zd ZU ddgZi ZdZeed<   def fdZd Z	d Z
d Zd	 Zd
 Zd Zdej                   deeee   f   defdZdej,                  dej                   dej                   fdZe	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d#dej,                  dej                   deej4                     deej,                     dee   deej                      deeeee   f      dee   deej,                     dee   dee   dee   dee   deej,                     deeej4                  f   dej4                  dee   d eee f   f$d!       Z!	 	 	 	 	 	 d$d"Z" xZ#S )%Llama4ForConditionalGenerationr#  r   r&   c                 h   t         |   |       t        |j                        | _        t        |      | _        t        |j                        | _	        |j                  j                  | _
        | j                  j                  | j                  j                  nd| _        | j                          y )NrA   )r*   r+   rC  r  r  r  multi_modal_projectorrv  r;  rw  rV  r&   rU  r_  r9   s     r<   r+   z'Llama4ForConditionalGeneration.__init__y  s     -f.B.BC%>v%F"/0B0BC ,,778<8P8P8\DKK44bdr=   c                 6    | j                   j                         S r^   )rw  r  rt   s    r<   r  z3Llama4ForConditionalGeneration.get_input_embeddings  s    ""7799r=   c                 :    | j                   j                  |       y r^   )rw  set_input_embeddings)r:   r   s     r<   r$  z3Llama4ForConditionalGeneration.set_input_embeddings  s    007r=   c                 6    | j                   j                         S r^   )rw  get_output_embeddingsrt   s    r<   r&  z4Llama4ForConditionalGeneration.get_output_embeddings  s    ""88::r=   c                 :    | j                   j                  |       y r^   )rw  set_output_embeddings)r:   new_embeddingss     r<   r(  z4Llama4ForConditionalGeneration.set_output_embeddings  s    11.Ar=   c                 :    | j                   j                  |       y r^   )rw  set_decoder)r:   decoders     r<   r+  z*Llama4ForConditionalGeneration.set_decoder  s    ''0r=   c                 6    | j                   j                         S r^   )rw  get_decoderrt   s    r<   r.  z*Llama4ForConditionalGeneration.get_decoder  s    ""..00r=   r  vision_feature_layervision_feature_select_strategyc                     |dvrt        d| j                         |j                         D ci c]  \  }}|	|| }}} | j                  |fddi|}|j                  }|S c c}}w )a  
        Obtains image last hidden states from the vision tower and apply al projection.

        Args:
            pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`)
               The tensors corresponding to the input images.
            vision_feature_layer (`Union[int, list[int]]`):
                The index of the layer to select the vision feature. If multiple indices are provided,
                the vision feature of the corresponding indices will be concatenated to form the
                vision features.
            vision_feature_select_strategy (`str`):
                The feature selection strategy used to select the vision feature from the vision backbone.
                Can be one of `"default"` or `"full"`
        Returns:
            image_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`).
        )r   fullz$Unexpected select feature strategy: r  F)rh  r0  itemsr  rg  )	r:   r  r/  r0  r   kr  image_outputsr  s	            r<   get_image_featuresz1Llama4ForConditionalGeneration.get_image_features  s    . *1DDCDDgDgChijj#)<<>C41aQ]!Q$CC))),]U]V\]$66 Ds
   
A&A&r`  ra  r  c                 ,   |m| | j                         t        j                  | j                  j                  t        j
                  |j                              k(  }|j                  d      }n|| j                  j                  k(  }|j                         }|j                  d      j                  |      j                  |j                        }||   j                         |j                         k7  rt        d| d|j                  d          |S )z
        Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
        equal to the length of multimodal features. If the lengths are different, an error is raised.
        r  rA   z6Image features and image tokens do not match: tokens: z, features r   )r  r3   tensorr&   image_token_idlongr   allr   rk  	expand_asr   numelrh  rE   )r:   r`  ra  r  special_image_maskn_image_tokenss         r<   get_placeholder_maskz3Llama4ForConditionalGeneration.get_placeholder_mask  s    !.2M$2K2K2MT[[77uzzR_RfRfg3 " "4!7!7!;!*dkk.H.H!H+//1/99"=GGVYYZgZnZno+,2248L8L8NNHHXXcdrdxdxyzd{c|}  "!r=   r   r   r	  r{  r.  r  r  r  r  r|  image_sizesr   r?   c                 *   ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }||n| j                   j                  j
                  }||n| j                   j                  j                  }|du |duz  rt        d      ||t        d      | | j                         |      }|| j                  ||||      }|j                  d|j                  d            }| j                  |      j                  |j                  |j                        }| j!                  |||      }|j#                  ||      } | j$                  d|||||
|||||d
|}|d   }d}|	<||dd|j&                  d	   d	z
   df   j                  |j                        }|d
ddddf   |j                  |j                        dk7     j)                         }|	d
d	df   |j                  |	j                        dk7     j)                         }n1|d
ddddf   j)                         }|	d
d	df   j)                         }t+        j,                         } ||j                  d|j                  d            |j                  d      j                  |j                              }|s|f|d	d z   }||f|z   S |S t/        |||j0                  |j2                  |j4                  |      S d      S )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, LlavaForConditionalGeneration

        >>> model = LlavaForConditionalGeneration.from_pretrained("llava-hf/llava-1.5-7b-hf")
        >>> processor = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf")

        >>> prompt = "USER: <image>\nWhat's the content of the image? ASSISTANT:"
        >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, text=prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(**inputs, max_new_tokens=15)
        >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "USER:  \nWhat's the content of the image? ASSISTANT: The image features a busy city street with a stop sign prominently displayed"
        ```Nrc  zdYou cannot specify both pixel_values and inputs_embeds at the same time, and must specify either one)r  r/  r0  rA  rA   )ra  r  )
r   r   r	  ra  r.  r  r  r  r  r|  r   r!   .)r  r~  r	  r>   rR  r  r0  )r&   r  r  r  r  r/  r0  rh  r  r6  rD   r  r!  r   r   r   r@  masked_scatterrw  rE   r   r1   CrossEntropyLossr  r	  r>   rR  )r:   r`  r  r   r   r	  ra  r/  r0  r{  r.  r  r  r  r  r|  rA  r   r  vision_flatprojected_vision_flatr>  r  r~  r  shift_attention_maskshift_logitsshift_labelsloss_fctr}   s                                 r<   rL   z&Llama4ForConditionalGeneration.forward  s   b 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B] $/ !**?? 	 .9 +**II 	' -t";<YZZ#(Av   7D557	BM#!44)%9/M'	 5 N )--b.2E2Eb2IJK$($>$>{$K$N$N$$m&9&9%! "&!:!:G\ "; " *889KMbcM%$%% 
)%+'/!5#))
 
 ) (6a6<<?Q;N9O9Q6Q'R'U'UV\VcVc'd$%c3B3k23G3J3J6==3Y]^3^_jjl%c12g/C/F/Fv}}/UYZ/Z[ffh%c3B3k2==?%c12g99;**,H!!"l&7&7&;<l>O>OPR>S>V>VWcWjWj>kD Y,F'+'7D7V#CVC+#33!//))2>2J
 	
 QU
 	
r=   c           	      f     | j                   j                  |f|||||d|}	|d   dk(  r||	d<   |	S )N)r	  ra  r   r  r|  r   r  )rw  prepare_inputs_for_generation)
r:   r`  r	  ra  r  r   r  r|  r   model_inputss
             r<   rL  z<Llama4ForConditionalGeneration.prepare_inputs_for_generationW  s_     It**HH
+')))
 
 !! ,8L(r=   )NNNNNNNNNNNNNNr   N)NNNNNN)$rM   rN   rO   rr  r  rs  r"   r   r+   r  r$  r&  r(  r+  r.  r3   r5  r   r  r  r   r6  r!  r@  r   r   rP   r
   r4  r   r   r   r  rL   rL  rQ   rR   s   @r<   r  r  s  sb   13MNH	| 	:8;B11'' $CcN3 ),	<"))":?:K:K"]b]n]n".  '+*.1537+/59@D8<-1$(,0/3&*5934$(#I
##I
 ''I
 !.	I

 u//0I
 "%I
   1 12I
 'uS$s)^'<=I
 )1I
 ))*I
 D>I
 $D>I
 'tnI
 d^I
 !!1!12I
  c5<</0!I
" \\#I
$ +,%I
& 
u22	3'I
 I
\ r=   r  )r7  rP  rC  rv  r  )r  )ar  dataclassesr   typingr   r   r   r3   torch.nnr1   torch.nn.functionalr   r  /transformers.models.llama4.configuration_llama4r   activationsr	   cache_utilsr
   r   
generationr   integrationsr   masking_utilsr   r   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   r   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   utils.deprecationr   utils.genericr    configuration_llama4r"   r#   
get_loggerrM   loggerModuler%   rT   rb   rw   rY   r   r   r   rP   r   r   r  r   rp   r   r   r   r#  r7  rP  rv  r  r  r  r  r  r  r  r  r  r  r  r  r  rC  r  __all__r0  r=   r<   <module>rf     s     ! , ,     N ! . ) 7 K B 9 m m K F & R R 0 / @ 
		H	%		 B)BII )$!uxx !=		 =(,299 , _-"BII " ."*		 B	2	2	2 ||	2 5<<%&		2	UU\\ 	U# 	U%,, 	U( %II%<<% 
% <<	%
 U\\*% % %D %II%<<% 
% <<	%
 U\\*% % %4[)")) [)|37 3l #KO #K #KL `
+ `
 `
FN
- N
b 
<; < <2;uxx ;"		 (
)")) 
)!ELL ! !:<<:	: ll: 5<<%&	:8)BII 8)vbii )9 )XO
")) O
dbii (6")) 6,C
- C
L@%:O @Fr=   