"""PyTorch Phi-3 model."""

from typing import Callable, Optional

import torch
import torch.utils.checkpoint
from torch import nn

from ...activations import ACT2FN
from ...cache_utils import Cache
from ...generation import GenerationMixin
from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS
from ...processing_utils import Unpack
from ...utils import logging
from ...utils.deprecation import deprecate_kwarg
from ..mistral.modeling_mistral import (
    MistralDecoderLayer,
    MistralForCausalLM,
    MistralForSequenceClassification,
    MistralForTokenClassification,
    MistralPreTrainedModel,
    eager_attention_forward,
    rotate_half,
)
from .configuration_phi3 import Phi3Config


logger = logging.get_logger(__name__)

_CHECKPOINT_FOR_DOC = "microsoft/Phi-3-mini-4k-instruct"
_CONFIG_FOR_DOC = "Phi3Config"


class Phi3MLP(nn.Module):
    def __init__(self, config):
        super().__init__()

        self.config = config
        self.gate_up_proj = nn.Linear(config.hidden_size, 2 * config.intermediate_size, bias=False)
        self.down_proj = nn.Linear(config.intermediate_size, config.hidden_size, bias=False)
        self.activation_fn = ACT2FN[config.hidden_act]

    def forward(self, hidden_states: torch.FloatTensor) -> torch.FloatTensor:
        up_states = self.gate_up_proj(hidden_states)

        gate, up_states = up_states.chunk(2, dim=-1)
        up_states = up_states * self.activation_fn(gate)

        return self.down_proj(up_states)


def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
    """Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    """
    cos = cos.unsqueeze(unsqueeze_dim)
    sin = sin.unsqueeze(unsqueeze_dim)

    rotary_dim = cos.shape[-1]
    q_rot, q_pass = q[..., :rotary_dim], q[..., rotary_dim:]
    k_rot, k_pass = k[..., :rotary_dim], k[..., rotary_dim:]

    q_embed = torch.cat([(q_rot * cos) + (rotate_half(q_rot) * sin), q_pass], dim=-1)
    k_embed = torch.cat([(k_rot * cos) + (rotate_half(k_rot) * sin), k_pass], dim=-1)
    return q_embed, k_embed


class Phi3Attention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, config: Phi3Config, layer_idx: Optional[int] = None):
        super().__init__()
        self.config = config
        self.layer_idx = layer_idx
        self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
        self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads
        self.scaling = self.head_dim**-0.5
        self.attention_dropout = config.attention_dropout
        self.is_causal = True

        op_size = config.num_attention_heads * self.head_dim + 2 * (config.num_key_value_heads * self.head_dim)
        self.o_proj = nn.Linear(config.num_attention_heads * self.head_dim, config.hidden_size, bias=False)
        self.qkv_proj = nn.Linear(config.hidden_size, op_size, bias=False)

    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
    def forward(
        self,
        hidden_states: torch.Tensor,
        position_embeddings: tuple[torch.Tensor, torch.Tensor],
        attention_mask: Optional[torch.Tensor],
        past_key_values: Optional[Cache] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
        input_shape = hidden_states.shape[:-1]
        hidden_shape = (*input_shape, -1, self.head_dim)

        # The fused qkv projection is split into query, key and value slices.
        qkv = self.qkv_proj(hidden_states)
        query_pos = self.config.num_attention_heads * self.head_dim
        query_states = qkv[..., :query_pos]
        key_states = qkv[..., query_pos : query_pos + self.config.num_key_value_heads * self.head_dim]
        value_states = qkv[..., query_pos + self.config.num_key_value_heads * self.head_dim :]

        query_states = query_states.view(hidden_shape).transpose(1, 2)
        key_states = key_states.view(hidden_shape).transpose(1, 2)
        value_states = value_states.view(hidden_shape).transpose(1, 2)

        cos, sin = position_embeddings
        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)

        if past_key_values is not None:
            # sin and cos are specific to RoPE models; cache_position needed for the static cache
            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
            key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        attn_output, attn_weights = attention_interface(
            self,
            query_states,
            key_states,
            value_states,
            attention_mask,
            dropout=0.0 if not self.training else self.attention_dropout,
            scaling=self.scaling,
            sliding_window=getattr(self.config, "sliding_window", None),
            **kwargs,
        )

        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
        attn_output = self.o_proj(attn_output)
        return attn_output, attn_weights


class Phi3DecoderLayer(MistralDecoderLayer):
    def __init__(self, config: Phi3Config, layer_idx: int):
        super().__init__(config, layer_idx)
        self.config = config
        self.self_attn = Phi3Attention(config=config, layer_idx=layer_idx)
        self.mlp = Phi3MLP(config)

        self.resid_attn_dropout = nn.Dropout(config.resid_pdrop)
        self.resid_mlp_dropout = nn.Dropout(config.resid_pdrop)

    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        use_cache: Optional[bool] = False,
        cache_position: Optional[torch.LongTensor] = None,
        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]:
        residual = hidden_states
        hidden_states = self.input_layernorm(hidden_states)

        # Self Attention
        hidden_states, self_attn_weights = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            use_cache=use_cache,
            cache_position=cache_position,
            position_embeddings=position_embeddings,
            **kwargs,
        )
        hidden_states = residual + self.resid_attn_dropout(hidden_states)

        residual = hidden_states
        hidden_states = self.post_attention_layernorm(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = residual + self.resid_mlp_dropout(hidden_states)

        return hidden_states


class Phi3PreTrainedModel(MistralPreTrainedModel):
    _version = "0.0.5"


class Phi3ForCausalLM(MistralForCausalLM):
    def prepare_inputs_for_generation(
        self,
        input_ids,
        past_key_values=None,
        attention_mask=None,
        inputs_embeds=None,
        cache_position=None,
        position_ids=None,
        use_cache=True,
        logits_to_keep=None,
        **kwargs,
    ):
        # Overwritten -- this model may need to switch between short and long rope, invalidating the cache in the
        # process.

        # When the input length first crosses the long/short-factor switching point, force the cache to be
        # recomputed. This makes that single step slower, but avoids the failure mode of mixing rope factors.
        if (
            past_key_values
            and self.config.rope_scaling
            and input_ids.shape[1] >= self.config.original_max_position_embeddings + 1
        ):
            past_length = cache_position[0]
            if past_length <= self.config.original_max_position_embeddings:
                past_key_values = None

        model_inputs = GenerationMixin.prepare_inputs_for_generation(
            self,
            input_ids=input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            inputs_embeds=inputs_embeds,
            cache_position=cache_position,
            position_ids=position_ids,
            use_cache=use_cache,
            logits_to_keep=logits_to_keep,
            **kwargs,
        )
        return model_inputs


class Phi3ForSequenceClassification(MistralForSequenceClassification):
    pass


class Phi3ForTokenClassification(MistralForTokenClassification):
    pass


__all__ = [
    "Phi3PreTrainedModel",
    "Phi3Model",
    "Phi3ForCausalLM",
    "Phi3ForSequenceClassification",
    "Phi3ForTokenClassification",
]
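
# Usage sketch (illustrative only; kept as a comment so nothing runs at import time). It assumes the public
# `transformers` Auto classes and the `_CHECKPOINT_FOR_DOC` checkpoint referenced above, and is not part of this
# module's API:
#
#     from transformers import AutoModelForCausalLM, AutoTokenizer
#
#     tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct")
#     model = AutoModelForCausalLM.from_pretrained("microsoft/Phi-3-mini-4k-instruct")
#     inputs = tokenizer("Hello, Phi-3!", return_tensors="pt")
#     output_ids = model.generate(**inputs, max_new_tokens=20)
#     print(tokenizer.decode(output_ids[0], skip_special_tokens=True))
#
# Note that `apply_rotary_pos_emb` above implements *partial* rotary embeddings: only the first
# `rotary_dim = cos.shape[-1]` channels of each head are rotated, and the remaining channels pass through unchanged.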