
import math
from typing import Optional

import torch
from torch import nn

from ...cache_utils import Cache, StaticCache
from ...modeling_flash_attention_utils import _flash_attention_forward, flash_attn_supports_top_left_mask
from ...modeling_utils import PreTrainedModel
from ...utils import logging
from ...utils.deprecation import deprecate_kwarg
from ..gemma.modeling_gemma import GemmaForCausalLM
from ..llama.modeling_llama import (
    LlamaDecoderLayer,
    LlamaForQuestionAnswering,
    LlamaForSequenceClassification,
    LlamaForTokenClassification,
    LlamaModel,
    LlamaPreTrainedModel,
    apply_rotary_pos_emb,
    repeat_kv,
)
from ..mistral.modeling_mistral import MistralMLP
from .configuration_diffllama import DiffLlamaConfig


logger = logging.get_logger(__name__)

_CHECKPOINT_FOR_DOC = "kajuma/DiffLlama-0.3B-handcut"
_CONFIG_FOR_DOC = "DiffLlamaConfig"


class DiffLlamaMLP(MistralMLP):
    pass


def lambda_init_fn(layer_idx):
    return 0.8 - 0.6 * math.exp(-0.3 * layer_idx)


class DiffLlamaAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, config: DiffLlamaConfig, layer_idx: Optional[int] = None):
        super().__init__()
        self.config = config
        self.layer_idx = layer_idx
        if layer_idx is None:
            logger.warning_once(
                f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will "
                "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` "
                "when creating this class."
            )

        self.attention_dropout = config.attention_dropout
        self.hidden_size = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = getattr(config, "head_dim", self.hidden_size // self.num_heads)
        self.num_key_value_heads = config.num_key_value_heads
        self.num_key_value_groups = self.num_heads // self.num_key_value_heads
        self.max_position_embeddings = config.max_position_embeddings
        self.rope_theta = config.rope_theta
        self.is_causal = True

        self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=config.attention_bias)
        self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
        self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
        self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=config.attention_bias)

        self.lambda_init = lambda_init_fn(layer_idx)
        self.lambda_q1 = nn.Parameter(torch.normal(0, config.lambda_std_dev, size=(self.head_dim,)))
        self.lambda_k1 = nn.Parameter(torch.normal(0, config.lambda_std_dev, size=(self.head_dim,)))
        self.lambda_q2 = nn.Parameter(torch.normal(0, config.lambda_std_dev, size=(self.head_dim,)))
        self.lambda_k2 = nn.Parameter(torch.normal(0, config.lambda_std_dev, size=(self.head_dim,)))
        self.groupnorm = nn.RMSNorm(2 * self.head_dim, eps=config.rms_norm_eps, elementwise_affine=False)

    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
    def forward(
        self,
        hidden_states: torch.Tensor,
        position_embeddings: tuple[torch.Tensor, torch.Tensor],
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        use_cache: bool = False,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs,
    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
        bsz, target_len, _ = hidden_states.size()
        q_len = target_len

        query_states = self.q_proj(hidden_states)
        key_states = self.k_proj(hidden_states)
        value_states = self.v_proj(hidden_states)

        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)

        cos, sin = position_embeddings
        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)

        if past_key_values is not None:
            # sin and cos are specific to RoPE models; cache_position is needed for the static cache
            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
            key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)

        key_states = repeat_kv(key_states, self.num_key_value_groups)
        value_states = repeat_kv(value_states, self.num_key_value_groups)
        # Fold the two head groups into the channel dimension so each retained head carries 2 * head_dim values.
        value_states = torch.cat(torch.chunk(value_states, 2, dim=1), dim=-1)
        value_states = value_states.repeat(1, 2, 1, 1)

        attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)

        if attention_mask is not None:  # no matter the length, we just slice it
            causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
            attn_weights = attn_weights + causal_mask

        # upcast attention to fp32
        attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
        attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training)

        lambda_1 = torch.exp(torch.sum(self.lambda_q1 * self.lambda_k1, dim=-1, dtype=torch.float32)).to(
            query_states.dtype
        )
        lambda_2 = torch.exp(torch.sum(self.lambda_q2 * self.lambda_k2, dim=-1, dtype=torch.float32)).to(
            query_states.dtype
        )
        lambda_full = lambda_1 - lambda_2 + self.lambda_init
        attn_output = torch.matmul(attn_weights, value_states)
        attn_output1, attn_output2 = torch.chunk(attn_output, 2, dim=1)
        attn_output = attn_output1 - lambda_full * attn_output2
        attn_output = (1 - self.lambda_init) * self.groupnorm(attn_output)

        attn_output = attn_output.transpose(1, 2).contiguous()
        attn_output = attn_output.reshape(bsz, target_len, -1)

        attn_output = self.o_proj(attn_output)

        return attn_output, attn_weights

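# Orientation note for the class above: `DiffLlamaAttention` computes two softmax attention maps that
# share the same queries and keys but read the two halves of the value heads, subtracts the second map
# scaled by lambda = exp(lambda_q1 . lambda_k1) - exp(lambda_q2 . lambda_k2) + lambda_init, then applies
# per-head-group RMS normalization and rescales by (1 - lambda_init). `lambda_init_fn` moves lambda_init
# from 0.2 in the first layer toward 0.8 in deep layers. The flash-attention and SDPA subclasses below
# reuse exactly these weights and only change how the two attention maps are computed.
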
class DiffLlamaFlashAttention2(DiffLlamaAttention):
    """
    DiffLlama flash attention module. This module inherits from `DiffLlamaAttention` as the weights of the module stay
    untouched. The only required change is on the forward pass, where it needs to correctly call the public API of
    flash attention and deal with padding tokens in case the input contains any of them.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # flash_attn<2.1 generates a top-left aligned causal mask, while what is needed here is bottom-right
        # alignment (the default for flash_attn>=2.1). This attribute is used to handle the difference.
        self._flash_attn_uses_top_left_mask = flash_attn_supports_top_left_mask()

    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
    def forward(
        self,
        hidden_states: torch.Tensor,
        position_embeddings: tuple[torch.Tensor, torch.Tensor],
        attention_mask: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        use_cache: bool = False,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs,
    ) -> tuple[torch.Tensor, None]:
        if isinstance(past_key_values, StaticCache):
            raise ValueError(
                "`static` cache implementation is not compatible with `attn_implementation==flash_attention_2` "
                "make sure to use `sdpa` in the mean time, and open an issue at https://github.com/huggingface/transformers"
            )

        bsz, q_len, _ = hidden_states.size()

        query_states = self.q_proj(hidden_states)
        key_states = self.k_proj(hidden_states)
        value_states = self.v_proj(hidden_states)

        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)

        if position_embeddings is None:
            logger.warning_once(
                "The attention layers in this model are transitioning from computing the RoPE embeddings internally "
                "through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed "
                "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.46 `position_ids` will be "
                "removed and `position_embeddings` will be mandatory."
            )
            cos, sin = self.rotary_emb(value_states, position_ids)
        else:
            cos, sin = position_embeddings
        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)

        if past_key_values is not None:
            # sin and cos are specific to RoPE models; cache_position is needed for the static cache
            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
            key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)

        # Flash attention expects the layout [batch_size, seq_length, num_heads, head_dim]
        query_states = query_states.transpose(1, 2)
        key_states = key_states.transpose(1, 2)
        value_states = value_states.transpose(1, 2)

        dropout_rate = self.attention_dropout if self.training else 0.0

        # In PEFT, the layer norms are usually cast to float32 for training stability, which silently casts the
        # hidden states to float32 as well. Cast them back to the expected dtype before calling the kernel.
        input_dtype = query_states.dtype
        device_type = query_states.device.type if query_states.device.type != "mps" else "cpu"
        if input_dtype == torch.float32:
            if torch.is_autocast_enabled():
                target_dtype = (
                    torch.get_autocast_dtype(device_type)
                    if hasattr(torch, "get_autocast_dtype")
                    else torch.get_autocast_gpu_dtype()
                )
            # Handle the case where the model is quantized
            elif hasattr(self.config, "_pre_quantization_dtype"):
                target_dtype = self.config._pre_quantization_dtype
            else:
                target_dtype = self.q_proj.weight.dtype

            logger.warning_once(
                f"The input hidden states seems to be silently casted in float32, this might be related to the fact "
                f"you have upcasted embedding or layer norm layers in float32. We will cast back the input in "
                f"{target_dtype}."
            )

            query_states = query_states.to(target_dtype)
            key_states = key_states.to(target_dtype)
            value_states = value_states.to(target_dtype)

        # Run the kernel once per value half; each half is repeated across the head dimension.
        value_states1, value_states2 = torch.chunk(value_states, 2, dim=2)
        value_states1 = value_states1.repeat(1, 1, 2, 1)
        value_states2 = value_states2.repeat(1, 1, 2, 1)

        attn_output1 = _flash_attention_forward(
            query_states,
            key_states,
            value_states1,
            attention_mask,
            q_len,
            position_ids=position_ids,
            dropout=dropout_rate,
            sliding_window=getattr(self, "sliding_window", None),
            use_top_left_mask=self._flash_attn_uses_top_left_mask,
            is_causal=self.is_causal,
        )

        attn_output2 = _flash_attention_forward(
            query_states,
            key_states,
            value_states2,
            attention_mask,
            q_len,
            position_ids=position_ids,
            dropout=dropout_rate,
            sliding_window=getattr(self, "sliding_window", None),
            use_top_left_mask=self._flash_attn_uses_top_left_mask,
            is_causal=self.is_causal,
        )

        attn_output = torch.cat([attn_output1, attn_output2], dim=-1)
        attn_output1, attn_output2 = torch.chunk(attn_output, 2, dim=3)

        lambda_1 = torch.exp(torch.sum(self.lambda_q1 * self.lambda_k1, dim=-1, dtype=torch.float32)).to(
            query_states.dtype
        )
        lambda_2 = torch.exp(torch.sum(self.lambda_q2 * self.lambda_k2, dim=-1, dtype=torch.float32)).to(
            query_states.dtype
        )
        lambda_full = lambda_1 - lambda_2 + self.lambda_init
        attn_output = attn_output1 - lambda_full * attn_output2
        attn_output = (1 - self.lambda_init) * self.groupnorm(attn_output)

        attn_output = attn_output.reshape(bsz, q_len, -1).contiguous()
        attn_output = self.o_proj(attn_output)

        return attn_output, None

class DiffLlamaSdpaAttention(DiffLlamaAttention):
    """
    DiffLlama attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
    `DiffLlamaAttention` as the weights of the module stay untouched. The only changes are on the forward pass to
    adapt to the SDPA API.
    """

    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
    def forward(
        self,
        hidden_states: torch.Tensor,
        position_embeddings: tuple[torch.Tensor, torch.Tensor],
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        use_cache: bool = False,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs,
    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
        bsz, q_len, _ = hidden_states.size()

        query_states = self.q_proj(hidden_states)
        key_states = self.k_proj(hidden_states)
        value_states = self.v_proj(hidden_states)

        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)

        cos, sin = position_embeddings
        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)

        if past_key_values is not None:
            # sin and cos are specific to RoPE models; cache_position is needed for the static cache
            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
            key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)

        key_states = repeat_kv(key_states, self.num_key_value_groups)
        value_states = repeat_kv(value_states, self.num_key_value_groups)
        value_states = torch.cat(torch.chunk(value_states, 2, dim=1), dim=-1)
        value_states = value_states.repeat(1, 2, 1, 1)

        causal_mask = attention_mask
        if attention_mask is not None:
            causal_mask = causal_mask[:, :, :, : key_states.shape[-2]]

        # SDPA with the memory-efficient backend is bugged (torch==2.1.2) with non-contiguous inputs and a custom
        # attn_mask; see https://github.com/pytorch/pytorch/issues/112577.
        if query_states.device.type == "cuda" and causal_mask is not None:
            query_states = query_states.contiguous()
            key_states = key_states.contiguous()
            value_states = value_states.contiguous()

        # Dispatch to SDPA's flash/efficient kernels through `is_causal` rather than an inline conditional,
        # which would prevent torch.compile's dynamic shapes from compiling.
        is_causal = attention_mask is None and q_len > 1

        attn_output = torch.nn.functional.scaled_dot_product_attention(
            query_states,
            key_states,
            value_states,
            attn_mask=causal_mask,
            dropout_p=self.attention_dropout if self.training else 0.0,
            is_causal=is_causal,
        )

        attn_output1, attn_output2 = torch.chunk(attn_output, 2, dim=1)

        lambda_1 = torch.exp(torch.sum(self.lambda_q1 * self.lambda_k1, dim=-1, dtype=torch.float32)).to(
            query_states.dtype
        )
        lambda_2 = torch.exp(torch.sum(self.lambda_q2 * self.lambda_k2, dim=-1, dtype=torch.float32)).to(
            query_states.dtype
        )
        lambda_full = lambda_1 - lambda_2 + self.lambda_init
        attn_output = attn_output1 - lambda_full * attn_output2
        attn_output = (1 - self.lambda_init) * self.groupnorm(attn_output)

        attn_output = attn_output.transpose(1, 2).contiguous()
        attn_output = attn_output.view(bsz, q_len, -1)

        attn_output = self.o_proj(attn_output)

        return attn_output, None


DIFFLLAMA_ATTENTION_CLASSES = {
    "eager": DiffLlamaAttention,
    "flash_attention_2": DiffLlamaFlashAttention2,
    "sdpa": DiffLlamaSdpaAttention,
}


class DiffLlamaDecoderLayer(LlamaDecoderLayer):
    def __init__(self, config: DiffLlamaConfig, layer_idx: int):
        super().__init__(config, layer_idx)
        self.self_attn = DIFFLLAMA_ATTENTION_CLASSES[config._attn_implementation](config=config, layer_idx=layer_idx)


class DiffLlamaPreTrainedModel(LlamaPreTrainedModel):
    _supports_flex_attn = False
    _supports_attention_backend = False

    def _init_weights(self, module):
        PreTrainedModel._init_weights(self, module)
        if isinstance(module, DiffLlamaAttention):
            module.lambda_q1.data.normal_(0, self.config.lambda_std_dev)
            module.lambda_k1.data.normal_(0, self.config.lambda_std_dev)
            module.lambda_q2.data.normal_(0, self.config.lambda_std_dev)
            module.lambda_k2.data.normal_(0, self.config.lambda_std_dev)


class DiffLlamaModel(LlamaModel):
    pass


class DiffLlamaForCausalLM(GemmaForCausalLM):
    pass


class DiffLlamaForSequenceClassification(LlamaForSequenceClassification):
    pass


class DiffLlamaForQuestionAnswering(LlamaForQuestionAnswering):
    pass


class DiffLlamaForTokenClassification(LlamaForTokenClassification):
    pass


__all__ = [
    "DiffLlamaPreTrainedModel",
    "DiffLlamaModel",
    "DiffLlamaForCausalLM",
    "DiffLlamaForSequenceClassification",
    "DiffLlamaForQuestionAnswering",
    "DiffLlamaForTokenClassification",
]
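

# The block below is a minimal, self-contained sketch of the differential combination used by the
# attention classes above, run on random tensors with illustrative shapes only; `lambda_full` is a
# stand-in scalar for the learned exp(q1.k1) - exp(q2.k2) + lambda_init value. It executes only when
# this file is run directly as a script.
if __name__ == "__main__":
    bsz, num_heads, seq_len, head_dim = 1, 4, 5, 8

    q = torch.randn(bsz, num_heads, seq_len, head_dim)
    k = torch.randn(bsz, num_heads, seq_len, head_dim)
    v = torch.randn(bsz, num_heads, seq_len, head_dim)

    # Fold the two head groups into the channel dimension, as in DiffLlamaAttention.forward.
    v = torch.cat(torch.chunk(v, 2, dim=1), dim=-1).repeat(1, 2, 1, 1)

    weights = torch.softmax(torch.matmul(q, k.transpose(2, 3)) / math.sqrt(head_dim), dim=-1)
    attn = torch.matmul(weights, v)             # [bsz, num_heads, seq_len, 2 * head_dim]
    attn1, attn2 = torch.chunk(attn, 2, dim=1)  # the two attention maps to be differenced

    lambda_init = lambda_init_fn(0)             # 0.2 for the first layer
    lambda_full = 0.5                           # stand-in for the learned re-parameterized scalar
    groupnorm = nn.RMSNorm(2 * head_dim, elementwise_affine=False)

    diff_attn = (1 - lambda_init) * groupnorm(attn1 - lambda_full * attn2)
    print(diff_attn.shape)                      # torch.Size([1, 2, 5, 16])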