
import math
from collections import OrderedDict

import torch
from torch import Tensor, nn

from .utils import logging
from .utils.import_utils import is_torchdynamo_compiling


logger = logging.get_logger(__name__)

class PytorchGELUTanh(nn.Module):
    """
    A fast C implementation of the tanh approximation of the GeLU activation function. See
    https://huggingface.co/papers/1606.08415.

    This implementation is equivalent to NewGELU and FastGELU but much faster. However, it is not an exact numerical
    match due to rounding errors.
    """

    def forward(self, input: Tensor) -> Tensor:
        return nn.functional.gelu(input, approximate="tanh")

class NewGELUActivation(nn.Module):
    """
    Implementation of the GELU activation function currently in Google BERT repo (identical to OpenAI GPT). Also see
    the Gaussian Error Linear Units paper: https://huggingface.co/papers/1606.08415
    """

    def forward(self, input: Tensor) -> Tensor:
        return 0.5 * input * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (input + 0.044715 * torch.pow(input, 3.0))))

class GELUActivation(nn.Module):
    """
    Original Implementation of the GELU activation function in Google BERT repo when initially created. For
    information: OpenAI GPT's GELU is slightly different (and gives slightly different results): 0.5 * x * (1 +
    torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) This is now written in C in nn.functional
    Also see the Gaussian Error Linear Units paper: https://huggingface.co/papers/1606.08415
    """

    def __init__(self, use_gelu_python: bool = False):
        super().__init__()
        if use_gelu_python:
            self.act = self._gelu_python
        else:
            self.act = nn.functional.gelu

    def _gelu_python(self, input: Tensor) -> Tensor:
        return input * 0.5 * (1.0 + torch.erf(input / math.sqrt(2.0)))

    def forward(self, input: Tensor) -> Tensor:
        return self.act(input)

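
# Illustrative sketch (not part of the library API): the erf-based `_gelu_python` path above and
# the tanh approximation in `NewGELUActivation` agree to within small rounding terms, e.g.
#
#     x = torch.linspace(-3.0, 3.0, steps=7)
#     exact = GELUActivation(use_gelu_python=True)(x)   # erf formula
#     approx = NewGELUActivation()(x)                   # tanh approximation
#     torch.testing.assert_close(approx, exact, atol=1e-3, rtol=0.0)  # close, not identical
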
class FastGELUActivation(nn.Module):
    """
    Applies GELU approximation that is slower than QuickGELU but more accurate. See: https://github.com/hendrycks/GELUs
    """

    def forward(self, input: Tensor) -> Tensor:
        return 0.5 * input * (1.0 + torch.tanh(input * 0.7978845608 * (1.0 + 0.044715 * input * input)))

class QuickGELUActivation(nn.Module):
    """
    Applies GELU approximation that is fast but somewhat inaccurate. See: https://github.com/hendrycks/GELUs
    """

    def forward(self, input: Tensor) -> Tensor:
        return input * torch.sigmoid(1.702 * input)

class ClippedGELUActivation(nn.Module):
    """
    Clip the range of possible GeLU outputs between [min, max]. This is especially useful for quantization purpose, as
    it allows mapping negatives values in the GeLU spectrum. For more information on this trick, please refer to
    https://huggingface.co/papers/2004.09602.

    Gaussian Error Linear Unit. Original Implementation of the gelu activation function in Google Bert repo when
    initially created.

    For information: OpenAI GPT's gelu is slightly different (and gives slightly different results): 0.5 * x * (1 +
    torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))). See https://huggingface.co/papers/1606.08415
    """

    def __init__(self, min: float, max: float):
        if min > max:
            raise ValueError(f"min should be < max (got min: {min}, max: {max})")

        super().__init__()
        self.min = min
        self.max = max

    def forward(self, x: Tensor) -> Tensor:
        return torch.clip(gelu(x), self.min, self.max)

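
# Usage sketch (illustrative only): the "gelu_10" entry in ACT2CLS below maps to this class
# with min=-10 and max=10, so outputs are hard-limited to a quantization-friendly range.
#
#     act = ClippedGELUActivation(min=-10, max=10)   # same as get_activation("gelu_10")
#     y = act(20.0 * torch.randn(4, 8))              # every value of y lies in [-10, 10]
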
class AccurateGELUActivation(nn.Module):
    """
    Applies GELU approximation that is faster than default and more accurate than QuickGELU. See:
    https://github.com/hendrycks/GELUs

    Implemented along with MEGA (Moving Average Equipped Gated Attention)
    """

    def __init__(self):
        super().__init__()
        self.precomputed_constant = math.sqrt(2 / math.pi)

    def forward(self, input: Tensor) -> Tensor:
        return 0.5 * input * (1 + torch.tanh(self.precomputed_constant * (input + 0.044715 * torch.pow(input, 3))))

class MishActivation(nn.Module):
    """
    See Mish: A Self-Regularized Non-Monotonic Activation Function (Misra., https://huggingface.co/papers/1908.08681). Also
    visit the official repository for the paper: https://github.com/digantamisra98/Mish
    """

    def __init__(self):
        super().__init__()
        self.act = nn.functional.mish

    def _mish_python(self, input: Tensor) -> Tensor:
        return input * torch.tanh(nn.functional.softplus(input))

    def forward(self, input: Tensor) -> Tensor:
        return self.act(input)

class LinearActivation(nn.Module):
    """
    Applies the linear activation function, i.e. forwarding input directly to output.
    """

    def forward(self, input: Tensor) -> Tensor:
        return input

class LaplaceActivation(nn.Module):
    """
    Applies elementwise activation based on Laplace function, introduced in MEGA as an attention activation. See
    https://huggingface.co/papers/2209.10655

    Inspired by squared relu, but with bounded range and gradient for better stability
    """

    def forward(self, input, mu=0.707107, sigma=0.282095):
        input = (input - mu).div(sigma * math.sqrt(2.0))
        return 0.5 * (1.0 + torch.erf(input))

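
# Worked form (for reference): forward() evaluates the Gaussian CDF
#
#     laplace(x) = 0.5 * (1.0 + erf((x - mu) / (sigma * sqrt(2))))
#
# The defaults mu ≈ 1/sqrt(2) ≈ 0.707107 and sigma ≈ 1/(2*sqrt(pi)) ≈ 0.282095 are the constants
# from the MEGA reference above, chosen so the curve tracks squared ReLU while staying bounded.
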
class ReLUSquaredActivation(nn.Module):
    """
    Applies the relu^2 activation introduced in https://huggingface.co/papers/2109.08668v2
    """

    def forward(self, input):
        relu_applied = nn.functional.relu(input)
        squared = torch.square(relu_applied)
        return squared


class ClassInstantier(OrderedDict):
    def __getitem__(self, key):
        content = super().__getitem__(key)
        cls, kwargs = content if isinstance(content, tuple) else (content, {})
        return cls(**kwargs)

class XIELUActivation(nn.Module):
    """
    Applies the xIELU activation function introduced in https://arxiv.org/abs/2411.13010

    If the user has installed the nickjbrowning/XIELU wheel, we import xIELU CUDA
    Otherwise, we emit a single warning and use xIELU Python
    """

    def __init__(
        self,
        alpha_p_init=0.8,
        alpha_n_init=0.8,
        beta=0.5,
        eps=-1e-6,
        dtype=torch.bfloat16,
        with_vector_loads=False,
    ):
        super().__init__()
        self.alpha_p = nn.Parameter(torch.log(torch.exp(torch.tensor(alpha_p_init, dtype=dtype)) - 1).unsqueeze(0))
        self.alpha_n = nn.Parameter(
            torch.log(torch.exp(torch.tensor(alpha_n_init - beta, dtype=dtype)) - 1).unsqueeze(0)
        )
        self.register_buffer("beta", torch.tensor(beta, dtype=dtype))
        self.register_buffer("eps", torch.tensor(eps, dtype=dtype))
        self.with_vector_loads = with_vector_loads
        # Plain-Python copies of beta/eps for the CUDA op; calling .item() once here keeps it
        # out of any compiled graph (see _xielu_cuda below).
        self._beta_scalar = float(self.beta.detach().cpu().float().item())
        self._eps_scalar = float(self.eps.detach().cpu().float().item())

        self._xielu_cuda_obj = None
        try:
            import xielu.ops  # registers the fused CUDA op with torch.classes

            self._xielu_cuda_obj = torch.classes.xielu.XIELU()
            msg = "Using experimental xIELU CUDA."
            try:
                from torch._dynamo import allow_in_graph

                self._xielu_cuda_fn = allow_in_graph(self._xielu_cuda)
                msg += " Enabled torch._dynamo for xIELU CUDA."
            except Exception as err:
                msg += f" Could not enable torch._dynamo for xIELU ({err}) - this may result in slower performance."
                self._xielu_cuda_fn = self._xielu_cuda
            logger.warning_once(msg)
        except Exception as err:
            logger.warning_once(
                "CUDA-fused xIELU not available (%s) – falling back to a Python version.\n"
                "For CUDA xIELU (experimental), `pip install git+https://github.com/nickjbrowning/XIELU`",
                str(err),
            )

    def _xielu_python(self, x: Tensor) -> Tensor:
        alpha_p = nn.functional.softplus(self.alpha_p)
        alpha_n = self.beta + nn.functional.softplus(self.alpha_n)
        return torch.where(
            x > 0,
            alpha_p * x * x + self.beta * x,
            (torch.expm1(torch.min(x, self.eps)) - x) * alpha_n + self.beta * x,
        )

    def _xielu_cuda(self, x: Tensor) -> Tensor:
        """Firewall function to prevent torch.compile from seeing .item() calls"""
        original_shape = x.shape
        # The CUDA kernel expects a 3D tensor: pad missing dims, flatten extra ones.
        while x.dim() < 3:
            x = x.unsqueeze(0)
        if x.dim() > 3:
            x = x.view(-1, 1, x.size(-1))
        if original_shape != x.shape:
            logger.warning_once(
                "Warning: xIELU input tensor expects 3 dimensions but got (shape: %s). Reshaping to (shape: %s).",
                original_shape,
                x.shape,
            )
        result = self._xielu_cuda_obj.forward(
            x,
            self.alpha_p,
            self.alpha_n,
            self._beta_scalar,
            self._eps_scalar,
            self.with_vector_loads,
        )
        return result.view(original_shape)

    def forward(self, input: Tensor) -> Tensor:
        if self._xielu_cuda_obj is not None and input.is_cuda:
            if not is_torchdynamo_compiling():
                return self._xielu_cuda_fn(input)
            logger.warning_once("torch._dynamo is compiling, using Python version of xIELU.")
        return self._xielu_python(input)


ACT2CLS = {
    "gelu": GELUActivation,
    "gelu_10": (ClippedGELUActivation, {"min": -10, "max": 10}),
    "gelu_fast": FastGELUActivation,
    "gelu_new": NewGELUActivation,
    "gelu_python": (GELUActivation, {"use_gelu_python": True}),
    "gelu_pytorch_tanh": PytorchGELUTanh,
    "gelu_accurate": AccurateGELUActivation,
    "laplace": LaplaceActivation,
    "leaky_relu": nn.LeakyReLU,
    "linear": LinearActivation,
    "mish": MishActivation,
    "quick_gelu": QuickGELUActivation,
    "relu": nn.ReLU,
    "relu2": ReLUSquaredActivation,
    "relu6": nn.ReLU6,
    "sigmoid": nn.Sigmoid,
    "silu": nn.SiLU,
    "swish": nn.SiLU,
    "tanh": nn.Tanh,
    "prelu": nn.PReLU,
    "xielu": XIELUActivation,
}
ACT2FN = ClassInstantier(ACT2CLS)


def get_activation(activation_string):
    if activation_string in ACT2FN:
        return ACT2FN[activation_string]
    else:
        raise KeyError(f"function {activation_string} not found in ACT2FN mapping {list(ACT2FN.keys())}")


# Module-level shortcuts kept for backwards compatibility.
gelu_python = get_activation("gelu_python")
gelu_new = get_activation("gelu_new")
gelu = get_activation("gelu")
gelu_fast = get_activation("gelu_fast")
quick_gelu = get_activation("quick_gelu")
silu = get_activation("silu")
mish = get_activation("mish")
linear_act = get_activation("linear")
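

if __name__ == "__main__":
    # Minimal smoke test when the module is run directly, e.g. `python -m transformers.activations`
    # (illustrative only, not part of the library API): look an activation up by name and apply it.
    act = get_activation("gelu_new")
    x = torch.linspace(-2.0, 2.0, steps=5)
    print(type(act).__name__, act(x))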