
    h                        d dl mZmZ d dlZd dlmZ d dlmc mZ d dl	Zddl
mZ ddlmZ ddlmZ ddlmZ dd	lmZmZmZmZmZmZmZmZ d
dlmZ  ej:                  e      Z G d dej@                        Z! G d de      Z"ddZ# G d de      Z$ G d de      Z% G d de      Z& G d de      Z' G d de      Z(g dZ)y)    )CallableOptionalN   )Cache)ALL_ATTENTION_FUNCTIONS)logging)deprecate_kwarg   )LlamaAttentionLlamaDecoderLayerLlamaForCausalLMLlamaMLP
LlamaModelLlamaRotaryEmbeddingeager_attention_forwardrotate_half   )
OlmoConfigc                   d     e Zd ZdZdeddf fdZdej                  dej                  fdZ xZ	S )OlmoLayerNormz/LayerNorm but with no learnable weight or bias.hidden_sizereturnNc                 2    t         |           |f| _        y N)super__init__normalized_shape)selfr   	__class__s     c/var/www/html/eduruby.in/venv/lib/python3.12/site-packages/transformers/models/olmo/modular_olmo.pyr   zOlmoLayerNorm.__init__   s    !,    hidden_statesc                     |j                   }t        j                  |j                  t        j
                        | j                  d d d      j                  |      S )N)dtypegh㈵>)eps)r$   F
layer_normtotorchfloat32r   )r   r"   
orig_dtypes      r    forwardzOlmoLayerNorm.forward#   sO    "((
||M,,5==,A4CXCXZ^`djnorr
 	
r!   )
__name__
__module____qualname____doc__intr   r)   Tensorr,   __classcell__r   s   @r    r   r      s4    9/C /D /
U\\ 
ell 
r!   r   c                        e Zd Z fdZ xZS )OlmoMLPc                 J   t         |   |       t        j                  | j                  | j
                  d      | _        t        j                  | j                  | j
                  d      | _        t        j                  | j
                  | j                  d      | _        y )NF)bias)	r   r   nnLinearr   intermediate_size	gate_projup_proj	down_proj)r   configr   s     r    r   zOlmoMLP.__init__+   ss     4#3#3T5K5KRWXyy!1!143I3IPUV4#9#94;K;KRWXr!   )r-   r.   r/   r   r3   r4   s   @r    r6   r6   *   s    Y Yr!   r6   c                 
   | j                   |j                   }}|j                  |      }|j                  |      }| |z  t        |       |z  z   }||z  t        |      |z  z   }	|j                  |      |	j                  |      fS )a  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    )r$   	unsqueezer   r(   )
qkcossinposition_idsunsqueeze_dimq_typek_typeq_embedk_embeds
             r    apply_rotary_pos_embrL   2   s|    ( WWaggFF
--
&C
--
&C3w;q>C/0G3w;q>C/0G::fwzz&111r!   c                      e Zd Z eddd      	 	 ddej
                  deej
                  ej
                  f   deej
                     dee   d	eej                     d
eej
                  eej
                     f   fd       Z
y)OlmoAttentionpast_key_valuepast_key_valuesz4.58)new_nameversionNr"   position_embeddingsattention_maskcache_positionr   c                    |j                   d d }g |d| j                  }| j                  |      }	| j                  |      }
| j	                  |      }| j
                  j                  |	j                  | j
                  j                   | j
                  j                         |
j                  | j
                  j                   | j
                  j                         |j                  | j
                  j                   | j
                  j                         |	j                  |      j                  dd      }	|
j                  |      j                  dd      }
|j                  |      j                  dd      }|\  }}t        |	|
||      \  }	}
|'|||d}|j                  |
|| j                  |      \  }
}t        }| j
                  j                  dk7  rt        | j
                  j                     } || |	|
||f| j                   sdn| j"                  | j$                  d|\  }} |j&                  g |d j)                         }| j+                  |      }||fS )	N)minmaxr   r
   )rE   rD   rU   eagerg        )dropoutscaling)shapehead_dimq_projk_projv_projr?   clip_qkvclamp_view	transposerL   update	layer_idxr   _attn_implementationr   trainingattention_dropoutr\   reshape
contiguouso_proj)r   r"   rS   rT   rP   rU   kwargsinput_shapehidden_shapequery_states
key_statesvalue_statesrD   rE   cache_kwargsattention_interfaceattn_outputattn_weightss                     r    r,   zOlmoAttention.forwardO   sB    $))#2.88b8$--8{{=1[[/
{{=1;;+T[[%9%9$9t{{?S?ST4;;#7#7"7T[[=Q=QRT[[%9%9$9t{{?S?ST#((6@@AF__\2<<QB
#((6@@AF&S#7jRUWZ#[ j&#&snUL'6'='=j,X\XfXfht'u$J(?;;++w6"9$++:Z:Z"[$7	%
  $}}C$2H2HLL	%
 	%
!\ *k));;;;FFHkk+.L((r!   )NN)r-   r.   r/   r	   r)   r2   tupler   r   
LongTensorr,    r!   r    rN   rN   N   s    %0A6R ,0592)||2) #5<<#=>2) !.	2)
 "%2) !!1!122) 
u||Xell33	42) S2)r!   rN   c                   (     e Zd Zdedef fdZ xZS )OlmoDecoderLayerr?   rg   c                     t         |   ||       t        |j                        | _        t        |j                        | _        t        ||      | _        y )N)r?   rg   )r   r   r   r   input_layernormpost_attention_layernormrN   	self_attnr   r?   rg   r   s      r    r   zOlmoDecoderLayer.__init__   sF    +,V-?-?@(5f6H6H(I%&f	Jr!   )r-   r.   r/   r   r1   r   r3   r4   s   @r    r|   r|      s    Kz Kc K Kr!   r|   c                       e Zd Zd Zy)OlmoRotaryEmbeddingc                    | j                   d d d d f   j                         j                  |j                  d   dd      j	                  |j
                        }|d d d d d f   j                         }t        |j
                  j                  t              r/|j
                  j                  dk7  r|j
                  j                  nd}t        j                  |d      5  |j                         |j                         z  j                  dd      }t        j                  ||fd	      }|j                         | j                  z  }|j                         | j                  z  }	||	fcd d d        S # 1 sw Y   y xY w)
Nr   rW   r   mpscpuF)device_typeenabledr
   )dim)inv_freqfloatexpandr]   r(   device
isinstancetypestrr)   autocastre   catrD   attention_scalingrE   )
r   xrF   inv_freq_expandedposition_ids_expandedr   freqsembrD   rE   s
             r    r,   zOlmoRotaryEmbedding.forward   s0    MM$4-8>>@GGHZHZ[\H]_acdehhijiqiqr ,QaZ 8 > > @'1!((--'E!((--[`J`ahhmmfk^^UC 	&,,.1F1L1L1NNYYZ[]^_E))UEN3C'')d444C'')d444C8	 	 	s    BE22E;N)r-   r.   r/   r,   rz   r!   r    r   r      s    
r!   r   c                   $     e Zd Zdef fdZ xZS )	OlmoModelr?   c           	          t         |   |       t        j                  t	        |j
                        D cg c]  }t        ||       c}      | _        t        |j                        | _
        y c c}w r   )r   r   r9   
ModuleListrangenum_hidden_layersr|   layersr   r   normr   s      r    r   zOlmoModel.__init__   s[     mmBGH`H`BabYfi0b
 "&"4"45	 cs   A1)r-   r.   r/   r   r   r3   r4   s   @r    r   r      s    6z 6 6r!   r   c                       e Zd Zy)OlmoForCausalLMN)r-   r.   r/   rz   r!   r    r   r      s    r!   r   )r   r   OlmoPreTrainedModel)Nr   )*typingr   r   r)   torch.nnr9   torch.nn.functional
functionalr&   torch.utils.checkpointcache_utilsr   modeling_utilsr   utilsr   utils.deprecationr	   llama.modeling_llamar   r   r   r   r   r   r   r   configuration_olmor   
get_loggerr-   loggerModuler   r6   rL   rN   r|   r   r   r   __all__rz   r!   r    <module>r      s    %        5  0	 	 	 + 
		H	%
BII 
Yh Y284)N 4)nK( K. 6
 6	& 	r!   