
    h.                     F   d dl mZmZ d dlZd dlZd dlmZ ddlmZmZ ddl	m
Z
 ddlmZmZ ddlmZ dd	lmZmZ dd
lmZ ddlmZmZmZmZmZ ddlmZ  ej:                  e      Z G d de      Z  G d de      Z! G d de      Z" G d de      Z# G d de      Z$g dZ%y)    )OptionalUnionN)nn   )CacheDynamicCache)create_causal_mask)BaseModelOutputWithPastCausalLMOutputWithPast)Unpack)TransformersKwargslogging)deprecate_kwarg   )LlamaAttentionLlamaDecoderLayerLlamaForCausalLM
LlamaModelLlamaPreTrainedModel   )GraniteConfigc                   4     e Zd ZdZddedee   f fdZ xZS )GraniteAttentionz=Multi-headed attention from 'Attention Is All You Need' paperconfig	layer_idxc                 H    t         |   ||       |j                  | _        y N)super__init__attention_multiplierscalingselfr   r   	__class__s      i/var/www/html/eduruby.in/venv/lib/python3.12/site-packages/transformers/models/granite/modular_granite.pyr   zGraniteAttention.__init__,   s    +22    r   )	__name__
__module____qualname____doc__r   r   intr   __classcell__r$   s   @r%   r   r   )   s"    G3} 3# 3 3r&   r   c                       e Zd Zdedef fdZ eddd      	 	 	 	 	 	 	 ddej                  d	e	ej                     d
e	ej                     de	e   de	e   de	e   de	ej                     de	eej                  ej                  f      deej                  e	eej                  ej                  f      f   fd       Z xZS )GraniteDecoderLayerr   r   c                 l    t         |   ||       |j                  | _        t        ||      | _        y )N)r   r   )r   r   residual_multiplierr   	self_attnr"   s      r%   r   zGraniteDecoderLayer.__init__2   s.    +#)#=#= )9Mr&   past_key_valuepast_key_valuesz4.58)new_nameversionhidden_statesattention_maskposition_idsoutput_attentions	use_cachecache_positionposition_embeddingsreturnc	                    |}
| j                  |      } | j                  d||||||||d|	\  }}|
|| j                  z  z   }|}
| j                  |      }| j	                  |      }|
|| j                  z  z   }|f}|r||fz  }|S )a/  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*):
                attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1,
                query_sequence_length, key_sequence_length)` if default attention is used.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            past_key_values (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence
            position_embeddings (`tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
                Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
                with `head_dim` being the embedding dimension of each attention head.
            kwargs (`dict`, *optional*):
                Arbitrary kwargs to be ignored, used for FSDP and other methods that injects code
                into the model
        )r7   r8   r9   r4   r:   r;   r<   r=    )input_layernormr2   r1   post_attention_layernormmlp)r#   r7   r8   r9   r4   r:   r;   r<   r=   kwargsresidualself_attn_weightsoutputss                r%   forwardzGraniteDecoderLayer.forward7   s    F !,,]; ,:4>> 
,
')%+/) 3
,
 
,
(( !=43K3K#KK !55mD/ =43K3K#KK ")++Gr&   )NNNFFNN)r'   r(   r)   r   r+   r   r   torchTensorr   
LongTensorr   booltupleFloatTensorrH   r,   r-   s   @r%   r/   r/   1   s   N} N N
 %0A6R 2637+/,1$)59KO?||? !.? u//0	?
 "%? $D>? D>? !!1!12? &eELL%,,,F&GH? 
u  (51B1BEDUDU1U+V"WW	X? S?r&   r/   c                       e Zd Zy)GranitePreTrainedModelN)r'   r(   r)   r@   r&   r%   rP   rP   z   s    r&   rP   c                       e Zd Zdef fdZ	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     dee	   deej                     dee   d	ee   d
ee   deej                     dee   defdZ xZS )GraniteModelr   c           	          t         |   |       |j                  | _        t        j                  t        |j                        D cg c]  }t        ||       c}      | _        y c c}w r   )	r   r   embedding_multiplierr   
ModuleListrangenum_hidden_layersr/   layersr"   s      r%   r   zGraniteModel.__init__   sR     $*$?$?!mmEJ6KcKcEde	 3e
es   A(	input_idsr8   r9   r4   inputs_embedsr;   r:   output_hidden_statesr<   rD   r>   c
                    ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }|d u |d uz  rt	        d      | j
                  r%| j                  r|rt        j                  d       d}|| j                  |      }|| j                  z  }|r|t        | j                         }|	F||j                         nd}t        j                  |||j                  d   z   |j                         }	||	j#                  d      }t%        | j                   |||	||      }|}| j'                  ||      }|rd	nd }|rd	nd }| j(                  d | j                   j*                   D ],  }|r||fz  } ||f||||||	|d
|
}|d   }|s$||d   fz  }. | j-                  |      }|r||fz  }t/        ||r|nd ||      S )Nz:You must specify exactly one of input_ids or inputs_embedszX`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.F)r   r   r   )device)r   input_embedsr8   r<   r4   r9   r@   )r8   r9   r4   r:   r;   r<   r=   )last_hidden_stater4   r7   
attentions)r   r:   r[   r;   
ValueErrorgradient_checkpointingtrainingloggerwarning_onceembed_tokensrT   r   get_seq_lengthrI   arangeshaper]   	unsqueezer	   
rotary_embrX   rW   normr
   )r#   rY   r8   r9   r4   rZ   r;   r:   r[   r<   rD   past_seen_tokenscausal_maskr7   r=   all_hidden_statesall_self_attnsdecoder_layerlayer_outputss                      r%   rH   zGraniteModel.forward   sN    2C1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	-t";<YZZ&&4==Yj I  --i8M%(A(AA0*$++>O!CRC^==?de"\\ "2]5H5H5K"KTaThThN )33A6L(;;&))+%
 & #oom\J #7BD0d![[)H4;;+H+HI 	6M#!m%55!)
*) /"3#-$7
 
M *!,M =#3"55'	6* 		-0  -!11&+/8Od+%	
 	
r&   )	NNNNNNNNN)r'   r(   r)   r   r   r   rI   rK   rJ   r   rN   rL   r   r   r
   rH   r,   r-   s   @r%   rR   rR   ~   s    
} 
 151537+/59$(,0/359_
E,,-_
 !._
 u//0	_

 "%_
   1 12_
 D>_
 $D>_
 'tn_
 !!1!12_
 +,_
 
!_
r&   rR   c                   \   e Zd Z	 	 	 	 	 	 	 	 	 	 	 ddeej
                     deej                     deej
                     deeee	ej                     f      deej                     deej
                     dee   d	ee   d
ee   deej
                     deeej                  f   dee   defdZy)GraniteForCausalLMNrY   r8   r9   r4   rZ   labelsr;   r:   r[   r<   logits_to_keeprD   r>   c                    ||n| j                   j                  }|	|	n| j                   j                  }	 | j                  d||||||||	|
d	|}|j                  }t        |t              rt        | d       n|}| j                  |d d |d d f         }|| j                   j                  z  }d }|* | j                  d||| j                   j                  d|}t        |||j                  |j                  |j                        S )N)	rY   r8   r9   r4   rZ   r;   r:   r[   r<   )logitsru   
vocab_size)lossrx   r4   r7   r`   r@   )r   r:   r[   modelr_   
isinstancer+   slicelm_headlogits_scalingloss_functionry   r   r4   r7   r`   )r#   rY   r8   r9   r4   rZ   ru   r;   r:   r[   r<   rv   rD   rG   r7   slice_indicesrx   rz   s                     r%   rH   zGraniteForCausalLM.forward   s.    2C1N-TXT_T_TqTq$8$D $++JjJj 	
 ,64:: ,
)%+'/!5),
 ,
  118B>SV8W~ot4]kmA}a,?@A$++444%4%%pVFt{{OeOepiopD%#33!//))
 	
r&   )NNNNNNNNNNr   )r'   r(   r)   r   rI   rK   rJ   r   r   listrN   rL   r+   r   r   r   rH   r@   r&   r%   rt   rt      s$    151537KO59-1$(,0/359342
E,,-2
 !.2
 u//0	2

 "%tE4E4E/F(F"GH2
   1 122
 ))*2
 D>2
 $D>2
 'tn2
 !!1!122
 c5<</02
 +,2
 
 2
r&   rt   )rt   rR   rP   )&typingr   r   rI   torch.utils.checkpointr   cache_utilsr   r   masking_utilsr	   modeling_outputsr
   r   processing_utilsr   utilsr   r   utils.deprecationr   llama.modeling_llamar   r   r   r   r   configuration_graniter   
get_loggerr'   rd   r   r/   rP   rR   rt   __all__r@   r&   r%   <module>r      s     #    . / O & 0 0  1 
		H	%3~ 3F+ FR	1 	g
: g
T3
) 3
l Kr&   