
    hpA                        d dl mZmZ d dlZd dlmZ ddlmZ ddlmZm	Z	 ddl
mZ ddlmZmZmZ dd	lmZ d
dlmZ d
dlmZmZmZ d
dlmZmZmZmZmZmZmZ ddl m!Z!  ejD                  e#      Z$ G d de      Z% G d de      Z& G d de      Z' G d de      Z( G d de      Z) G d de      Z* G d de      Z+ G d de      Z,g d Z-y)!    )OptionalUnionN)nn   )Cache)BaseModelOutputWithPastMoeModelOutputWithPast)Unpack)auto_docstringcan_return_tuplelogging)deprecate_kwarg   )BambaConfig)
BambaMixerBambaRMSNormGated HybridMambaAttentionDynamicCache)GraniteFlashAttentionKwargsGraniteMoeSharedAttentionGraniteMoeSharedDecoderLayerGraniteMoeSharedForCausalLMGraniteMoeSharedMLPGraniteMoeSharedModelGraniteMoeSharedPreTrainedModel   )GraniteMoeHybridConfigc                   (     e Zd Zdedef fdZ xZS )GraniteMoeHybridAttentionconfig	layer_idxc                 &    t         |   ||       y Nsuper__init__selfr   r    	__class__s      {/var/www/html/eduruby.in/venv/lib/python3.12/site-packages/transformers/models/granitemoehybrid/modular_granitemoehybrid.pyr%   z"GraniteMoeHybridAttention.__init__,   s    +    __name__
__module____qualname__r   intr%   __classcell__r(   s   @r)   r   r   +   s    ,5 ,# , ,r*   r   c                   (     e Zd Zdedef fdZ xZS )GraniteMoeHybridMambaLayerr   r    c                 8    t         |   t        |      |       y r"   )r$   r%   r   r&   s      r)   r%   z#GraniteMoeHybridMambaLayer.__init__1   s    V,i8r*   r+   r1   s   @r)   r3   r3   0   s    95 9# 9 9r*   r3   c                         e Zd Zd fd	Z xZS )GraniteMoeHybridRMSNormGatedc                 &    t         |   ||       y r"   r#   )r'   hidden_sizeepsr(   s      r)   r%   z%GraniteMoeHybridRMSNormGated.__init__6   s    c*r*   )gư>)r,   r-   r.   r%   r0   r1   s   @r)   r6   r6   5   s    + +r*   r6   c                   $     e Zd Zdef fdZ xZS )GraniteMoeHybridMLPr   c                 $    t         |   |       y r"   r#   r'   r   r(   s     r)   r%   zGraniteMoeHybridMLP.__init__;   s     r*   )r,   r-   r.   r   r%   r0   r1   s   @r)   r;   r;   :   s    !5 ! !r*   r;   c                   x    e Zd Zdedef fdZ eddd      	 	 	 	 	 	 	 ddej                  d	e	ej                     de	e
   d
e	e   de	e   de	ej                     de	e   de	eej                  ej                  f      dee   deej                   e	eej                   ej                   f      f   fd       Z xZS )GraniteMoeHybridDecoderLayerr   r    c                    t         |   ||       t        |      | _        d | _        d | _        |j                  |   dk(  rt        ||      | _        nt        ||      | _        |j                  |   | _	        t        |dd      dkD  | _        y )Nmambanum_local_expertsr   )r$   r%   r;   
shared_mlp	self_attnrA   layers_block_typer3   r   
layer_typegetattrhas_expertsr&   s      r)   r%   z%GraniteMoeHybridDecoderLayer.__init__@   s    +-f5
##I.'93FIFDJ6vyIDN 229= #6+>BQFr*   past_key_valuepast_key_valuesz4.58)new_nameversionhidden_statesattention_maskoutput_attentions	use_cachecache_positionoutput_router_logitsposition_embeddingskwargsreturnc	                    |}
| j                  |      }| j                   | j                  d||||d|	}d}n | j                  d|||||||d|	\  }}|
|| j                  z  z   }|}
| j	                  |      }| j
                  r)| j                  |      \  }}|| j                  |      z   }n| j                  |      }d}|
|| j                  z  z   }|f}|r||fz  }|r||fz  }|S )aC  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*):
                attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1,
                query_sequence_length, key_sequence_length)` if default attention is used.
            past_key_values (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence
            output_router_logits (`bool`, *optional*):
                Whether or not to return the logits of all the routers. They are useful for computing the router loss, and
                should not be returned during inference.
            position_embeddings (`tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
                Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
                with `head_dim` being the embedding dimension of each attention head.
            kwargs (`dict`, *optional*):
                Arbitrary kwargs.Can be used to provide `GraniteFlashAttentionKwargs` for
                padding-free training and/or improve torch.compile performance.
        N)rM   rQ   cache_paramsrN   )rM   rN   rJ   rO   rP   rQ   rS    )input_layernormrA   rD   residual_multiplierpost_attention_layernormrH   block_sparse_moerC   )r'   rM   rN   rJ   rO   rP   rQ   rR   rS   rT   residualself_attn_weightsmoe_hidden_statesrouter_logitsoutputss                  r)   forwardz$GraniteMoeHybridDecoderLayer.forwardP   sG   L !,,];::!&DJJ +-,-	
 M !%/=t~~ 	0+- /"3#-$7	0 	0,M, !=43K3K#KK !55mD/3/D/D]/S,}-0NNM OOM:M M =43K3K#KK ")++G''Gr*   )NNFFNFN)r,   r-   r.   r   r/   r%   r   torchTensorr   r   bool
LongTensortupler
   r   FloatTensorrb   r0   r1   s   @r)   r?   r?   ?   s/   G5 G# G  %0A6R 26+/,1$)59/4KOU||U !.U "%	U
 $D>U D>U !!1!12U 'tnU &eELL%,,,F&GHU 45U 
u  (51B1BEDUDU1U+V"WW	XU SUr*   r?   c                   4     e Zd ZU eed<   dgZdZ fdZ xZS )GraniteMoeHybridPreTrainedModelr   r?   Tc                    t         |   |       t        |t              r|j                  j
                  j                  d       t        j                  t        j                  d|j                  dz               |j                  _        |j                  j
                  j                  d       y t        |t              r&|j                  j
                  j                  d       y y )Ng      ?r   )r$   _init_weights
isinstancer3   dt_biasdatafill_rc   logarange	num_headsA_logDr6   weight)r'   moduler(   s     r)   rl   z-GraniteMoeHybridPreTrainedModel._init_weights   s    f%f89NN%%c* %		%,,q&:J:JQ:N*O PFLLHHMM$ <=MM$$S) >r*   )	r,   r-   r.   r   __annotations___no_split_modules_is_statefulrl   r0   r1   s   @r)   rj   rj      s!    ""78L* *r*   rj   c                   f    e Zd Zdef fdZee	 	 	 	 	 	 	 	 	 	 	 ddej                  de	ej                     de	ej                     de	eeeej                     f      de	ej                     de	e   d	e	e   d
e	e   de	e   de	e   de	ej                     dee   deeef   fd              Zd Z xZS )GraniteMoeHybridModelr   c           	          t         |   |       t        j                  t	        |j
                        D cg c]  }t        ||       c}      | _        y c c}w r"   )r$   r%   r   
ModuleListrangenum_hidden_layersr?   layersr&   s      r)   r%   zGraniteMoeHybridModel.__init__   sE     mmNSTZTlTlNmn)&)<n
ns   A	input_idsrN   position_idsrJ   inputs_embedsrP   rO   output_hidden_statesrR   return_dictrQ   rT   rU   c                    ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }|
|
n| j                   j                  }
|d u |d uz  rt        d      | j                  r%| j                  r|rt        j                  d       d}|| j                  |      }|| j                  z  }|r|t        j                  d       |F||j                         nd}t        j                  |||j                  d   z   |j                         }||j#                  d      }| j%                  |||||      }| j'                  ||      }|}d }| j(                  | j)                  ||      }|rdnd }|rdnd }|	rdnd }| j*                  D ]U  }|j,                  d	k(  r|n|}|r||fz  } ||f||||||	|d
|}|d   }|r|d   	||d   fz  }|	sG|d   M||d   fz  }W | j/                  |      }|r||fz  }|r|j0                  sd|_        t3        |||||      S )Nz:You must specify exactly one of input_ids or inputs_embedszX`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.FzGraniteMoeHybrid requires an initialized `HybridMambaAttentionDynamicCache` to return a cache. Because one was not provided, no cache will be returned.r   r   devicerX   rA   )rN   rJ   rO   rP   rQ   rR   rS   T)last_hidden_staterJ   rM   
attentionsr`   )r   rO   r   rP   use_return_dict
ValueErrorgradient_checkpointingtrainingloggerwarning_onceembed_tokensembedding_multiplierget_seq_lengthrc   rr   shaper   	unsqueeze_update_causal_mask_update_mamba_mask
rotary_embr   rF   normhas_previous_stater	   )r'   r   rN   r   rJ   r   rP   rO   r   rR   r   rQ   rT   past_seen_tokenscausal_mask
mamba_maskrM   rS   all_hidden_statesall_self_attnsall_router_logitsdecoder_layer
layer_masklayer_outputss                           r)   rb   zGraniteMoeHybridModel.forward   s   " 2C1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	%0%<k$++B]B]-t";<YZZ&&4==Yj I  --i8M%(A(AA 0K
 !CRC^==?de"\\ "2]5H5H5K"KTaThThN )33A6L..M>?L]
 ,,^^L
 &"??&"&//-"N #7BD0d"6BD![[ 	>M'4'?'?7'JP[J#!m%55!)
) /"3#-%9$7
 
M *!,M  #/"}Q'7&99N# $0%-*;)==%;	>> 		-0  -!11?#E#E15O.%+++%+
 	
r*   c                 R    |}|d   dkD  s|t        j                  |dk(        rd}|S )zv
        No need for zeroing states when
            1. Cached forward
            2. Attending to all inputs
        r   Nr   )rc   all)r'   rN   rQ   r   s       r)   r   z(GraniteMoeHybridModel._update_mamba_mask6  s7     $
!q ^%?EIIn`aNaDbJr*   )NNNNNNNNNNN)r,   r-   r.   r   r%   r   r   rc   rf   r   rd   r   r   listrh   re   r
   r   rg   r   rb   r   r0   r1   s   @r)   r|   r|      sS   
5 
  '+1537KO59$(,0/3/3&*59s
##s
 !.s
 u//0	s

 "%tE4E4E/F(F"GHs
   1 12s
 D>s
 $D>s
 'tns
 'tns
 d^s
 !!1!12s
 45s
 
u--	.s
  s
j	r*   r|   c                   >     e Zd ZdgZdef fdZ	 	 	 	 	 	 ddZ xZS )GraniteMoeHybridForCausalLMzlm_head.weightr   c                 d    t         |   |       t        |      | _        | j	                          y r"   )r$   r%   r|   model	post_initr=   s     r)   r%   z$GraniteMoeHybridForCausalLM.__init__E  s&     *62
r*   c                 N   |d u }	|	sZ||d   |j                   d   k\  r|d d |j                   d    d f   }ne|j                   d   |j                   d   k7  rF|d d |f   }n<|r:t        | j                  |j                   d   | j                  | j                        }|T|R|j                         j                  d      dz
  }|j                  |dk(  d       |	s|d d |j                   d    d f   }||	rd|i}
nd|j                         i}
|
j                  |||||d       |
S )Nr   r   r   r   r   r   )r   rJ   rP   rN   rQ   )
r   r   r   dtyper   longcumsummasked_fill_
contiguousupdate)r'   r   rJ   rN   r   rQ   r   rP   rT   empty_past_kvmodel_inputss              r)   prepare_inputs_for_generationz9GraniteMoeHybridForCausalLM.prepare_inputs_for_generationK  sW    (4/ )!"%);;%a.*>*>q*A)A)C&CD	#~';';A'>>%a&78	>Y__Q/DKKO %,*>)..077;a?L%%n&91= +A	0B/B/D,DE $+];L')=)=)?@L ,#2&"0"0	
 r*   )NNNNNT)r,   r-   r.   _tied_weights_keysr   r%   r   r0   r1   s   @r)   r   r   B  s2    *+5  7r*   r   )r   r|   rj   ).typingr   r   rc   r   cache_utilsr   modeling_outputsr   r	   processing_utilsr
   utilsr   r   r   utils.deprecationr   bamba.configuration_bambar   bamba.modeling_bambar   r   r   *granitemoeshared.modeling_granitemoesharedr   r   r   r   r   r   r   configuration_granitemoehybridr   
get_loggerr,   r   r   r3   r6   r;   r?   rj   r|   r   __all__rX   r*   r)   <module>r      s     #     O & > > 0 3 b b   C 
		H	%, 9 ,
9 9
+#4 +
!- !
g#? gT*&E *G1 GT@"= @F fr*   