
    h                        d dl mZmZmZ d dlZd dlmc mZ d dlmZ d dl	m
Z
 ddlmZ ddlmZmZ ddlmZ dd	lmZ dd
lmZmZ ddlmZ ddlmZmZmZmZ ddlmZm Z  ddl!m"Z"m#Z# ddl$m%Z%m&Z& ddl'm(Z( ddl)m*Z*m+Z+m,Z, ddl-m.Z. ddl/m0Z0 ddl1m2Z2  ed       G d dejf                               Z4 G d de      Z5 G d dejf                        Z6d Z7dFdZ8dejr                  d e:d!ejr                  fd"Z;	 dGd#ejf                  d$ejr                  d%ejr                  d&ejr                  d'eejr                     d(e<d)e<d*e(e*   fd+Z= G d, d-ejf                        Z> G d. d/ejf                        Z? G d0 d1ejf                        Z@ G d2 d3e      ZAe+ G d4 d5e&             ZB G d6 d7ejf                        ZCe+ G d8 d9eB             ZD	 	 	 dHd:eejr                  eEejr                     df   d;ee:   d'eejr                     d!eejr                  e:f   fd<ZFe+ G d= d>eBe             ZG G d? d@eeB      ZH G dA dBeeB      ZI G dC dDeeB      ZJg dEZKy)I    )CallableOptionalUnionN)nn)check_model_inputs   )ACT2FN)CacheDynamicCache)GenerationMixin)use_kernel_forward_from_hub)create_causal_mask!create_sliding_window_causal_mask)FlashAttentionKwargs)GenericForQuestionAnswering GenericForSequenceClassificationGenericForTokenClassificationGradientCheckpointingLayer)MoeCausalLMOutputWithPastMoeModelOutputWithPast)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tuple)deprecate_kwarg)OutputRecorder   )MiniMaxConfigRMSNormc                   ,     e Zd Zd fd	Zd Zd Z xZS )MiniMaxRMSNormc                     t         |           t        j                  t	        j
                  |            | _        || _        y)z=
        MiniMaxRMSNorm is equivalent to T5LayerNorm
        N)super__init__r   	Parametertorchonesweightvariance_epsilon)selfhidden_sizeeps	__class__s      j/var/www/html/eduruby.in/venv/lib/python3.12/site-packages/transformers/models/minimax/modeling_minimax.pyr(   zMiniMaxRMSNorm.__init__7   s1     	ll5::k#:; #    c                 "   |j                   }|j                  t        j                        }|j	                  d      j                  dd      }|t        j                  || j                  z         z  }| j                  |j                  |      z  S )N   T)keepdim)	dtypetor*   float32powmeanrsqrtr-   r,   )r.   hidden_statesinput_dtypevariances       r2   forwardzMiniMaxRMSNorm.forward?   sy    #))%((7 $$Q',,R,>%Ht?T?T4T(UU{{]--k:::r3   c                 ^    t        | j                  j                         d| j                   S )Nz, eps=)tupler,   shaper-   )r.   s    r2   
extra_reprzMiniMaxRMSNorm.extra_reprF   s*    ))*+6$2G2G1HIIr3   )gư>)__name__
__module____qualname__r(   rA   rE   __classcell__r1   s   @r2   r%   r%   5   s    $;Jr3   r%   c                        e Zd Z fdZd ZdefdZ fdZdef fdZd Z	defd	Z
d
ej                  fdZdefdZ xZS )MiniMaxCachec                 0    t         |           g | _        y N)r'   r(   linear_cacher.   r1   s    r2   r(   zMiniMaxCache.__init__K   s    02r3   c                     t        t        | j                        |dz         D ]  }| j                  j                  g         || j                  |<   y Nr!   )rangelenrO   append)r.   	layer_idxrO   _s       r2   set_linear_cachezMiniMaxCache.set_linear_cacheO   sK    s4,,-y1}= 	)A$$R(	)'3)$r3   rV   c                 >    |t        |       k  r| j                  |   S y rN   )rT   rO   r.   rV   s     r2   get_linear_cachezMiniMaxCache.get_linear_cacheU   s"    s4y $$Y//r3   c                 Z    t        t        | 	         t        | j                              S rN   )maxr'   __len__rT   rO   rP   s    r2   r^   zMiniMaxCache.__len__Z   s"    57?$c$*;*;&<==r3   c                     |t        | j                        k  r"| j                  |   g k7  r| j                  |   fS t        |   |      S rN   )rT   rO   r'   __getitem__)r.   rV   r1   s     r2   r`   zMiniMaxCache.__getitem__]   sM    s4,,--$2C2CI2NRT2T%%i022w"9--r3   c              #   L   K   t        t        |             D ]	  }| |     y wrN   )rS   rT   rZ   s     r2   __iter__zMiniMaxCache.__iter__b   s(     s4y) 	"Iy/!	"s   "$repeatsc                     t        t        |             D ]`  }| j                  |   g k7  r.| j                  |   j                  |d      | j                  |<   C| j                  |   j                  |       b y )Nr   dim)rS   rT   rO   repeat_interleavelayersbatch_repeat_interleave)r.   rc   rV   s      r2   ri   z$MiniMaxCache.batch_repeat_interleavef   ss    s4y) 	HI  +r1/3/@/@/K/]/]^ekl/]/m!!),I&>>wG		Hr3   indicesc                     t        t        |             D ]T  }| j                  |   g k7  r"| j                  |   |df   | j                  |<   7| j                  |   j	                  |       V y )N.)rS   rT   rO   rh   batch_select_indices)r.   rj   rV   s      r2   rl   z!MiniMaxCache.batch_select_indicesm   sk    s4y) 	EI  +r1/3/@/@/KGUXL/Y!!),I&;;GD		Er3   
max_lengthc                     t        d      )Nz*MiniMaxCache doesnot support `crop` method)RuntimeError)r.   rm   s     r2   cropzMiniMaxCache.cropt   s    GHHr3   )rF   rG   rH   r(   rX   intr[   r^   r`   rb   ri   r*   Tensorrl   rp   rI   rJ   s   @r2   rL   rL   J   s]    34# 
>.S .
"Hs HEELL EIs Ir3   rL   c                   Z    e Zd Zdedef fdZd Zd Z eddd	      	 	 dd
e	j                  dee	j                  e	j                  f   dee	j                     dee   dee	j                     dee   dee	j                  ee	j                     eee	j                        f   fd       Z xZS )MiniMaxLightningAttentionconfigrV   c                    t         |           || _        t        |dd       xs |j                  |j
                  z  | _        |j
                  | _        |j                  | _        |j                  | _        t        |j                     | _        t        | j                  | j
                  z        | _        t        j                  |j                  | j
                  | j                  z  dz  d      | _        t        j                  | j
                  | j                  z  |j                  d      | _        t        j                  |j                  | j
                  | j                  z  d      | _        | j'                         }| j)                  |      \  }}}| j+                  d|       | j+                  d|       | j+                  d|       | j+                  d|       y )	Nhead_dimr   Fbias
slope_ratequery_decay	key_decaydiagonal_decay)r'   r(   rV   getattrr/   num_attention_headsrw   num_hidden_layers
block_sizer	   
hidden_actact_fnr%   normr   Linearqkv_projout_projoutput_gateget_slope_ratedecay_factorsregister_buffer)r.   ru   rV   rz   r{   r|   r}   r1   s          r2   r(   z"MiniMaxLightningAttention.__init__y   s   "
D9mV=O=OSYSmSm=m#)#=#= !'!9!9 ++V../"4==43K3K#KL			&"4"4d6N6NQUQ^Q^6^ab6bino		$":":T]]"JFL^L^ejk99V%7%79Q9QTXTaTa9ahmn((*
151C1CJ1O.Y\:6]K8[)4-~>r3   c                     ddd| j                   z  z  z  }t        j                  | j                         dz   }d| j                  | j                  dz
  dz   z  z
  dz   }||z  }||z  }|d d d d f   }|S )Nr!   r5      gh㈵>)r   r*   arangerV   r   )r.   baseexponentfactorrates        r2   r   z(MiniMaxLightningAttention.get_slope_rate   s    A!d66678<< 8 89A=T^^t'='='AD'HIIDPX~f}AtTM"r3   c                    t        j                  | j                        dz   }t        j                  | |d d d f   z        }t        j                  | | j                  |d d d f   z
  z        }|d d d f   |d d d f   z
  }|d d d d d d f   }||z  }t        j                  |dk\  | t        d            }t        j                  |      }|||fS )Nr!   r   z-inf)r*   r   r   expwherefloat)r.   rz   block_size_ranger{   r|   r}   s         r2   r   z'MiniMaxLightningAttention.decay_factors   s     <<81<ii.>q$w.G GHIIzkT__?OPQSWPW?X-XYZ	)!T'25EdAg5NN'dAq(89#n4^q%8>/5QW=Y>2I~55r3   past_key_valuepast_key_values4.58new_nameversionr>   position_embeddingsattention_maskcache_positionkwargsreturnc                    |j                   \  }}}	|| j                  z   dz
  | j                  z  }
| j                  | j                  |            }|j	                  ||| j
                  d| j                  z        }t        j                  || j                  d      \  }}}|j                  dd      }|j                  dd      }|j                  dd      }d }||j                  | j                        }|t        j                  || j
                  | j                  | j                        j                  |      }|Q|j                  t        j                        }|j                  |j!                  d      j!                  d       d      }g }t#        |
      D ]b  }|| j                  z  }t%        || j                  z   |      }||z
  }|d d d d ||f   }|d d d d ||f   }|d d d d ||f   }| j&                  d d d |f   }| j(                  d d | d f   }| j*                  d d d d d |d |f   }t        j,                  | j.                   |z        }t        j0                  ||j                  dd            }t        j0                  ||z  |      }t        j0                  ||z  |      }||z   }|j3                  |       t        j0                  ||z  j                  dd      |      } ||z  | z   }e nt        j,                  | j.                         }!g }t#        |      D ]  }|d d d d ||dz   f   }|d d d d ||dz   f   }|d d d d ||dz   f   }t        j0                  |j                  dd      |      }"|!|z  |"z   }t        j0                  ||      }|j3                  |        t        j4                  |d      }|j                  dd      }|j	                  ||| j
                  | j                  z        }| j7                  |      }t9        j:                  | j=                  |            |z  }| j?                  |      }||jA                  | j                  |       ||fS )	Nr!   r   re   r5   r8   r6   r   )!rD   r   r   r   reshaper   rw   r*   split	transposer[   rV   zerosr9   boolmasked_fill	unsqueezerS   minr{   r|   r}   r   rz   matmulrU   catr   Fsigmoidr   r   rX   )#r.   r>   r   r   r   r   r   
batch_sizeseq_lenr/   
num_blocks
qkv_statesquery_states
key_statesvalue_statesattn_weights_interattn_outputi	start_idxend_idxcurrent_block_sizecurrent_query_statescurrent_key_statescurrent_value_statescurrent_query_decaycurrent_key_decaycurrent_diagonal_decayblock_decayattn_weights_intraattn_output_intraattn_output_intercurrent_attn_outputnext_attn_weights_interratiocurrent_attn_weights_inters#                                      r2   rA   z!MiniMaxLightningAttention.forward   s    ,9+>+>(
G[/!3G
[[}!=>
''
GT=U=UWX[_[h[hWhi
16Z\]1^.j,#--a3))!Q/
#--a3 "&!0!A!A$..!Q%!&Z9Q9QSWS`S`bfbobo!p!s!s"
 )!/!2!2!2!D+779Q9QRS9T9^9^_a9b8bdefK:& `/	i$//97C%,y%8"'3Aq)G:K4K'L$%/1i6G0G%H"'3Aq)G:K4K'L$&*&6&6q:M;M:M7M&N#$(NN17I6I6J3J$K!)-)<)<QCVDVCVXkYkXk=k)l&#ii(8;M(MN &+\\2FHZHdHdegikHl%m"$)LL1CF\1\^r$s! %*LL1EH[1[]o$p! '8:K&K#""#67 +0,,'*;;FFr2NPd+' &8+%EH_%_";`@ IIt./EK7^ 	8'3Aq!a!e)O'D$%/1a!a%i%@"'3Aq!a!e)O'D$-2\\:L:V:VWY[]:^`t-u*%*-?%?B\%\"&+ll3GI[&\#""#67	8 ii4 "++Aq1!))*gt?W?WZ^ZgZg?ghii,ii 0 0 ?@;NmmK0 &,,T^^=OP...r3   NN)rF   rG   rH   r"   rq   r(   r   r   r   r*   rr   rC   r   r
   
LongTensorr   r   rA   rI   rJ   s   @r2   rt   rt   x   s    ?} ? ?,	6 %0A6R ,059`/||`/ #5<<#=>`/ !.	`/
 "%`/ !!1!12`/ -.`/ 
u||Xell3XeELL>Q5RR	S`/ S`/r3   rt   c                     | dd| j                   d   dz  f   }| d| j                   d   dz  df   }t        j                  | |fd      S )z*Rotates half the hidden dims of the input..Nr6   r5   re   )rD   r*   r   )xx1x2s      r2   rotate_halfr     sZ    	
3"!''"+"""	#B	
3q ""	#B99rc2YB''r3   c                     |j                  |      }|j                  |      }| |z  t        |       |z  z   }||z  t        |      |z  z   }||fS )a  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    )r   r   )qkcossinposition_idsunsqueeze_dimq_embedk_embeds           r2   apply_rotary_pos_embr     sY    ( --
&C
--
&C3w;q>C/0G3w;q>C/0GGr3   r>   n_repr   c                     | j                   \  }}}}|dk(  r| S | dddddddddf   j                  |||||      } | j                  |||z  ||      S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    r!   N)rD   expandr   )r>   r   batchnum_key_value_headsslenrw   s         r2   	repeat_kvr   .  so    
 2?1D1D.Ehz!!Qa"23::5BUW\^bdlmM  (;e(CT8TTr3   modulequerykeyvaluer   scalingdropoutr   c                 T   t        || j                        }t        || j                        }	t        j                  ||j	                  dd            |z  }
|#|d d d d d d d |j
                  d   f   }|
|z   }
t        j                  j                  |
dt        j                        j                  |j                        }
t        j                  j                  |
|| j                        }
t        j                  |
|	      }|j	                  dd      j                         }||
fS )Nr5   r   r   r6   rf   r8   )ptrainingr!   )r   num_key_value_groupsr*   r   r   rD   r   
functionalsoftmaxr:   r9   r8   r   r   
contiguous)r   r   r   r   r   r   r   r   r   r   attn_weightscausal_maskr   s                r2   eager_attention_forwardr   :  s    3 ; ;<JUF$?$?@L<<z';';Aq'ABWLL!$Q1.D
0@0@0D.D%DE#k1==((2U]](SVVW\WbWbcL==((6??([L,,|\:K''1-88:K$$r3   c                   0    e Zd ZdZdedef fdZ eddd      	 	 dd	ej                  d
e
ej                  ej                  f   deej                     dee   deej                     dee   de
ej                  eej                     f   fd       Z xZS )MiniMaxAttentionz=Multi-headed attention from 'Attention Is All You Need' paperru   rV   c                    t         |           || _        || _        t	        |dd       xs |j
                  |j                  z  | _        |j                  |j                  z  | _	        | j                  dz  | _
        |j                  | _        d| _        t        j                  |j
                  |j                  | j                  z  d      | _        t        j                  |j
                  |j                  | j                  z  d      | _        t        j                  |j
                  |j                  | j                  z  d      | _        t        j                  |j                  | j                  z  |j
                  d      | _        y )Nrw   g      TFrx   )r'   r(   ru   rV   r~   r/   r   rw   r   r   r   attention_dropout	is_causalr   r   q_projk_projv_projo_projr.   ru   rV   r1   s      r2   r(   zMiniMaxAttention.__init__W  s2   "
D9mV=O=OSYSmSm=m$*$>$>&B\B\$\!}}d*!'!9!9ii 2 2F4N4NQUQ^Q^4^ejkii 2 2F4N4NQUQ^Q^4^ejkii 2 2F4N4NQUQ^Q^4^ejkii : :T]] JFL^L^ejkr3   r   r   r   r   r>   r   r   r   r   r   c           
      `   |j                   d d }g |d| j                  }| j                  |      j                  |      j	                  dd      }	| j                  |      j                  |      j	                  dd      }
| j                  |      j                  |      j	                  dd      }|\  }}t        |	|
||      \  }	}
|'|||d}|j                  |
|| j                  |      \  }
}t        }| j                  j                  dk7  rt        | j                  j                     } || |	|
||f| j                  sdn| j                  | j                   t#        | j                  dd       d|\  }} |j$                  g |d j'                         }| j)                  |      }||fS )	Nr6   r!   r5   )r   r   r   eager        sliding_window)r   r   r  )rD   rw   r   viewr   r   r   r   updaterV   r   ru   _attn_implementationr   r   r   r   r~   r   r   r   )r.   r>   r   r   r   r   r   input_shapehidden_shaper   r   r   r   r   cache_kwargsattention_interfacer   r   s                     r2   rA   zMiniMaxAttention.forwarde  s    $))#2.88b8$--8{{=166|DNNqRST[[/44\BLLQPQR
{{=166|DNNqRST&S#7jRUWZ#[ j&#&snUL'6'='=j,X\XfXfht'u$J(?;;++w6"9$++:Z:Z"[$7
%
  $}}C$2H2HLL"4;;0@$G
%
 
%
!\ *k));;;;FFHkk+.L((r3   r   )rF   rG   rH   __doc__r"   rq   r(   r   r*   rr   rC   r   r
   r   r   r   rA   rI   rJ   s   @r2   r   r   T  s    Gl} l l %0A6R ,059*)||*) #5<<#=>*) !.	*)
 "%*) !!1!12*) -.*) 
u||Xell33	4*) S*)r3   r   c                   *     e Zd Zdef fdZd Z xZS )MiniMaxBlockSparseTop2MLPru   c                    t         |           |j                  | _        |j                  | _        t        j                  | j
                  | j                  d      | _        t        j                  | j                  | j
                  d      | _	        t        j                  | j
                  | j                  d      | _
        t        |j                     | _        y NFrx   )r'   r(   intermediate_sizeffn_dimr/   
hidden_dimr   r   w1w2w3r	   r   r   r.   ru   r1   s     r2   r(   z"MiniMaxBlockSparseTop2MLP.__init__  s    // ,,))DOOT\\F))DLL$//F))DOOT\\FV../r3   c                     | j                  | j                  |            | j                  |      z  }| j                  |      }|S rN   )r   r  r  r  )r.   r>   current_hidden_statess      r2   rA   z!MiniMaxBlockSparseTop2MLP.forward  s>     $DGGM,B CdggmF\ \ $(= >$$r3   )rF   rG   rH   r"   r(   rA   rI   rJ   s   @r2   r  r    s    	0} 	0%r3   r  c                   Z     e Zd ZdZ fdZdej                  dej                  fdZ xZS )MiniMaxSparseMoeBlocka  
    This implementation is
    strictly equivalent to standard MoE with full capacity (no
    dropped tokens). It's faster since it formulates MoE operations
    in terms of block-sparse operations to accommodate imbalanced
    assignments of tokens to experts, whereas standard MoE either
    (1) drop tokens at the cost of reduced performance or (2) set
    capacity factor to number of experts and thus waste computation
    and memory on padding.
    c                    t         |           |j                  | _        |j                  | _        |j                  | _        |j                  | _	        t        j                  | j                  | j                  d      | _        t        j                  t        | j                        D cg c]  }t        |       c}      | _        |j"                  | _        y c c}w r  )r'   r(   r/   r  r  r  num_local_expertsnum_expertsnum_experts_per_toktop_kr   r   gate
ModuleListrS   r  expertsrouter_jitter_noisejitter_noise)r.   ru   rW   r1   s      r2   r(   zMiniMaxSparseMoeBlock.__init__  s     ,,//!33//
 IIdoot/?/?eL	}}QVW[WgWgQh%iA&?&G%ij #66 &js   +Cr>   r   c                    |j                   \  }}}| j                  rQ| j                  dkD  rB|t        j                  |      j                  d| j                  z
  d| j                  z         z  }|j                  d|      }| j                  |      }t        j                  |dt        j                        }t        j                  || j                  d      \  }}||j                  dd      z  }|j                  |j                        }t        j                   ||z  |f|j                  |j"                  	      }t        j$                  j&                  j)                  || j*                  
      j-                  ddd      }	t        j.                  |	j                  d      d      j1                         }
|
D ]  }| j2                  |   }t        j4                  |	|   j7                  d            \  }}|d|f   j9                  d|      } ||      |||df   z  }|j;                  d||j                  |j                                |j9                  |||      }||fS ) r   g      ?r6   r!   r   re   T)rf   r7   )r8   device)num_classesr5   )r6   r   N)rD   r   r$  r*   
empty_likeuniform_r  r   r   r   r   topkr  sumr9   r8   r   r'  r   r   one_hotr  permutegreaternonzeror"  r   squeezer   
index_add_)r.   r>   r   sequence_lengthr  router_logitsrouting_weightsselected_expertsfinal_hidden_statesexpert_mask
expert_hit
expert_idxexpert_layeridxtop_xcurrent_stater  s                    r2   rA   zMiniMaxSparseMoeBlock.forward  s7   2?2E2E/
OZ==T..2U--m<EEcDL]L]F]_beievev_vwwM%**2z:		-0))MqL,1JJ

XZ,[))?..2t.DD),,]-@-@A#kk/):6m>Q>QZgZnZn
 hh))112BPTP`P`1aiijkmnpqr]];??x?#@!DLLN
$ 	dJ<<
3L[%<%D%DQ%GHJC *$+6>>r:NM$0$?/RWY\^bRbBc$c!  **1e5J5M5MmNaNa5bc	d 299*oWab"M11r3   )	rF   rG   rH   r  r(   r*   rr   rA   rI   rJ   s   @r2   r  r    s(    	7%2U\\ %2ell %2r3   r  c                       e Zd Zdedef fdZ eddd      	 	 	 	 	 	 	 ddej                  d	e	ej                  ej                  f   d
e
ej                     de
ej                     de
e	ej                        de
e   de
e   de
e   de
ej                     dee   de	ej                  e
e	ej                  ej                  f      f   fd       Z xZS )MiniMaxDecoderLayerru   rV   c                    t         |           |j                  | _        t        ||      | _        t        |      | _        t        |j                  |j                        | _	        t        |j                  |j                        | _
        || _        |j                  |   | _        |j                  | _        |j                  | _        | j                  dk(  r4t!        ||      | _        |j"                  | _        |j&                  | _        y t        ||      | _        |j*                  | _        |j,                  | _        y )Nr0   linear_attention)r'   r(   r/   r   	self_attnr  block_sparse_moer%   rms_norm_epsinput_layernormpost_attention_layernormrV   layer_types
layer_typemlp_alpha_factormlp_beta_factorrt   linear_attn_alpha_factorattn_alpha_factorlinear_attn_beta_factorattn_beta_factorfull_attn_alpha_factorfull_attn_beta_factorr   s      r2   r(   zMiniMaxDecoderLayer.__init__  s   !--)&)< 5f =-f.@.@fFYFYZ(6v7I7IvObOb(c%" ,,Y7 & 7 7%55??006vyIDN%+%D%DD"$*$B$BD!-fi@DN%+%B%BD"$*$@$@D!r3   r   r   r   r   r>   r   r   r   output_attentionsoutput_router_logits	use_cacher   r   r   c
                 2   | j                  |      }|} | j                  d||||||||	d|
\  }}|| j                  z  || j                  z  z   }| j	                  |      }|}| j                  |      \  }}|| j                  z  || j                  z  z   }|S )a  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            position_embeddings (`tuple[torch.FloatTensor, torch.FloatTensor]`):
                Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
                with `head_dim` being the embedding dimension of each attention head.
            attention_mask (`torch.Tensor`, *optional*): attention mask of size
                `(batch, sequence_length)` where padding elements are indicated by 0.
            past_key_values (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_router_logits (`bool`, *optional*):
                Whether or not to return the logits of all the routers. They are useful for computing the router loss, and
                should not be returned during inference.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence.
            kwargs (`dict`, *optional*):
                Arbitrary kwargs to be ignored, used for FSDP and other methods that injects code
                into the model
        )r>   r   r   r   r   rS  rU  r    )rG  rD  rN  rP  rH  rE  rK  rL  )r.   r>   r   r   r   r   rS  rT  rU  r   r   residualrW   s                r2   rA   zMiniMaxDecoderLayer.forward  s    N ,,];  *4>> 

' 3)%+/)

 

q !4#9#99MDLaLa<aa 55mD 00?q 4#8#88=4K_K_;__r3   )NNNFFFN)rF   rG   rH   r"   rq   r(   r   r*   rr   rC   r   r   r   r   r   FloatTensorrA   rI   rJ   s   @r2   r@  r@    s<   A} A A0 %0A6R
 26379=,1/4$)59=||= #5<<#=>= !.	=
 u//0= "%"56= $D>= 'tn= D>= !!1!12= -.= 
u  (51B1BEDUDU1U+V"WW	X= S=r3   r@  c                   `    e Zd ZU eed<   dZdZdgZdgZdZ	dZ
dZdZdZ eed      eeegd	Zy
)MiniMaxPreTrainedModelru   modelTr@  r   Fr!   )index)r4  r>   
attentionsN)rF   rG   rH   r"   __annotations__base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdpa_supports_flex_attn_can_compile_fullgraph_supports_attention_backendr    r  r@  r   rt   _can_record_outputsrW  r3   r2   r[  r[  B  sb    &*#./#4"5N""&'(=QG,')BCr3   r[  c                   ~     e Zd ZU ej                  ed<   ddef fdZ ej                         e	d               Z
 xZS )MiniMaxRotaryEmbeddinginv_freqru   c                    t         |           t        |d      rUt        |j                  t
              r;|j                  j                  d|j                  j                  d            | _        nd| _        |j                  | _	        |j                  | _
        || _        t        | j                     | _        | j                  | j                  |      \  }| _        | j                  d|d       | j                   | _        y )Nrope_scaling	rope_typetypedefaultrl  F)
persistent)r'   r(   hasattr
isinstancern  dictgetro  max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenru   r   rope_init_fnattention_scalingr   rl  original_inv_freq)r.   ru   r'  rl  r1   s       r2   r(   zMiniMaxRotaryEmbedding.__init__X  s    6>*z&:M:Mt/T#0044[&BUBUBYBYZ`BabDN&DN"("@"@$*$B$B!/?+/+<+<T[[&+Q($(ZeD!%r3   c                 b   | j                   d d d d f   j                         j                  |j                  d   dd      j	                  |j
                        }|d d d d d f   j                         }t        |j
                  j                  t              r/|j
                  j                  dk7  r|j
                  j                  nd}t        j                  |d      5  |j                         |j                         z  j                  dd      }t        j                  ||fd	      }|j                         | j                  z  }|j                         | j                  z  }	d d d        j	                  |j                   
      	j	                  |j                   
      fS # 1 sw Y   AxY w)Nr   r6   r!   mpscpuF)device_typeenabledr5   re   r   )rl  r   r   rD   r9   r'  rt  rp  strr*   autocastr   r   r   r{  r   r8   )
r.   r   r   inv_freq_expandedposition_ids_expandedr  freqsembr   r   s
             r2   rA   zMiniMaxRotaryEmbedding.forwardi  sV    !MM$4-8>>@GGHZHZ[\H]_acdehhijiqiqr ,QaZ 8 > > @'1!((--'E!((--[`J`ahhmmfk^^UC 	5&,,.1F1L1L1NNYYZ[]^_E))UEN3C'')d444C'')d444C		5 vvAGGv$cff177f&;;;	5 	5s    BF%%F.rN   )rF   rG   rH   r*   rr   r_  r"   r(   no_gradr   rA   rI   rJ   s   @r2   rk  rk  U  s=    ll/} /" U]]_<  <r3   rk  c                       e Zd Zdef fdZee	 	 	 	 	 	 	 	 ddej                  de	ej                     de	ej                     de	e   de	ej                     de	e   d	e	e   d
e	ej                     dee   defd              Z xZS )MiniMaxModelru   c           	         t         |   |       |j                  | _        |j                  | _        t        j                  |j                  |j                  | j                        | _        t        j                  t        |j                        D cg c]  }t        ||       c}      | _        t        |j                  |j                        | _        t#        |      | _        d| _        | j)                          y c c}w )NrB  )ru   F)r'   r(   pad_token_idpadding_idx
vocab_sizer   	Embeddingr/   embed_tokensr!  rS   r   r@  rh   r%   rF  r   rk  
rotary_embgradient_checkpointing	post_initr   s      r2   r(   zMiniMaxModel.__init__{  s     !.. ++LL):):F<N<NPTP`P`ammEJ6KcKcEde	 3e
 #6#5#56;N;NO	0?&+# 	 fs   D	input_idsr   r   r   inputs_embedsrU  rS  r   r   r   c	                    |d u |d uz  rt        d      |r|t               }n*|r(t        |t              st        dt        |       d      || j	                  |      }|F||j                         nd}
t        j                  |
|
|j                  d   z   |j                        }||j                  d      }| j                  j                  t        nt        } || j                  |||||      }|}| j                  ||      }| j                   D ]&  }|j"                  dk(  r|}n|} ||f||||||d	|	}( | j%                  |      }t'        ||
      S )Nz:You must specify exactly one of input_ids or inputs_embedszSMiniMax uses cache of its own and is not compatible with `past_key_values` of type .r   r!   )r'  )ru   input_embedsr   r   r   r   full_attention)r   r   r   r   rU  r   )last_hidden_stater   )
ValueErrorrL   rt  rp  r  get_seq_lengthr*   r   rD   r'  r   ru   r  r   r   r  rh   rJ  r   r   )r.   r  r   r   r   r  rU  rS  r   r   past_seen_tokensmask_functionr   r>   r   decoder_layerinput_attention_masks                    r2   rA   zMiniMaxModel.forward  s    -t";<YZZ0*nOz/<Hefjkzf{e||}~    --i8M!CRC^==?de"\\ "2]5H5H5K"KTaThThN )33A6L.2kk.H.H.P*Vw#;;&))+%
 & #oom\J![[ 	M''+;;'2$ (6$)	$73) /#-	 	M	$ 		-0%++
 	
r3   )NNNNNNNN)rF   rG   rH   r"   r(   r   r   r*   r   r   rr   rL   rY  r   r   r   r   rA   rI   rJ   s   @r2   r  r  y  s    }    '+15372659$(,059G
##G
 !.G
 u//0	G

 ",/G
   1 12G
 D>G
 $D>G
 !!1!12G
 +,G
 
 G
  G
r3   r  gate_logitsr  c                    | t        | t              syt        | t              rC| d   j                  }t        j                  | D cg c]  }|j                  |       c}d      }t        j                  j                  j                  d      }t        j                  ||d      \  }}	t        j                  j                  j                  |	|      }
|>t        j                  |
j                         d      }t        j                  |d      }n|j                  \  }}|j                  d   ||z  z  }|dddddddf   j                  |||||f      j                  d||      j                        }t        j                   |
j                         |z  d      t        j                   |d      z  }|ddddddf   j                  ||||f      j                  d|      j                  |      }t        j                   ||z  d      t        j                   |d      z  }t        j                   ||j#                  d      z        }||z  S c c}w )a  
    Computes auxiliary load balancing loss as in Switch Transformer - implemented in Pytorch.

    See Switch Transformer (https://huggingface.co/papers/2101.03961) for more details. This function implements the loss
    function presented in equations (4) - (6) of the paper. It aims at penalizing cases where the routing between
    experts is too unbalanced.

    Args:
        gate_logits:
            Logits from the `gate`, should be a tuple of model.config.num_hidden_layers tensors of
            shape [batch_size X sequence_length, num_experts].
        num_experts:
            Number of experts
        top_k:
            The number of experts to route per-token, can be also interpreted as the `top-k` routing
            parameter.
        attention_mask (`torch.Tensor`, *optional*):
            The attention_mask used in forward function
            shape [batch_size X sequence_length] if not None.

    Returns:
        The auxiliary loss.
    Nr   re   r6   )rt  rC   r'  r*   r   r9   r   r   r   r+  r-  r<   r   rD   r   r   r,  r   )r  r  r  r   compute_device
layer_gateconcatenated_gate_logitsr5  rW   r6  r8  tokens_per_expertrouter_prob_per_expertr   r3  r   expert_attention_mask router_per_expert_attention_maskoverall_losss                      r2   load_balancing_loss_funcr    s9   : *[%"@+u%$Q..#(99^i-jPZjmmN.K-jpq#r hh))112JPR1SO**_eDA((%%--.>LK!JJ{'8'8':B "'O!C&4&:&:#
O4::1=*B^_ 4AtT12V&
OUKXYWR,R	 	 "IIk&7&7&9<Q&QWXY\a\e\e!q]
 
 4At+,V&
O[QRWR%R	 	) "'?=]+]cd!ehmhqhq,!i
 "
 99.1G1Q1QRS1TTUL+%%[ .ks   Ic                   p    e Zd ZdgZddiZddgdgfiZ fdZee	 	 	 	 	 	 	 	 	 	 dde	e
j                     de	e
j                     d	e	e
j                     d
e	e   de	e
j                     de	e
j                     de	e   de	e   de	e
j                     deee
j                  f   dee   defd              Z xZS )MiniMaxForCausalLMzlm_head.weightlm_headcolwise_repr>   logitsc                 N   t         |   |       t        |      | _        |j                  | _        t        j                  |j                  |j                  d      | _        |j                  | _	        |j                  | _        |j                  | _        | j                          y r  )r'   r(   r  r\  r  r   r   r/   r  router_aux_loss_coefr  r  r  r  r  s     r2   r(   zMiniMaxForCausalLM.__init__/  s     !&)
 ++yy!3!3V5F5FUS$*$?$?!!33#)#=#=  	r3   r  r   r   r   r  labelsrU  rT  r   logits_to_keepr   r   c                 l   ||n| j                   j                  } | j                  d||||||||	d|}|j                  }t	        |
t
              rt        |
 d      n|
}| j                  |dd|ddf         }d}| | j                  ||| j                  fi |}d}|rYt        |j                  | j                  | j                  |      }|+|| j                  |j                  |j                         z  z  }t#        ||||j$                  |j&                  |j(                  |j                        S )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, MiniMaxForCausalLM

        >>> model = MiniMaxForCausalLM.from_pretrained("MiniMaxAI/MiniMax-Text-01-hf")
        >>> tokenizer = AutoTokenizer.from_pretrained("MiniMaxAI/MiniMax-Text-01-hf")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```N)r  r   r   r   r  rU  rT  r   )lossaux_lossr  r   r>   r^  r4  rW  )ru   rT  r\  r  rt  rq   slicer  loss_functionr  r  r4  r  r  r  r9   r'  r   r   r>   r^  )r.   r  r   r   r   r  r  rU  rT  r   r  r   outputsr>   slice_indicesr  r  r  s                     r2   rA   zMiniMaxForCausalLM.forward;  sX   P %9$D $++JjJj 	
 +5$** 
+
)%+'!5)
+
 
+
  118B>SV8W~ot4]kmA}a,?@A%4%%ffdooPPD/%%  ((	H !11HKK4LLL(#33!//))!//
 	
r3   )
NNNNNNNNNr   )rF   rG   rH   _tied_weights_keys_tp_plan_pp_planr(   r   r   r   r*   r   rr   r
   rY  r   r   rq   r   r   r   rA   rI   rJ   s   @r2   r  r  )  sO   *+=)H_-z:;H
  151537+/59-1$(/35934R
E,,-R
 !.R
 u//0	R

 "%R
   1 12R
 ))*R
 D>R
 'tnR
 !!1!12R
 c5<</0R
 +,R
 
#R
  R
r3   r  c                       e Zd Zy) MiniMaxForSequenceClassificationNrF   rG   rH   rW  r3   r2   r  r        r3   r  c                       e Zd Zy)MiniMaxForTokenClassificationNr  rW  r3   r2   r  r    r  r3   r  c                       e Zd Zy)MiniMaxForQuestionAnsweringNr  rW  r3   r2   r  r    r  r3   r  )r[  r  r  r  r  r  rR   )r  )Nr5   N)Ltypingr   r   r   r*   torch.nn.functionalr   r   r   transformers.utils.genericr   activationsr	   cache_utilsr
   r   
generationr   integrationsr   masking_utilsr   r   modeling_flash_attention_utilsr   modeling_layersr   r   r   r   modeling_outputsr   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   utils.deprecationr   utils.genericr    configuration_minimaxr"   Moduler%   rL   rt   r   r   rr   rq   r   r   r   r   r  r  r@  r[  rk  r  rC   r  r  r  r  r  __all__rW  r3   r2   <module>r     s  . - ,     9 ! . ) 7 R B  R K F & I I 0 + 0 Y'JRYY J (J(+I< +I\Q/		 Q/h(6	UU\\ 	U# 	U%,, 	U& %II%<<% 
% <<	%
 U\\*% % % '(%4<)ryy <)~%		 %$@2BII @2FW4 Wt _  $!<RYY !<H Z
) Z
 Z
~ "&
-1	O&u||U5<<%8$>?O&#O& U\\*	O&
 5<<O&d e
/ e
 e
P	'GI_ 		$ACY 		"=?U 	r3   