
    h"l                     f   d Z ddlmZ ddlZddlmc mZ ddlmZ ddlm	Z	 ddl
mZmZ ddlmZ dd	lmZmZ dd
lmZ ddlmZ ddlmZ ddlmZ ddlmZmZ ddlmZ ddlm Z  ddl!m"Z" ddl#m$Z$m%Z%m&Z&m'Z'm(Z(m)Z)m*Z*m+Z+m,Z,m-Z-  ej\                  e/      Z0 G d de"      Z1 G d de,      Z2 G d de      Z3 G d dejh                        Z5 G d de$      Z6 G d de-      Z7 G d  d!e%e      Z8 G d" d#e+      Z9 G d$ d%e*      Z: G d& d'e&      Z; G d( d)e(      Z< G d* d+e)      Z= G d, d-e'      Z>g d.Z?y)/zPyTorch MiniMax model.    )OptionalN)nn   )ACT2FN)CacheDynamicCache)layer_type_validation)create_causal_mask!create_sliding_window_causal_mask)FlashAttentionKwargs)GradientCheckpointingLayer)MoeModelOutputWithPast)Unpack)TransformersKwargslogging)deprecate_kwarg)OutputRecorder   )MixtralConfig)
MixtralAttentionMixtralDecoderLayerMixtralForCausalLMMixtralForQuestionAnswering MixtralForSequenceClassificationMixtralForTokenClassificationMixtralModelMixtralPreTrainedModelMixtralRMSNormMixtralSparseMoeBlockc                   4     e Zd ZdZ	 	 	 	 	 	 	 	 d fd	Z xZS )MiniMaxConfiga  
    This is the configuration class to store the configuration of a [`MiniMaxModel`]. It is used to instantiate an
    MiniMax model according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the MiniMax.

    [MiniMaxAI/MiniMax-Text-01-hf](https://huggingface.co/MiniMaxAI/MiniMax-Text-01-hf)

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.


    Args:
        vocab_size (`int`, *optional*, defaults to 32000):
            Vocabulary size of the MiniMax model. Defines the number of different tokens that can be represented by the
            `input_ids` passed when calling [`MiniMaxModel`]
        hidden_size (`int`, *optional*, defaults to 4096):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 14336):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 32):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 32):
            Number of attention heads for each attention layer in the Transformer encoder.
        num_key_value_heads (`int`, *optional*, defaults to 8):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
            by meanpooling all the original heads within that group. For more details, check out [this
            paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to `8`.
        head_dim (`int`, *optional*, defaults to `hidden_size // num_attention_heads`):
            The attention head dimension.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to `4096*32`):
            The maximum sequence length that this model might ever be used with. MiniMax's sliding window attention
            allows sequences of up to 4096*32 tokens.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        pad_token_id (`int`, *optional*):
            The id of the padding token.
        bos_token_id (`int`, *optional*, defaults to 1):
            The id of the "beginning-of-sequence" token.
        eos_token_id (`int`, *optional*, defaults to 2):
            The id of the "end-of-sequence" token.
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether the model's input and output word embeddings should be tied.
        rope_theta (`float`, *optional*, defaults to 1000000.0):
            The base period of the RoPE embeddings.
        sliding_window (`int`, *optional*):
            Sliding window attention window size. If not specified, will default to `4096`.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        num_experts_per_tok (`int`, *optional*, defaults to 2):
            The number of experts to route per token; this can also be interpreted as the `top-k` routing
            parameter.
        num_local_experts (`int`, *optional*, defaults to 8):
            Number of experts per Sparse MLP layer.
        output_router_logits (`bool`, *optional*, defaults to `False`):
            Whether or not the router logits should be returned by the model. Enabling this will also
            allow the model to output the auxiliary (router) loss.
        router_aux_loss_coef (`float`, *optional*, defaults to 0.001):
            The aux loss factor for the total loss.
        router_jitter_noise (`float`, *optional*, defaults to 0.0):
            Amount of noise to add to the router.
        layer_types (`list`, *optional*):
            Attention pattern for each layer.
        block_size (`int`, *optional*, defaults to 256):
            The length of each attention block, determining how queries, keys, and values
            are grouped and processed for intra- and inter-block attention.
        full_attn_alpha_factor (`float`, *optional*, defaults to 1):
            Weight for residual value in residual connection after normal attention.
        full_attn_beta_factor (`float`, *optional*, defaults to 1):
            Weight for hidden state value in residual connection after normal attention.
        linear_attn_alpha_factor (`float`, *optional*, defaults to 1):
            Weight for residual value in residual connection after lightning attention.
        linear_attn_beta_factor (`float`, *optional*, defaults to 1):
            Weight for hidden state value in residual connection after lightning attention.
        mlp_alpha_factor (`float`, *optional*, defaults to 1):
            Weight for residual value in residual connection after MLP.
        mlp_beta_factor (`float`, *optional*, defaults to 1):
            Weight for hidden state value in residual connection after MLP.
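
    When `layer_types` is left unset, `__init__` (below) fills it with a pattern that alternates standard
    full attention and lightning (linear) attention across `num_hidden_layers`, and the resulting list is
    checked with `layer_type_validation`.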

    ```python
    >>> from transformers import MiniMaxModel, MiniMaxConfig

    >>> # Initializing a MiniMax style configuration
    >>> configuration = MiniMaxConfig()

    >>> # Initializing a model from the MiniMax style configuration
    >>> model = MiniMaxModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
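
    >>> # Illustrative overrides (hypothetical values, not tied to a released checkpoint): the
    >>> # MiniMax-specific attention pattern and block size can also be set explicitly.
    >>> custom_configuration = MiniMaxConfig(
    ...     layer_types=["full_attention", "linear_attention"] * 16,
    ...     block_size=256,
    ... )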
    ```"""

    def __init__(
        self,
        layer_types=None,
        block_size=256,
        full_attn_alpha_factor=1,
        full_attn_beta_factor=1,
        linear_attn_alpha_factor=1,
        linear_attn_beta_factor=1,
        mlp_alpha_factor=1,
        mlp_beta_factor=1,
        **super_kwargs,
    ):
        super().__init__(**super_kwargs)

        self.layer_types = layer_types
        self.block_size = block_size
        self.full_attn_alpha_factor = full_attn_alpha_factor
        self.full_attn_beta_factor = full_attn_beta_factor
        self.linear_attn_alpha_factor = linear_attn_alpha_factor
        self.linear_attn_beta_factor = linear_attn_beta_factor
        self.mlp_alpha_factor = mlp_alpha_factor
        self.mlp_beta_factor = mlp_beta_factor

        if self.layer_types is None:
            self.layer_types = [
                "full_attention" if bool((i + 1) % 2) else "linear_attention"
                for i in range(self.num_hidden_layers)
            ]
        layer_type_validation(self.layer_types)


class MiniMaxRMSNorm(MixtralRMSNorm):
    pass


class MiniMaxCache(DynamicCache):
    def __init__(self):
        super().__init__()
        self.linear_cache = []

    def set_linear_cache(self, layer_idx, linear_cache):
        # There may be skipped layers, fill them with empty lists
        for _ in range(len(self.linear_cache), layer_idx + 1):
            self.linear_cache.append([])
        self.linear_cache[layer_idx] = linear_cache

    def get_linear_cache(self, layer_idx: int):
        if layer_idx < len(self):
            return self.linear_cache[layer_idx]
        return None

    def __len__(self):
        return max(super().__len__(), len(self.linear_cache))

    def __getitem__(self, layer_idx: int):
        if layer_idx < len(self.linear_cache) and self.linear_cache[layer_idx] != []:
            return (self.linear_cache[layer_idx],)
        return super().__getitem__(layer_idx)

    def __iter__(self):
        for layer_idx in range(len(self)):
            yield self[layer_idx]

    def batch_repeat_interleave(self, repeats: int):
        for layer_idx in range(len(self)):
            if self.linear_cache[layer_idx] != []:
                self.linear_cache[layer_idx] = self.linear_cache[layer_idx].repeat_interleave(repeats, dim=0)
            else:
                self.layers[layer_idx].batch_repeat_interleave(repeats)

    def batch_select_indices(self, indices: torch.Tensor):
        for layer_idx in range(len(self)):
            if self.linear_cache[layer_idx] != []:
                self.linear_cache[layer_idx] = self.linear_cache[layer_idx][indices, ...]
            else:
                self.layers[layer_idx].batch_select_indices(indices)

    def crop(self, max_length: int):
        raise RuntimeError("MiniMaxCache does not support the `crop` method")


class MiniMaxLightningAttention(nn.Module):
    def __init__(self, config: MiniMaxConfig, layer_idx: int):
        super().__init__()
        self.config = config
        self.layer_idx = layer_idx
        self.head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads
        self.num_attention_heads = config.num_attention_heads
        self.num_hidden_layers = config.num_hidden_layers
        self.block_size = config.block_size

        self.act_fn = ACT2FN[config.hidden_act]
        self.norm = MiniMaxRMSNorm(self.head_dim * self.num_attention_heads)
        self.qkv_proj = nn.Linear(config.hidden_size, self.num_attention_heads * self.head_dim * 3, bias=False)
        self.out_proj = nn.Linear(self.num_attention_heads * self.head_dim, config.hidden_size, bias=False)
        self.output_gate = nn.Linear(config.hidden_size, self.num_attention_heads * self.head_dim, bias=False)

        slope_rate = self.get_slope_rate()
        query_decay, key_decay, diagonal_decay = self.decay_factors(slope_rate)

        self.register_buffer("slope_rate", slope_rate)
        self.register_buffer("query_decay", query_decay)
        self.register_buffer("key_decay", key_decay)
        self.register_buffer("diagonal_decay", diagonal_decay)

    def get_slope_rate(self):
        base = 1 / (2 ** (8 / self.num_attention_heads))
        exponent = torch.arange(self.num_attention_heads) + 1
        factor = 1 - self.layer_idx / (self.num_hidden_layers - 1 + 1e-5) + 1e-5

        rate = base**exponent
        rate = rate * factor
        rate = rate[:, None, None]

        return rate

    def decay_factors(self, slope_rate):
        block_size_range = torch.arange(self.block_size) + 1

        query_decay = torch.exp(-slope_rate * block_size_range[:, None])
        key_decay = torch.exp(-slope_rate * (self.block_size - block_size_range[:, None]))

        diagonal_decay = block_size_range[:, None] - block_size_range[None, :]
        diagonal_decay = diagonal_decay[None, None, :, :]
        diagonal_decay = diagonal_decay * slope_rate
        diagonal_decay = torch.where(diagonal_decay >= 0, -diagonal_decay, float("-inf"))
        diagonal_decay = torch.exp(diagonal_decay)

        return query_decay, key_decay, diagonal_decay

    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
    def forward(
        self,
        hidden_states: torch.Tensor,
        position_embeddings: tuple[torch.Tensor, torch.Tensor],
        attention_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[Cache] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
        batch_size, seq_len, hidden_size = hidden_states.shape
        num_blocks = (seq_len + self.block_size - 1) // self.block_size

        qkv_states = self.act_fn(self.qkv_proj(hidden_states))
        qkv_states = qkv_states.reshape(batch_size, seq_len, self.num_attention_heads, 3 * self.head_dim)

        query_states, key_states, value_states = torch.split(qkv_states, self.head_dim, dim=3)

        query_states = query_states.transpose(1, 2)
        key_states = key_states.transpose(1, 2)
        value_states = value_states.transpose(1, 2)

        # the running (K.T @ V) state is what gets saved as the linear cache
        attn_weights_inter = None
        if past_key_values is not None:
            attn_weights_inter = past_key_values.get_linear_cache(self.layer_idx)

        if attn_weights_inter is None:
            attn_weights_inter = torch.zeros(batch_size, self.num_attention_heads, self.head_dim, self.head_dim).to(
                value_states
            )

            # apply attention_mask by zeroing out padded value positions
            if attention_mask is not None:
                attention_mask = attention_mask.to(dtype=torch.bool)
                value_states = value_states.masked_fill(~attention_mask.unsqueeze(1).unsqueeze(-1), 0)

            attn_output = []
            for i in range(num_blocks):
                start_idx = i * self.block_size
                end_idx = min(start_idx + self.block_size, seq_len)
                current_block_size = end_idx - start_idx
                current_query_states = query_states[:, :, start_idx:end_idx]
                current_key_states = key_states[:, :, start_idx:end_idx]
                current_value_states = value_states[:, :, start_idx:end_idx]

                current_query_decay = self.query_decay[:, :current_block_size]
                current_key_decay = self.key_decay[:, -current_block_size:]
                current_diagonal_decay = self.diagonal_decay[:, :, :current_block_size, :current_block_size]
                block_decay = torch.exp(-self.slope_rate * current_block_size)

                # intra-block: ( Q @ K.T ) @ V with a causal, decayed mask
                attn_weights_intra = torch.matmul(current_query_states, current_key_states.transpose(-1, -2))
                attn_output_intra = torch.matmul(attn_weights_intra * current_diagonal_decay, current_value_states)

                # inter-block: Q @ ( K.T @ V ), i.e. decayed queries against the running state
                attn_output_inter = torch.matmul(current_query_states * current_query_decay, attn_weights_inter)

                # final attention output for this block
                current_attn_output = attn_output_inter + attn_output_intra
                attn_output.append(current_attn_output)

                # calculate attn_weights_inter for the next block (or the cache)
                next_attn_weights_inter = torch.matmul(
                    (current_key_states * current_key_decay).transpose(-1, -2), current_value_states
                )
                attn_weights_inter = attn_weights_inter * block_decay + next_attn_weights_inter

        else:
            # decoding path: update the recurrent state one token at a time
            ratio = torch.exp(-self.slope_rate)
            attn_output = []
            for i in range(seq_len):
                current_query_states = query_states[:, :, i : i + 1]
                current_key_states = key_states[:, :, i : i + 1]
                current_value_states = value_states[:, :, i : i + 1]

                current_attn_weights_inter = torch.matmul(current_key_states.transpose(-1, -2), current_value_states)
                attn_weights_inter = ratio * attn_weights_inter + current_attn_weights_inter
                current_attn_output = torch.matmul(current_query_states, attn_weights_inter)

                attn_output.append(current_attn_output)

        # concatenate attention outputs over all blocks
        attn_output = torch.cat(attn_output, dim=-2)

        # final gated output projection
        attn_output = attn_output.transpose(1, 2)
        attn_output = attn_output.reshape(batch_size, seq_len, self.num_attention_heads * self.head_dim)
        attn_output = self.norm(attn_output)
        attn_output = F.sigmoid(self.output_gate(hidden_states)) * attn_output
        attn_output = self.out_proj(attn_output)

        # update cache
        if past_key_values is not None:
            past_key_values.set_linear_cache(self.layer_idx, attn_weights_inter)

        return attn_output, attn_weights_inter


class MiniMaxAttention(MixtralAttention):
    pass


class MiniMaxSparseMoeBlock(MixtralSparseMoeBlock):
    pass


class MiniMaxDecoderLayer(MixtralDecoderLayer, GradientCheckpointingLayer):
    def __init__(self, config: MiniMaxConfig, layer_idx: int):
        super().__init__(config, layer_idx)

        self.layer_idx = layer_idx
        self.layer_type = config.layer_types[layer_idx]
        self.mlp_alpha_factor = config.mlp_alpha_factor
        self.mlp_beta_factor = config.mlp_beta_factor

        if self.layer_type == "linear_attention":
            self.self_attn = MiniMaxLightningAttention(config, layer_idx)
            self.attn_alpha_factor = config.linear_attn_alpha_factor
            self.attn_beta_factor = config.linear_attn_beta_factor
        else:
            self.self_attn = MiniMaxAttention(config, layer_idx)
            self.attn_alpha_factor = config.full_attn_alpha_factor
            self.attn_beta_factor = config.full_attn_beta_factor

    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
    def forward(
        self,
        hidden_states: torch.Tensor,
        position_embeddings: tuple[torch.Tensor, torch.Tensor],
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[tuple[torch.Tensor]] = None,
        output_attentions: Optional[bool] = False,
        output_router_logits: Optional[bool] = False,
        use_cache: Optional[bool] = False,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> tuple[torch.Tensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]:
        r"""
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            position_embeddings (`tuple[torch.FloatTensor, torch.FloatTensor]`):
                Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
                with `head_dim` being the embedding dimension of each attention head.
            attention_mask (`torch.Tensor`, *optional*): attention mask of size
                `(batch, sequence_length)` where padding elements are indicated by 0.
            past_key_values (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_router_logits (`bool`, *optional*):
                Whether or not to return the logits of all the routers. They are useful for computing the router loss,
                and should not be returned during inference.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence.
            kwargs (`dict`, *optional*):
                Arbitrary kwargs to be ignored, used for FSDP and other methods that inject code
                into the model
        """
        hidden_states = self.input_layernorm(hidden_states)

        residual = hidden_states
        hidden_states, _ = self.self_attn(
            hidden_states=hidden_states,
            position_embeddings=position_embeddings,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            output_attentions=output_attentions,
            use_cache=use_cache,
            cache_position=cache_position,
            **kwargs,
        )
        hidden_states = residual * self.attn_alpha_factor + hidden_states * self.attn_beta_factor

        hidden_states = self.post_attention_layernorm(hidden_states)

        residual = hidden_states
        hidden_states, _ = self.block_sparse_moe(hidden_states)
        hidden_states = residual * self.mlp_alpha_factor + hidden_states * self.mlp_beta_factor

        return hidden_states


class MiniMaxPreTrainedModel(MixtralPreTrainedModel):
    _can_compile_fullgraph = False
    _can_record_outputs = {
        "router_logits": OutputRecorder(MiniMaxSparseMoeBlock, index=1),
        "hidden_states": MiniMaxDecoderLayer,
        "attentions": [MiniMaxAttention, MiniMaxLightningAttention],
    }


class MiniMaxModel(MixtralModel):
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[MiniMaxCache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> MoeModelOutputWithPast:
        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

        if use_cache and past_key_values is None:
            past_key_values = MiniMaxCache()
        elif use_cache and not isinstance(past_key_values, MiniMaxCache):
            raise ValueError(
                f"MiniMax uses cache of its own and is not compatible with `past_key_values` of type {type(past_key_values)}."
            )

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)

        if cache_position is None:
            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
            cache_position = torch.arange(
                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
            )
        if position_ids is None:
            position_ids = cache_position.unsqueeze(0)

        mask_function = create_causal_mask if self.config.sliding_window is None else create_sliding_window_causal_mask
        causal_mask = mask_function(
            config=self.config,
            input_embeds=inputs_embeds,
            attention_mask=attention_mask,
            cache_position=cache_position,
            past_key_values=past_key_values,
            position_ids=position_ids,
        )

        hidden_states = inputs_embeds
        position_embeddings = self.rotary_emb(hidden_states, position_ids)

        for decoder_layer in self.layers:
            if decoder_layer.layer_type == "full_attention":
                # full-attention layers consume the prepared causal mask
                input_attention_mask = causal_mask
            else:
                # lightning-attention layers take the raw padding mask
                input_attention_mask = attention_mask

            hidden_states = decoder_layer(
                hidden_states,
                position_embeddings=position_embeddings,
                attention_mask=input_attention_mask,
                past_key_values=past_key_values,
                cache_position=cache_position,
                **kwargs,
            )

        hidden_states = self.norm(hidden_states)
        return MoeModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=past_key_values,
        )


class MiniMaxForCausalLM(MixtralForCausalLM):
    def forward(self, **super_kwargs):
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, MiniMaxForCausalLM

        >>> model = MiniMaxForCausalLM.from_pretrained("MiniMaxAI/MiniMax-Text-01-hf")
        >>> tokenizer = AutoTokenizer.from_pretrained("MiniMaxAI/MiniMax-Text-01-hf")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
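        >>> # Generation allocates a `MiniMaxCache` internally because some layers use lightning (linear)
        >>> # attention; passing a plain `DynamicCache` as `past_key_values` would be rejected by `MiniMaxModel`.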
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```"""
        return super().forward(**super_kwargs)


class MiniMaxForSequenceClassification(MixtralForSequenceClassification):
    pass


class MiniMaxForTokenClassification(MixtralForTokenClassification):
    pass


class MiniMaxForQuestionAnswering(MixtralForQuestionAnswering):
    pass


__all__ = [
    "MiniMaxConfig",
    "MiniMaxForCausalLM",
    "MiniMaxModel",
    "MiniMaxPreTrainedModel",
    "MiniMaxForSequenceClassification",
    "MiniMaxForTokenClassification",
    "MiniMaxForQuestionAnswering",
]