
    hVv                     T   d dl mZmZmZ d dlZd dlmc mZ d dlmZ d dl	m
Z
 ddlmZ ddlmZmZ ddlmZ dd	lmZ dd
lmZmZ ddlmZ ddlmZmZmZmZ ddlmZm Z  ddl!m"Z"m#Z# ddl$m%Z%m&Z& ddl'm(Z( ddl)m*Z*m+Z+m,Z, ddl-m.Z. ddl/m0Z0 ddl1m2Z2  G d dejf                        Z4 G d dejf                        Z5 ed       G d dejf                               Z6d Z7dBdZ8dejr                  d e:d!ejr                  fd"Z;	 dCd#ejf                  d$ejr                  d%ejr                  d&ejr                  d'eejr                     d(e<d)e<d*e(e*   fd+Z= G d, d-ejf                        Z> G d. d/e      Z? G d0 d1ejf                        Z@e+ G d2 d3e&             ZAe+ G d4 d5eA             ZB	 	 	 dDd6eejr                  eCejr                     df   d7ee:   d'eejr                     d!eejr                  e:f   fd8ZDe+ G d9 d:eAe             ZE G d; d<eeA      ZF G d= d>eeA      ZG G d? d@eeA      ZHg dAZIy)E    )CallableOptionalUnionN)nn)check_model_inputs   )ACT2FN)CacheDynamicCache)GenerationMixin)use_kernel_forward_from_hub)create_causal_mask!create_sliding_window_causal_mask)FlashAttentionKwargs)GenericForQuestionAnswering GenericForSequenceClassificationGenericForTokenClassificationGradientCheckpointingLayer)MoeCausalLMOutputWithPastMoeModelOutputWithPast)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tuple)deprecate_kwarg)OutputRecorder   )MixtralConfigc                   *     e Zd Zdef fdZd Z xZS )MixtralBlockSparseTop2MLPconfigc                    t         |           |j                  | _        |j                  | _        t        j                  | j
                  | j                  d      | _        t        j                  | j                  | j
                  d      | _	        t        j                  | j
                  | j                  d      | _
        t        |j                     | _        y NFbias)super__init__intermediate_sizeffn_dimhidden_size
hidden_dimr   Linearw1w2w3r	   
hidden_actact_fnselfr%   	__class__s     j/var/www/html/eduruby.in/venv/lib/python3.12/site-packages/transformers/models/mixtral/modeling_mixtral.pyr+   z"MixtralBlockSparseTop2MLP.__init__:   s    // ,,))DOOT\\F))DLL$//F))DOOT\\FV../    c                     | j                  | j                  |            | j                  |      z  }| j                  |      }|S N)r5   r1   r3   r2   )r7   hidden_statescurrent_hidden_statess      r9   forwardz!MixtralBlockSparseTop2MLP.forwardE   s>     $DGGM,B CdggmF\ \ $(= >$$r:   )__name__
__module____qualname__r"   r+   r?   __classcell__r8   s   @r9   r$   r$   9   s    	0} 	0%r:   r$   c                   Z     e Zd ZdZ fdZdej                  dej                  fdZ xZS )MixtralSparseMoeBlocka  
    This implementation is
    strictly equivalent to standard MoE with full capacity (no
    dropped tokens). It's faster since it formulates MoE operations
    in terms of block-sparse operations to accommodate imbalanced
    assignments of tokens to experts, whereas standard MoE either
    (1) drop tokens at the cost of reduced performance or (2) set
    capacity factor to number of experts and thus waste computation
    and memory on padding.
    c                    t         |           |j                  | _        |j                  | _        |j                  | _        |j                  | _	        t        j                  | j                  | j                  d      | _        t        j                  t        | j                        D cg c]  }t        |       c}      | _        |j"                  | _        y c c}w r'   )r*   r+   r.   r/   r,   r-   num_local_expertsnum_expertsnum_experts_per_toktop_kr   r0   gate
ModuleListranger$   expertsrouter_jitter_noisejitter_noise)r7   r%   _r8   s      r9   r+   zMixtralSparseMoeBlock.__init__W   s     ,,//!33//
 IIdoot/?/?eL	}}QVW[WgWgQh%iA&?&G%ij #66 &js   +Cr=   returnc                    |j                   \  }}}| j                  rQ| j                  dkD  rB|t        j                  |      j                  d| j                  z
  d| j                  z         z  }|j                  d|      }| j                  |      }t        j                  |dt        j                        }t        j                  || j                  d      \  }}||j                  dd      z  }|j                  |j                        }t        j                   ||z  |f|j                  |j"                  	      }t        j$                  j&                  j)                  || j*                  
      j-                  ddd      }	t        j.                  |	j                  d      d      j1                         }
|
D ]  }| j2                  |   }t        j4                  |	|   j7                  d            \  }}|d|f   j9                  d|      } ||      |||df   z  }|j;                  d||j                  |j                                |j9                  |||      }||fS ) r   g      ?r!   dimdtyperX   T)rX   keepdim)rY   device)num_classes   )rV   N)shapetrainingrQ   torch
empty_likeuniform_viewrL   FsoftmaxfloattopkrK   sumtorY   zerosr\   r   
functionalone_hotrI   permutegreaternonzerorO   wheresqueezereshape
index_add_)r7   r=   
batch_sizesequence_lengthr/   router_logitsrouting_weightsselected_expertsfinal_hidden_statesexpert_mask
expert_hit
expert_idxexpert_layeridxtop_xcurrent_stater>   s                    r9   r?   zMixtralSparseMoeBlock.forwardf   s7   2?2E2E/
OZ==T..2U--m<EEcDL]L]F]_beievev_vwwM%**2z:		-0))MqL,1JJ

XZ,[))?..2t.DD),,]-@-@A#kk/):6m>Q>QZgZnZn
 hh))112BPTP`P`1aiijkmnpqr]];??x?#@!DLLN
$ 	dJ<<
3L[%<%D%DQ%GHJC *$+6>>r:NM$0$?/RWY\^bRbBc$c!  **1e5J5M5MmNaNa5bc	d 299*oWab"M11r:   )	r@   rA   rB   __doc__r+   rb   Tensorr?   rC   rD   s   @r9   rF   rF   K   s(    	7%2U\\ %2ell %2r:   rF   RMSNormc                   ,     e Zd Zd fd	Zd Zd Z xZS )MixtralRMSNormc                     t         |           t        j                  t	        j
                  |            | _        || _        y)z=
        MixtralRMSNorm is equivalent to T5LayerNorm
        N)r*   r+   r   	Parameterrb   onesweightvariance_epsilon)r7   r.   epsr8   s      r9   r+   zMixtralRMSNorm.__init__   s1     	ll5::k#:; #r:   c                 "   |j                   }|j                  t        j                        }|j	                  d      j                  dd      }|t        j                  || j                  z         z  }| j                  |j                  |      z  S )Nr^   rV   T)r[   )	rY   rk   rb   float32powmeanrsqrtr   r   )r7   r=   input_dtypevariances       r9   r?   zMixtralRMSNorm.forward   sy    #))%((7 $$Q',,R,>%Ht?T?T4T(UU{{]--k:::r:   c                 ^    t        | j                  j                         d| j                   S )Nz, eps=)tupler   r`   r   )r7   s    r9   
extra_reprzMixtralRMSNorm.extra_repr   s*    ))*+6$2G2G1HIIr:   )gư>)r@   rA   rB   r+   r?   r   rC   rD   s   @r9   r   r      s    $;Jr:   r   c                     | dd| j                   d   dz  f   }| d| j                   d   dz  df   }t        j                  | |fd      S )z*Rotates half the hidden dims of the input..NrV   r^   rZ   )r`   rb   cat)xx1x2s      r9   rotate_halfr      sZ    	
3"!''"+"""	#B	
3q ""	#B99rc2YB''r:   c                     |j                  |      }|j                  |      }| |z  t        |       |z  z   }||z  t        |      |z  z   }||fS )a  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    )	unsqueezer   )qkcossinposition_idsunsqueeze_dimq_embedk_embeds           r9   apply_rotary_pos_embr      sY    ( --
&C
--
&C3w;q>C/0G3w;q>C/0GGr:   r=   n_reprS   c                     | j                   \  }}}}|dk(  r| S | dddddddddf   j                  |||||      } | j                  |||z  ||      S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    r!   N)r`   expandrt   )r=   r   batchnum_key_value_headsslenhead_dims         r9   	repeat_kvr      so    
 2?1D1D.Ehz!!Qa"23::5BUW\^bdlmM  (;e(CT8TTr:   modulequerykeyvalueattention_maskscalingdropoutkwargsc                 T   t        || j                        }t        || j                        }	t        j                  ||j	                  dd            |z  }
|#|d d d d d d d |j
                  d   f   }|
|z   }
t        j                  j                  |
dt        j                        j                  |j                        }
t        j                  j                  |
|| j                        }
t        j                  |
|	      }|j	                  dd      j                         }||
fS )Nr^   r   r_   rV   rW   )pra   r!   )r   num_key_value_groupsrb   matmul	transposer`   r   rm   rg   r   rk   rY   r   ra   
contiguous)r   r   r   r   r   r   r   r   
key_statesvalue_statesattn_weightscausal_maskattn_outputs                r9   eager_attention_forwardr      s    3 ; ;<JUF$?$?@L<<z';';Aq'ABWLL!$Q1.D
0@0@0D.D%DE#k1==((2U]](SVVW\WbWbcL==((6??([L,,|\:K''1-88:K$$r:   c                   0    e Zd ZdZdedef fdZ eddd      	 	 dd	ej                  d
e
ej                  ej                  f   deej                     dee   deej                     dee   de
ej                  eej                     f   fd       Z xZS )MixtralAttentionz=Multi-headed attention from 'Attention Is All You Need' paperr%   	layer_idxc                    t         |           || _        || _        t	        |dd       xs |j
                  |j                  z  | _        |j                  |j                  z  | _	        | j                  dz  | _
        |j                  | _        d| _        t        j                  |j
                  |j                  | j                  z  d      | _        t        j                  |j
                  |j                  | j                  z  d      | _        t        j                  |j
                  |j                  | j                  z  d      | _        t        j                  |j                  | j                  z  |j
                  d      | _        y )Nr   g      TFr(   )r*   r+   r%   r   getattrr.   num_attention_headsr   r   r   r   attention_dropout	is_causalr   r0   q_projk_projv_projo_projr7   r%   r   r8   s      r9   r+   zMixtralAttention.__init__   s2   "
D9mV=O=OSYSmSm=m$*$>$>&B\B\$\!}}d*!'!9!9ii 2 2F4N4NQUQ^Q^4^ejkii 2 2F4N4NQUQ^Q^4^ejkii 2 2F4N4NQUQ^Q^4^ejkii : :T]] JFL^L^ejkr:   past_key_valuepast_key_values4.58new_nameversionr=   position_embeddingsr   cache_positionr   rS   c           
      `   |j                   d d }g |d| j                  }| j                  |      j                  |      j	                  dd      }	| j                  |      j                  |      j	                  dd      }
| j                  |      j                  |      j	                  dd      }|\  }}t        |	|
||      \  }	}
|'|||d}|j                  |
|| j                  |      \  }
}t        }| j                  j                  dk7  rt        | j                  j                     } || |	|
||f| j                  sdn| j                  | j                   t#        | j                  dd       d|\  }} |j$                  g |d j'                         }| j)                  |      }||fS )	NrV   r!   r^   )r   r   r   eager        sliding_window)r   r   r   )r`   r   r   re   r   r   r   r   updater   r   r%   _attn_implementationr   ra   r   r   r   rt   r   r   )r7   r=   r   r   r   r   r   input_shapehidden_shapequery_statesr   r   r   r   cache_kwargsattention_interfacer   r   s                     r9   r?   zMixtralAttention.forward   s    $))#2.88b8$--8{{=166|DNNqRST[[/44\BLLQPQR
{{=166|DNNqRST&S#7jRUWZ#[ j&#&snUL'6'='=j,X\XfXfht'u$J(?;;++w6"9$++:Z:Z"[$7
%
  $}}C$2H2HLL"4;;0@$G
%
 
%
!\ *k));;;;FFHkk+.L((r:   )NN)r@   rA   rB   r   r"   intr+   r   rb   r   r   r   r
   
LongTensorr   r   r?   rC   rD   s   @r9   r   r      s    Gl} l l %0A6R ,059*)||*) #5<<#=>*) !.	*)
 "%*) !!1!12*) -.*) 
u||Xell33	4*) S*)r:   r   c                   D    e Zd Zdedef fdZ eddd      	 	 	 	 ddej                  d	e	ej                  ej                  f   d
e
ej                     de
ej                     de
e	ej                        de
ej                     dee   dej                  fd       Z xZS )MixtralDecoderLayerr%   r   c                    t         |           |j                  | _        t        ||      | _        t        |      | _        t        |j                  |j                        | _	        t        |j                  |j                        | _
        y )Nr   )r*   r+   r.   r   	self_attnrF   block_sparse_moer   rms_norm_epsinput_layernormpost_attention_layernormr   s      r9   r+   zMixtralDecoderLayer.__init__+  sm    !--)&)< 5f =-f.@.@fFYFYZ(6v7I7IvObOb(c%r:   r   r   r   r   r=   r   r   r   r   r   rS   c           
          |}| j                  |      } | j                  d||||||d|\  }}	||z   }|}| j                  |      }| j                  |      \  }}	||z   }|S )N)r=   r   r   r   r   r    )r   r   r   r   )
r7   r=   r   r   r   r   r   r   residualrR   s
             r9   r?   zMixtralDecoderLayer.forward5  s     !,,]; *4>> 
' 3)%+)
 
q !=0 !55mD00?q =0r:   )NNNN)r@   rA   rB   r"   r   r+   r   rb   r   r   r   r   r   r   FloatTensorr?   rC   rD   s   @r9   r   r   *  s    d} d d %0A6R
 26379=59 ||  #5<<#=>  !.	 
 u//0  "%"56  !!1!12  +,  
		  S r:   r   c                   ~     e Zd ZU ej                  ed<   ddef fdZ ej                         e	d               Z
 xZS )MixtralRotaryEmbeddinginv_freqr%   c                    t         |           t        |d      rUt        |j                  t
              r;|j                  j                  d|j                  j                  d            | _        nd| _        |j                  | _	        |j                  | _
        || _        t        | j                     | _        | j                  | j                  |      \  }| _        | j                  d|d       | j                   | _        y )Nrope_scaling	rope_typetypedefaultr   F)
persistent)r*   r+   hasattr
isinstancer   dictgetr   max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenr%   r   rope_init_fnattention_scalingregister_bufferr   original_inv_freq)r7   r%   r\   r   r8   s       r9   r+   zMixtralRotaryEmbedding.__init__\  s    6>*z&:M:Mt/T#0044[&BUBUBYBYZ`BabDN&DN"("@"@$*$B$B!/?+/+<+<T[[&+Q($(ZeD!%r:   c                 b   | j                   d d d d f   j                         j                  |j                  d   dd      j	                  |j
                        }|d d d d d f   j                         }t        |j
                  j                  t              r/|j
                  j                  dk7  r|j
                  j                  nd}t        j                  |d      5  |j                         |j                         z  j                  dd      }t        j                  ||fd	      }|j                         | j                  z  }|j                         | j                  z  }	d d d        j	                  |j                   
      	j	                  |j                   
      fS # 1 sw Y   AxY w)Nr   rV   r!   mpscpuF)device_typeenabledr^   rZ   )rY   )r   rh   r   r`   rk   r\   r   r   strrb   autocastr   r   r   r  r   rY   )
r7   r   r   inv_freq_expandedposition_ids_expandedr  freqsembr   r   s
             r9   r?   zMixtralRotaryEmbedding.forwardm  sV    !MM$4-8>>@GGHZHZ[\H]_acdehhijiqiqr ,QaZ 8 > > @'1!((--'E!((--[`J`ahhmmfk^^UC 	5&,,.1F1L1L1NNYYZ[]^_E))UEN3C'')d444C'')d444C		5 vvAGGv$cff177f&;;;	5 	5s    BF%%F.r<   )r@   rA   rB   rb   r   __annotations__r"   r+   no_gradr   r?   rC   rD   s   @r9   r   r   Y  s=    ll/} /" U]]_<  <r:   r   c                   \    e Zd ZU eed<   dZdZdgZdgZdZ	dZ
dZdZdZ eed      eed	Zy
)MixtralPreTrainedModelr%   modelTr   r   Fr!   )index)rx   r=   
attentionsN)r@   rA   rB   r"   r  base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdpa_supports_flex_attn_can_compile_fullgraph_supports_attention_backendr    rF   r   r   _can_record_outputsr   r:   r9   r  r  }  s\    &*#./#4"5N""&'(=QG,&r:   r  c                       e Zd Zdef fdZee	 	 	 	 	 	 	 ddeej                     deej                     deej                     dee   deej                     dee   d	eej                     d
ee   defd              Z xZS )MixtralModelr%   c           	         t         |   |       |j                  | _        |j                  | _        t        j                  |j                  |j                  | j                        | _        t        j                  t        |j                        D cg c]  }t        ||       c}      | _        t        |j                  |j                        | _        t#        |      | _        d| _        | j)                          y c c}w )Nr   r%   F)r*   r+   pad_token_idpadding_idx
vocab_sizer   	Embeddingr.   embed_tokensrM   rN   num_hidden_layersr   layersr   r   normr   
rotary_embgradient_checkpointing	post_initr   s      r9   r+   zMixtralModel.__init__  s     !.. ++LL):):F<N<NPTP`P`ammEJ6KcKcEde	 3e
 #6#5#56;N;NO	0?&+# 	 fs   D	input_idsr   r   r   inputs_embeds	use_cacher   r   rS   c                    |d u |d uz  rt        d      |r|t        | j                        }|| j                  |      }|F||j	                         nd}	t        j                  |	|	|j                  d   z   |j                        }||j                  d      }| j                  j                  t        nt        }
 |
| j                  |||||      }|}| j                  ||      }| j                  d | j                  j                   D ]  } ||f||||||d|} | j!                  |      }t#        ||      S )	Nz:You must specify exactly one of input_ids or inputs_embedsr'  r   r!   )r\   )r%   input_embedsr   r   r   r   )r   r   r   r   r5  r   )last_hidden_stater   )
ValueErrorr   r%   r,  get_seq_lengthrb   aranger`   r\   r   r   r   r   r0  r.  r-  r/  r   )r7   r3  r   r   r   r4  r5  r   r   past_seen_tokensmask_functionr   r=   r   decoder_layers                  r9   r?   zMixtralModel.forward  sx    -t";<YZZ0*$++>O  --i8M!CRC^==?de"\\ "2]5H5H5K"KTaThThN )33A6L.2kk.H.H.P*Vw#;;&))+%
 & #oom\J![[)H4;;+H+HI 
	M)	$7*) /#-	 	M
	 		-0%++
 	
r:   )NNNNNNN)r@   rA   rB   r"   r+   r   r   r   rb   r   r   r
   r   boolr   r   r   r?   rC   rD   s   @r9   r%  r%    s    }    151537+/59$(59<
E,,-<
 !.<
 u//0	<

 "%<
   1 12<
 D><
 !!1!12<
 +,<
 
 <
  <
r:   r%  gate_logitsrI   c                    | t        | t              syt        | t              rC| d   j                  }t        j                  | D cg c]  }|j                  |       c}d      }t        j                  j                  j                  d      }t        j                  ||d      \  }}	t        j                  j                  j                  |	|      }
|>t        j                  |
j                         d      }t        j                  |d      }n|j                  \  }}|j                  d   ||z  z  }|dddddddf   j                  |||||f      j                  d||      j                        }t        j                   |
j                         |z  d      t        j                   |d      z  }|ddddddf   j                  ||||f      j                  d|      j                  |      }t        j                   ||z  d      t        j                   |d      z  }t        j                   ||j#                  d      z        }||z  S c c}w )a  
    Computes auxiliary load balancing loss as in Switch Transformer - implemented in Pytorch.

    See Switch Transformer (https://huggingface.co/papers/2101.03961) for more details. This function implements the loss
    function presented in equations (4) - (6) of the paper. It aims at penalizing cases where the routing between
    experts is too unbalanced.

    Args:
        gate_logits:
            Logits from the `gate`, should be a tuple of model.config.num_hidden_layers tensors of
            shape [batch_size X sequence_length, num_experts].
        num_experts:
            Number of experts
        top_k:
            The number of experts to route per-token, can be also interpreted as the `top-k` routing
            parameter.
        attention_mask (`torch.Tensor`, *optional*):
            The attention_mask used in forward function
            shape [batch_size X sequence_length] if not None.

    Returns:
        The auxiliary loss.
    Nr   rZ   rV   )r   r   r\   rb   r   rk   r   rm   rg   ri   rn   r   rh   r`   r   rt   rj   r   )r@  rI   rK   r   compute_device
layer_gateconcatenated_gate_logitsry   rR   rz   r|   tokens_per_expertrouter_prob_per_expertrv   rw   r-  expert_attention_mask router_per_expert_attention_maskoverall_losss                      r9   load_balancing_loss_funcrJ    s9   : *[%"@+u%$Q..#(99^i-jPZjmmN.K-jpq#r hh))112JPR1SO**_eDA((%%--.>LK!JJ{'8'8':B "'O!C&4&:&:#
O4::1=*B^_ 4AtT12V&
OUKXYWR,R	 	 "IIk&7&7&9<Q&QWXY\a\e\e!q]
 
 4At+,V&
O[QRWR%R	 	) "'?=]+]cd!ehmhqhq,!i
 "
 99.1G1Q1QRS1TTUL+%%[ .ks   Ic                   p    e Zd ZdgZddiZddgdgfiZ fdZee	 	 	 	 	 	 	 	 	 	 dde	e
j                     de	e
j                     d	e	e
j                     d
e	e   de	e
j                     de	e
j                     de	e   de	e   de	e
j                     deee
j                  f   dee   defd              Z xZS )MixtralForCausalLMzlm_head.weightlm_headcolwise_repr=   logitsc                 N   t         |   |       t        |      | _        |j                  | _        t        j                  |j                  |j                  d      | _        |j                  | _	        |j                  | _        |j                  | _        | j                          y r'   )r*   r+   r%  r  r*  r   r0   r.   rM  router_aux_loss_coefrH   rI   rJ   r2  r6   s     r9   r+   zMixtralForCausalLM.__init__;  s     !&)
 ++yy!3!3V5F5FUS$*$?$?!!33#)#=#=  	r:   r3  r   r   r   r4  labelsr5  output_router_logitsr   logits_to_keepr   rS   c                 l   ||n| j                   j                  } | j                  d||||||||	d|}|j                  }t	        |
t
              rt        |
 d      n|
}| j                  |dd|ddf         }d}| | j                  ||| j                  fi |}d}|rYt        |j                  | j                  | j                  |      }|+|| j                  |j                  |j                         z  z  }t#        ||||j$                  |j&                  |j(                  |j                        S )a~  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, MixtralForCausalLM

        >>> model = MixtralForCausalLM.from_pretrained("mistralai/Mixtral-8x7B-v0.1")
        >>> tokenizer = AutoTokenizer.from_pretrained("mistralai/Mixtral-8x7B-v0.1")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```N)r3  r   r   r   r4  r5  rS  r   )lossaux_lossrO  r   r=   r  rx   r   )r%   rS  r  r8  r   r   slicerM  loss_functionr*  rJ  rx   rI   rJ   rQ  rk   r\   r   r   r=   r  )r7   r3  r   r   r   r4  rR  r5  rS  r   rT  r   outputsr=   slice_indicesrO  rV  rW  s                     r9   r?   zMixtralForCausalLM.forwardG  sX   P %9$D $++JjJj 	
 +5$** 
+
)%+'!5)
+
 
+
  118B>SV8W~ot4]kmA}a,?@A%4%%ffdooPPD/%%  ((	H !11HKK4LLL(#33!//))!//
 	
r:   )
NNNNNNNNNr   )r@   rA   rB   _tied_weights_keys_tp_plan_pp_planr+   r   r   r   rb   r   r   r
   r   r?  r   r   r   r   r   r?   rC   rD   s   @r9   rL  rL  5  sO   *+=)H_-z:;H
  151537+/59-1$(/35934R
E,,-R
 !.R
 u//0	R

 "%R
   1 12R
 ))*R
 D>R
 'tnR
 !!1!12R
 c5<</0R
 +,R
 
#R
  R
r:   rL  c                       e Zd Zy) MixtralForSequenceClassificationNr@   rA   rB   r   r:   r9   r`  r`        r:   r`  c                       e Zd Zy)MixtralForTokenClassificationNra  r   r:   r9   rd  rd    rb  r:   rd  c                       e Zd Zy)MixtralForQuestionAnsweringNra  r   r:   r9   rf  rf    rb  r:   rf  )rL  rf  r%  r  r`  rd  )Nr!   )r   )Nr^   N)Jtypingr   r   r   rb   torch.nn.functionalr   rm   rf   transformers.utils.genericr   activationsr	   cache_utilsr
   r   
generationr   integrationsr   masking_utilsr   r   modeling_flash_attention_utilsr   modeling_layersr   r   r   r   modeling_outputsr   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   utils.deprecationr   utils.genericr    configuration_mixtralr"   Moduler$   rF   r   r   r   r   r   r   rh   r   r   r   r   r  r%  r   rJ  rL  r`  rd  rf  __all__r   r:   r9   <module>r{     sw  6 - ,     9 ! . ) 7 R B  R K F & I I 0 + 0%		 %$@2BII @2F Y'JRYY J (J((6	UU\\ 	U# 	U%,, 	U& %II%<<% 
% <<	%
 U\\*% % % '(%4<)ryy <)~,4 ,^!<RYY !<H _  $ O
) O
 O
h "&
-1	O&u||U5<<%8$>?O&#O& U\\*	O&
 5<<O&d e
/ e
 e
P	'GI_ 		$ACY 		"=?U 	r:   