
    h%                    |   d dl mZmZmZmZmZ d dlZd dlmZ d dlm	Z	 ddl
mZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZmZ ddlmZmZ ddlmZmZ ddlmZ ddlm Z m!Z!m"Z"m#Z# ddl$m%Z% ddl&m'Z'm(Z( ddl)m*Z*  e(       rd dl+m,Z, d dl-m.Z.m/Z/ ndZ, e'       r	d dl0m1Z1m2Z2 nd\  Z2Z1 e#jf                  e4      Z5 G d ded      Z6 G d d      Z7 G d dejp                        Z9d  Z:d!ejv                  d"e<d#ejv                  fd$Z=	 dId%ejp                  d&ejv                  d'ejv                  d(ejv                  d)eejv                     d*e>d+e>d,ee    fd-Z?dJd.Z@ G d/ d0ejp                        ZA G d1 d2ej                  jp                        ZBd3ejv                  d4e<fd5ZCd6 ZDd7 ZE eFe,e1e2f      ZGd8 ZH G d9 d:ejp                        ZI G d; d<ejp                        ZJ ed=       G d> d?ejp                               ZK G d@ dAe      ZLe! G dB dCe             ZMe! G dD dEeM             ZNe! G dF dGeMe             ZOg dHZPy)K    )AnyCallableOptional	TypedDictUnionN)nn)ACT2FN   )Cache)GenerationMixin)use_kernel_forward_from_hub)AttentionMaskConverter)GradientCheckpointingLayer)BaseModelOutputWithPastCausalLMOutputWithPast)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tuplelogging)deprecate_kwarg)is_causal_conv1d_availableis_mamba_2_ssm_available   )BambaConfig)selective_state_update)mamba_chunk_scan_combined mamba_split_conv1d_scan_combined)causal_conv1d_fncausal_conv1d_updateNNc                       e Zd ZU dZej
                  ed<   ej
                  ed<   eed<   eed<   ej                  ed<   y)BambaFlashAttentionKwargsa  
    Keyword arguments for advanced Flash Attention, causal-conv1d, and mamba_ssm kernel usage.
    Use cases include padding-free training and fewer `torch.compile` graph breaks.

    Attributes:
        cu_seq_lens_q (`torch.LongTensor`)
            Gets cumulative sequence length for query state.
        cu_seq_lens_k (`torch.LongTensor`)
            Gets cumulative sequence length for key state.
        max_length_q (`int`):
            Maximum sequence length for query state.
        max_length_k (`int`):
            Maximum sequence length for key state.
        seq_idx (`torch.IntTensor):
            Index of each packed sequence.
    cu_seq_lens_qcu_seq_lens_kmax_length_qmax_length_kseq_idxN)	__name__
__module____qualname____doc__torch
LongTensor__annotations__int	IntTensor     f/var/www/html/eduruby.in/venv/lib/python3.12/site-packages/transformers/models/bamba/modeling_bamba.pyr'   r'   @   s7    " ######__r7   r'   F)totalc                       e Zd ZdZdZej                  dfdefdZ	 ddej                  dej                  de
d	eeeef      d
eej                  ej                  f   f
dZdej"                  fdZddee
   d
e
fdZy) HybridMambaAttentionDynamicCachea  
    A dynamic cache that can handle both the attention cache (which has a seq_len dimension) and the mamba cache
    (which has a constant shape regardless of seq_len).

    This cache has two sets of lists of tensors: `key_cache` and `value_cache` for attention cache and `conv_states`
    and `ssm_states` for mamba cache. Each of these lists has `num_layers` tensors. The expected shape for each tensor
    For attention layers, `key_cache` and `value_cache` have a shape of `(batch_size, num_heads, seq_len, head_dim)`,
    while `conv_states` and `ssm_states` have a shape of `(batch_size, 0)` (empty tensors).
    For mamba layers, `key_cache` and `value_cache` have a shape of `(batch_size, 0)` (empty tensors),
    while `conv_states` represents the convolution state and has a shape of `(batch_size, d_inner, d_conv)`,
    and `ssm_states` represents the ssm state and has a shape of `(batch_size, d_inner, d_state)`.
    FNconfigc                 ,   |j                   | _         d| _        |j                  }|j                  }g | _        g | _        g | _        t        |j                        D ]*  }| j                   |   dk(  r| xj                  t        j                  ||j                  |j                  z  d|j                  z  |z  z   |||      gz  c_        | xj
                  t        j                  ||j                  |j                  |||      gz  c_        | xj                  t        j                   g g|z  |      gz  c_        | xj
                  t        j                   g g|z  |      gz  c_        | j                  j#                  |       - t        |j                        D cg c]  }t        j                   g g|z  |       c}| _        t        |j                        D cg c]  }t        j                   g g|z  |       c}| _        y c c}w c c}w )NFmamba   devicedtyperA   )layers_block_typehas_previous_statemamba_d_convmamba_d_stateconv_states
ssm_statestransformer_layersrangenum_hidden_layersr1   zerosmamba_expandhidden_sizemamba_n_groupsmamba_n_headsmamba_d_headtensorappend	key_cachevalue_cache)	selfr<   
batch_sizerB   rA   conv_kernel_sizessm_state_sizei_s	            r8   __init__z)HybridMambaAttentionDynamicCache.__init__i   s   !'!9!9"'!..--"$v//0 	2A%%a(G3  KK",,v/A/AAAH]H]D]`nDnn(%#%   KK",,++&%#	$ 	   U\\2$2CF%S$TT ELL"
1B6$R#SS''..q11	24 SXX^XpXpRqrQ%,,tj'8HrTYZ`ZrZrTstqELL"
):6Jt sts    "H!"H
key_statesvalue_states	layer_idxcache_kwargsreturnc                    | j                   |   j                  d   dk(  r|| j                   |<   || j                  |<   nft        j                  | j                   |   |gd      | j                   |<   t        j                  | j                  |   |gd      | j                  |<   | j                   |   | j                  |   fS )Nr   r?   dim)rU   shaperV   r1   cat)rW   r^   r_   r`   ra   s        r8   updatez'HybridMambaAttentionDynamicCache.update   s     >>)$**2.!3(2DNN9%*6DY'(-		4>>)3Lj2Y_`(aDNN9%*/))T5E5Ei5PR^4_ef*gDY'~~i($*:*:9*EEEr7   beam_idxc                    t        t        | j                              D ]S  }| j                  |   j                  }| j                  |   j	                  d|j                  |            | j                  |<   | j                  |   j                  }| j                  |   j	                  d|j                  |            | j                  |<   | j                  |   j                  }| j                  |   j	                  d|j                  |            | j                  |<   | j                  |   j                  }| j                  |   j	                  d|j                  |            | j                  |<   V y)zDReorders the cache for beam search, given the selected beam indices.r   N)	rK   lenrU   rA   index_selecttorV   rH   rI   )rW   rj   r`   rA   s       r8   reorder_cachez.HybridMambaAttentionDynamicCache.reorder_cache   sD   s4>>23 		iI^^I.55F(,y(A(N(NqRZR]R]^dRe(fDNN9%%%i077F*.*:*:9*E*R*RSTV^VaVabhVi*jDY'%%i077F*.*:*:9*E*R*RSTV^VaVabhVi*jDY'__Y/66F)-)C)P)PQRT\T_T_`fTg)hDOOI&		ir7   c                     || j                   vr| j                   d   n|}t        | j                        |k  ry| j                  |   j                  d   S )zYReturns the sequence length of the cached states. A layer index can be optionally passed.r   )rJ   rl   rU   rg   )rW   r`   s     r8   get_seq_lengthz/HybridMambaAttentionDynamicCache.get_seq_length   sR     3<4CZCZ2ZD++A.`i	t~~)+~~i(..r22r7   N)r   )r-   r.   r/   r0   is_compileabler1   float16r   r]   Tensorr4   r   dictstrr   tupleri   r2   ro   rr   r6   r7   r8   r;   r;   Y   s     N>CmmTX $u{ $uV 26FLLF llF 	F
 tCH~.F 
u||U\\)	*F"ie&6&6 i3 3c 3r7   r;   c                   ~     e Zd ZU ej                  ed<   ddef fdZ ej                         e	d               Z
 xZS )BambaRotaryEmbeddinginv_freqr<   c                    t         |           t        |d      rUt        |j                  t
              r;|j                  j                  d|j                  j                  d            | _        nd| _        |j                  | _	        |j                  | _
        || _        t        | j                     | _        | j                  | j                  |      \  }| _        | j                  d|d       | j                   | _        y )Nrope_scaling	rope_typetypedefaultr|   F)
persistent)superr]   hasattr
isinstancer~   rw   getr   max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenr<   r   rope_init_fnattention_scalingregister_bufferr|   original_inv_freq)rW   r<   rA   r|   	__class__s       r8   r]   zBambaRotaryEmbedding.__init__   s    6>*z&:M:Mt/T#0044[&BUBUBYBYZ`BabDN&DN"("@"@$*$B$B!/?+/+<+<T[[&+Q($(ZeD!%r7   c                 b   | j                   d d d d f   j                         j                  |j                  d   dd      j	                  |j
                        }|d d d d d f   j                         }t        |j
                  j                  t              r/|j
                  j                  dk7  r|j
                  j                  nd}t        j                  |d      5  |j                         |j                         z  j                  dd      }t        j                  ||fd	      }|j                         | j                  z  }|j                         | j                  z  }	d d d        j	                  |j                   
      	j	                  |j                   
      fS # 1 sw Y   AxY w)Nr   rd   r   mpscpuF)device_typeenabledr?   re   rB   )r|   floatexpandrg   rn   rA   r   r   rx   r1   autocast	transposerh   cosr   sinrB   )
rW   xposition_idsinv_freq_expandedposition_ids_expandedr   freqsembr   r   s
             r8   forwardzBambaRotaryEmbedding.forward   sV    !MM$4-8>>@GGHZHZ[\H]_acdehhijiqiqr ,QaZ 8 > > @'1!((--'E!((--[`J`ahhmmfk^^UC 	5&,,.1F1L1L1NNYYZ[]^_E))UEN3C'')d444C'')d444C		5 vvAGGv$cff177f&;;;	5 	5s    BF%%F.rs   )r-   r.   r/   r1   rv   r3   r   r]   no_gradr   r   __classcell__r   s   @r8   r{   r{      s=    ll/{ /" U]]_<  <r7   r{   c                     | dd| j                   d   dz  f   }| d| j                   d   dz  df   }t        j                  | |fd      S )z*Rotates half the hidden dims of the input..Nrd   r?   re   )rg   r1   rh   )r   x1x2s      r8   rotate_halfr      sZ    	
3"!''"+"""	#B	
3q ""	#B99rc2YB''r7   hidden_statesn_reprb   c                     | j                   \  }}}}|dk(  r| S | dddddddddf   j                  |||||      } | j                  |||z  ||      S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    r   N)rg   r   reshape)r   r   batchnum_key_value_headsslenhead_dims         r8   	repeat_kvr      so    
 2?1D1D.Ehz!!Qa"23::5BUW\^bdlmM  (;e(CT8TTr7   modulequerykeyvalueattention_maskscalingdropoutkwargsc                 T   t        || j                        }t        || j                        }	t        j                  ||j	                  dd            |z  }
|#|d d d d d d d |j
                  d   f   }|
|z   }
t        j                  j                  |
dt        j                        j                  |j                        }
t        j                  j                  |
|| j                        }
t        j                  |
|	      }|j	                  dd      j                         }||
fS )Nr?   r
   rq   rd   )rf   rB   )ptrainingr   )r   num_key_value_groupsr1   matmulr   rg   r   
functionalsoftmaxfloat32rn   rB   r   r   
contiguous)r   r   r   r   r   r   r   r   r^   r_   attn_weightscausal_maskattn_outputs                r8   eager_attention_forwardr      s    3 ; ;<JUF$?$?@L<<z';';Aq'ABWLL!$Q1.D
0@0@0D.D%DE#k1==((2U]](SVVW\WbWbcL==((6??([L,,|\:K''1-88:K$$r7   c                 h   |j                  |      }|j                  |      }|j                  d   }| dd|f   | d|df   }}|dd|f   |d|df   }
}	||z  t        |      |z  z   }|	|z  t        |	      |z  z   }t        j                  ||gd      }t        j                  ||
gd      }||fS )a  Applies Rotary Position Embedding to the query and key tensors.

    Removes the interleaving of cos and sin from GLM

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    rd   .Nre   )	unsqueezerg   r   r1   rh   )qkr   r   r   unsqueeze_dim
rotary_dimq_rotq_passk_rotk_passq_embedk_embeds                r8   apply_rotary_pos_embr     s    , --
&C
--
&C 2Jc;J;&'3
+;)<6Ec;J;&'3
+;)<6E s{{51C78Gs{{51C78G ii&)r2Gii&)r2GGr7   c                   *    e Zd ZdZdedef fdZ eddd      	 	 dd	ej                  d
e
ej                  ej                  f   deej                     dee   deej                     dee   de
ej                  ej                  f   fd       Z xZS )BambaAttentionz=Multi-headed attention from 'Attention Is All You Need' paperr<   r`   c                 d   t         |           || _        || _        t	        |d|j
                  |j                  z        | _        |j                  |j                  z  | _	        | j                  dz  | _
        |j                  | _        d| _        t        j                  |j
                  |j                  | j                  z  |j                        | _        t        j                  |j
                  |j                  | j                  z  |j                        | _        t        j                  |j
                  |j                  | j                  z  |j                        | _        t        j                  |j                  | j                  z  |j
                  |j                        | _        y )Nr   g      Tbias)r   r]   r<   r`   getattrrO   num_attention_headsr   r   r   r   attention_dropout	is_causalr   Linearattention_biasq_projk_projv_projo_proj)rW   r<   r`   r   s      r8   r]   zBambaAttention.__init__3  sM   "
F4F4F&JdJd4de$*$>$>&B\B\$\!}}d*!'!9!9ii : :T]] JQWQfQf
 ii : :T]] JQWQfQf
 ii : :T]] JQWQfQf
 ii&&68J8JQWQfQf
r7   past_key_valuepast_key_values4.58new_nameversionr   position_embeddingsr   cache_positionr   rb   c                 4   |j                   d d }g |d| j                  }| j                  |      j                  |      j	                  dd      }	| j                  |      j                  |      j	                  dd      }
| j                  |      j                  |      j	                  dd      }|\  }}t        |	|
||      \  }	}
|'|||d}|j                  |
|| j                  |      \  }
}t        }| j                  j                  dk7  rt        | j                  j                     } || |	|
||f| j                  sdn| j                  | j                   d|\  }} |j"                  g |d j%                         }| j'                  |      }||fS )Nrd   r   r?   )r   r   r   eager        )r   r   )rg   r   r   viewr   r   r   r   ri   r`   r   r<   _attn_implementationr   r   r   r   r   r   r   )rW   r   r   r   r   r   r   input_shapehidden_shapequery_statesr^   r_   r   r   ra   attention_interfacer   r   s                     r8   r   zBambaAttention.forwardJ  s    $))#2.88b8$--8{{=166|DNNqRST[[/44\BLLQPQR
{{=166|DNNqRST&S#7jRUWZ#[ j&#&snUL'6'='=j,X\XfXfht'u$J(?;;++w6"9$++:Z:Z"[$7	%
  $}}C$2H2HLL	%
 	%
!\ *k));;;;FFHkk+.L((r7   r%   )r-   r.   r/   r0   r   r4   r]   r   r1   rv   ry   r   r   r2   r   r   r   r   r   s   @r8   r   r   0  s    G
{ 
s 
. %0A6R ,059))||)) #5<<#=>)) !.	))
 "%)) !!1!12)) +,)) 
u||U\\)	*)) S))r7   r   c                   (     e Zd Zd fd	ZddZ xZS )BambaRMSNormGatedc                     t         |           t        j                  t	        j
                  |            | _        || _        y rs   r   r]   r   	Parameterr1   onesweightvariance_epsilonrW   rO   epsr   s      r8   r]   zBambaRMSNormGated.__init__x  s/    ll5::k#:; #r7   c                    |j                   }|j                  t        j                        }|?|t        j
                  j                  |j                  t        j                              z  }|j                  d      j                  dd      }|t        j                  || j                  z         z  }| j                  |j                  |      z  S Nr?   rd   T)keepdim)rB   rn   r1   r   r   r   silupowmeanrsqrtr   r   )rW   r   gateinput_dtypevariances        r8   r   zBambaRMSNormGated.forward}  s    #))%((7)BMM,>,>twwu}}?U,VVM $$Q',,R,>%Ht?T?T4T(UU{{]--k:::r7   gư>rs   r-   r.   r/   r]   r   r   r   s   @r8   r   r   w  s    $
	;r7   r   input_tensorpad_sizec                     t        | j                        dk(  r
ddddd|ddfnddd|ddf}t        j                  j                  j                  | |dd      S )z
    Padding x tensor with `pad_size` on the seq_len dim (dim=1)

    Assumes that we only have tensors of either size 4 or 3
       r   constant)moder   )rl   rg   r1   r   r   pad)r
  r  	pad_shapes      r8   pad_tensor_by_sizer    sf     47|7I7I3Ja3OAq!Q!Q/VWYZ\]_gijlmUnI88""<ST"UUr7   c                    t        | |      } t        | j                        dk(  r.| j                  | j                  d   d|| j                  d         S | j                  | j                  d   d|| j                  d   | j                  d         S )z
    Padding input_tensor with `pad_size` on the seq_len dim (dim=1) and
    simultaneously splitting it into chunk sequences.

    Assumes that we only have tensors of either size 4 or 3
    r
   r   rd   r?   )r  rl   rg   r   )r
  r  
chunk_sizes      r8   reshape_into_chunksr    s     &lH=L
<!###L$6$6q$92z<K]K]^_K`aa ##q!2z<3E3Ea3H,J\J\]^J_
 	
r7   c                 "   | j                  d      } | d   j                  g | j                         | } t        j                  t        j                  ||| j
                  t        j                        d      }| j                  | d      } t        j                  | d      }t        j                  t        j                  ||| j
                  t        j                        d      }|j                  | t        j                         }|S )zo
    More stable segment sum calculation. Uses cumulative sums and masking instead of direct subtractions.
    rd   .Nr@   diagonalr   rq   re   )
sizer   r1   trilr   rA   boolmasked_fillcumsuminf)r
  r  masktensor_segsums       r8   segment_sumr"    s     ""2&J 2<	*11S<3D3D3FS
SL::ejjZ@S@S[`[e[efqstD++TE15LLL26M ::ejjZ@S@S[`[e[efqrsD!--teeiiZ@Mr7   c                     |N|j                   d   dkD  r<|j                   d   dkD  r*| j                  }| |dddddf   z  j                  |      } | S )zm
    Tunes out the hidden states for padding tokens, see https://github.com/state-spaces/mamba/issues/66
    Nr   r   )rg   rB   rn   )r   r   rB   s      r8   apply_mask_to_padding_statesr$    sa     !n&:&:1&=&AnFZFZ[\F]`aFa##&1d
)CCGGNr7   c                       e Zd ZdZdedef fdZ	 	 	 	 ddej                  de	e
   de	ej                     de	ej                     d	e	ej                     f
d
Z	 	 	 dde	e
   de	ej                     de	ej                     fdZ	 	 	 	 dde	e
   de	ej                     de	ej                     d	e	ej                     fdZ xZS )
BambaMixeruP  
    Compute ∆, A, B, C, and D the state space parameters and compute the `contextualized_states`.
    A, D are input independent (see Mamba paper [1] Section 3.5.2 "Interpretation of A" for why A isn't selective)
    ∆, B, C are input-dependent (this is a key difference between Mamba and the linear time invariant S4,
    and is why Mamba is called **selective** state spaces)

    The are a few differences between this and Mamba2Mixer:
    - The variable use_precomputed_states is slightly different due to the hybrid cache structure
    - There's a few non-obvious bugs fixed with batching in the slow path that exist in main
    - Some extra variables that our layer doesn't need have been removed
    - We ported most of the refactors in https://github.com/huggingface/transformers/pull/35154, which is (as of Dec 18, 2024) unmerged
    r<   r`   c           	         t         |           |j                  | _        |j                  | _        |j
                  | _        |j                  | _        t        |j                  | j                  z        | _        || _        |j                  | _        |j                  | _        t"        |j                     | _        |j&                  | _        |j*                  | _        |j.                  | _        |j2                  | _        |j6                  | _        dt;        d      f| _        d| _        d| _         | j                  d| j0                  z  | j                  z  z   | _!        tE        jF                  | jB                  | jB                  |j                  | j                  | jB                  | j                  dz
        | _$        | j                  | jB                  z   | j                  z   }tE        jJ                  | j                  || j(                        | _&        tE        jN                  tQ        jR                  | j                              | _*        tQ        jV                  d| j                  dz         }tE        jN                  tQ        jX                  |            | _-        d	| jZ                  _.        t_        | j                  | j,                  
      | _0        tE        jN                  tQ        jR                  | j                              | _1        d	| jb                  _.        tE        jJ                  | j                  | j                  | j(                        | _2        tf        sth        jk                  d       y th        jk                  d       y )Nr   r  gMbP?g?r?   r   )in_channelsout_channelsr   kernel_sizegroupspaddingr   Tr   a  The fast path is not available because on of `(selective_state_update, causal_conv1d_fn, causal_conv1d_update)` is None. Falling back to the naive implementation. To install follow https://github.com/state-spaces/mamba/#installation and https://github.com/Dao-AILab/causal-conv1dzDThe fast path for Bamba will be used when running the model on a GPU)6r   r]   rQ   	num_headsrO   rG   rZ   rF   rY   r4   rN   intermediate_sizer`   mamba_conv_biasuse_conv_bias
hidden_act
activationr	   actmamba_proj_biasuse_biasrms_norm_epslayer_norm_epsilonrP   n_groupsrR   r   mamba_chunk_sizer  r   time_step_limittime_step_mintime_step_maxconv_dimr   Conv1dconv1dr   in_projr   r1   r   dt_biasarangelogA_log_no_weight_decayr   normDout_projis_fast_path_availableloggerwarning_once)rW   r<   r`   projection_sizeAr   s        r8   r]   zBambaMixer.__init__  s   --!--$22 & 3 3!$V%8%84;K;K%K!L"#33 ++&++,.."("5"5--++ 11 !$U5\2" ..T]]1BTEXEX1XXii''--==))A-
 004==@4>>Qyy
 ||EJJt~~$>? LLDNNQ./\\%))A,/
&*

#%d&<&<$BYBYZ	ejj89"&		$"8"8$:J:JQUQ^Q^_%>  fgr7   r   cache_paramsr   r   r,   c                 P   t        ||      }| j                  |      }|j                  \  }}}	| j                  | j                  z  }
|d uxr} |j
                  xro |dk(  xrh |j                  | j                     j                  d   |j                  | j                     j                  d   cxk(  xr |k(  nc xr |d uxr |d   dkD  }|r|j                  d      j                  | j                  | j                  | j                  gd      \  }}}t        ||j                  | j                     | j                  j                   j                  d      | j                  j"                  | j$                        }t'        j                  || j                  |
|
gd      \  }}}t'        j(                  | j*                  j-                                }|d d d df   d d d d d f   j/                  d| j0                  | j                        j3                  t&        j4                        }|d d d d d f   j/                  dd| j0                        }| j6                  d d d df   j/                  d| j0                        }| j8                  d d d df   j/                  d| j0                        }|j;                  || j                  |j                  d   | j                  z        }|j;                  || j                  |j                  d   | j                  z        }|j;                  || j                  | j0                        }t=        |j                  | j                     ||||||d |d
      }|j;                  || j                  | j0                  z        }| j?                  ||      }| jA                  |      d d d df   }|S t'        j(                  | j*                  j-                                }| jB                  d	t-        d
      fk(  ri nd| jB                  i}| jD                  r|tG        || j                  j                   j                  d      | j                  j"                  | j6                  |f| j8                  | jH                  || j$                  | j>                  j                   | j>                  jJ                  | j@                  j                   | j@                  j"                  | j0                  | j                  ddd|}|S |j                  | j                  | j                  | j                  gd      \  }}}|v|jM                  dd      }tN        jP                  jS                  || jT                  |j                  d   z
  df      }|j                  | j                     jW                  |       | j$                  dvrH| jY                  | j                  |jM                  dd            dd |f   jM                  dd            }nqt[        |jM                  dd      | j                  j                   j                  d      | j                  j"                  | j$                  |      jM                  dd      }t        ||      }t'        j                  || j                  |
|
gd      \  }}}t]        |j;                  ||d| j0                        |||j;                  ||| j                  d      |j;                  ||| j                  d      f| jH                  | j8                  d |d| j6                  dd|\  }}|*|(|j                  | j                     jW                  |       |j;                  ||d      }| j?                  ||      }| jA                  |      }|S )Nr   r   rd   re   .r   T)zrB  dt_softplusr   r  dt_limitF)rH  r  r,   r3  rmsnorm_weightrmsnorm_epsoutproj_weightoutproj_biasheaddimngroupsnorm_before_gatereturn_final_statesr?   )r  swish)r   r   r   r3  r,   )r  rH  rQ  r,   r[  rB  rR  )/r$  rA  rg   r9  rZ   rE   rH   r`   rI   squeezesplitr/  r>  r.  r$   r@  r   r   r3  r1   exprE  r   r   r   rn   r   rB  rH  r   r    rG  rI  r;  r   r"   r  r   r   r   r   r  rY   copy_r4  r#   r!   )rW   r   rO  r   r   r,   projected_statesrX   seq_lenr\   groups_time_state_sizeuse_precomputed_statesr  hidden_states_B_CdtBCrN  rB  rH  hidden_states_reshapedoutdt_limit_kwargshidden_states_B_C_transposedrH   scan_output	ssm_states                              r8   cuda_kernels_forwardzBambaMixer.cuda_kernels_forward  s    5]NS<<6 "/!4!4
GQ!%1D1D!D $ &//&1& ((8>>qA&&t~~6<<Q?& d*& q!A% 	 "*:*B*B1*E*K*K''GR +L +'D#R
 !5!((8""**1-  ! #(++!'')?AWX#M1a 4::++-..A!T3,1d
+222t}}dFYFYZ]]didqdq]rAAq$J&&r2t}}=Bll1dC<077DMMJGq$|$++B>Az4==!''!*2MNAz4==!''!*2MNA%2%7%7
DNNTXTaTa%b"2''7& M *..z4>>DMM;YZM IImT:M --.q$|<C| 
w 4::++-..A$($8$8S%,<O$ObV`bfbvbvUwO }}!56$KK&&..q1KK$$LL ff####'99#3#3 $		 : :#'==#7#7!%!3!3 MM MM%*(-#$ &%l 
A /?.D.D++T]]DNNKQS /E /+'  + 4E3N3NqRS3T0"$--"3"34..1M1S1STV1WWYZ[#K !,,T^^<BB;O??*;;(,$5$?$?1$EFsHWH}U__`acde)% )9+55a;#{{1199!<![[--#'?? ')  i1o & %AARTb$c!&+kk%++-CE[\'#q! *C!&&z7BNFF:wrBFF:wrB*  $ff#(, LL $* &*&Y" (\-E ++DNN;AA)L)..z7BG"iiT: mmK0
r7   c                    |j                   \  }}}|j                  }t        ||      }| j                  |      }	|	j	                  | j
                  | j                  | j                  gd      \  }
}}|d uxr} |j                  xro |dk(  xrh |j                  | j                     j                   d   |j                  | j                     j                   d   cxk(  xr |k(  nc xr |d uxr |d   dkD  }|rY|j                  | j                     j                  dd      |j                  | j                  <   |d d dd d f   j                  |j                  | j                     j                        |j                  | j                     d d d d df<   |j                  | j                     j                  | j                  j                   j                        }t#        j$                  || j                  j                   j'                  d      z  d      }| j(                  r|| j                  j*                  z   }| j-                  |      }n|v|j/                  dd      }t0        j2                  j5                  || j6                  |j                   d   z
  df      }|j                  | j                     j9                  |       | j-                  | j                  |j/                  dd            dd |f   j/                  dd            }t        ||      }t#        j                  || j
                  | j:                  | j<                  z  | j:                  | j<                  z  gd      \  }}}t#        j>                  | j@                  jC                                }|r|j                  | j                     j                  }|d d dd d f   d d d df   }|j/                  dd      jE                  ||j                   d   | jF                        }| jH                  d	   jE                  | jH                  j                   d   | jF                        }t"        j0                  j2                  jK                  ||j                  |j                        z         }t#        jL                  || jN                  d   | jN                  d         }|d
   jE                  | j                  | jF                  | j<                        j                  t"        jP                        }t#        j>                  |d	   |z        j                  |      }|jS                  || j:                  d      dd d d f   }|jE                  || j:                  | j                  | j:                  z  |j                   d         jU                         }|jS                  |d|j                   d         }|d	   |dd d d f   z  }|jS                  |d| jF                        }||d	   z  j                  |      }|j                  | j                     j9                  |j                  | j                     |z  |z          |jS                  || j:                  d      dd d d f   }|jE                  || j:                  | j                  | j:                  z  |j                   d         jU                         }|jS                  |d|j                   d         }|j                  | j                     j                  |j                  |j                        }|jW                  || j                  z  | jF                  | j<                        }|jW                  || j                  z  | j<                  d      }t#        jX                  ||      }|jW                  || j                  | jF                        }| jZ                  d	   jE                  | jZ                  j                   d   | jF                        }|||z  z   j                  |j                        }|jS                  |d      d d d df   }nt0        j2                  jK                  || jH                  z         }t#        jL                  || jN                  d   | jN                  d         }|jS                  ||d| jF                        jC                         }|jS                  ||d| j<                        jC                         }|jS                  ||d| j<                        jC                         }|j]                  | j                  | j:                  z  d| j                        }|j]                  | j                  | j:                  z  d| j                        }| j^                  || j^                  z  z
  | j^                  z  }| jZ                  d	   ta        ||      z  }||d	   z  }|j                  |j                        |z  }||||fD  cg c]  } tc        | || j^                         c} \  }}}}|je                  dddd      }t#        jf                  |d      }!t#        j>                  ti        |            }"|d d d d d d d d d d d f   |d d d d d d d d d d d f   z  }#|#j%                  d      }$|$d	   |"je                  ddddd      d	   z  }%|%j%                  d      }&|&d	   |d d d d d f   z  j%                  d      }'t#        j>                  |!d d d d d d dd f   |!z
        }(||(je                  dddd      d	   z  })|)dd d d f   |d	   z  j%                  d      }*|r<|j                  | j                     d d d df   j                  |*j                        }+nt#        jj                  |*d d d df         }+t#        jl                  |+|*gd      }*t#        j>                  ti        t0        j2                  j5                  |!d d d d d d df   d                  },|,j/                  dd      },|,d
   |*d d d d d df   z  j%                  d      }-|-d d d df   |-d d df   }.}*t#        j>                  |!      }/|dd d d f   |*d d d d d df   z  }0|/je                  dddd      }1|0j%                  d      |1d	   z  }2|'|2z   }|jS                  |d| j                  | jF                        }||z   }|dkD  r|d d d |d d d d f   }|jS                  ||d      }|.*|(|j                  | j                     j9                  |.       | jo                  ||
      }3| jq                  |3j                  |            }4|4S c c} w )Nrd   re   r   r   )shiftsdimsrC   r?   .r  ).NNr   r@   )rf   output_sizer
   r  rq   )r   r   )9rg   rB   r$  rA  r^  r/  r>  r.  rE   rH   r`   rI   rollrn   rA   r@  r   r1   sumr]  r1  r   r4  r   r   r   r  rY   r`  r9  rZ   r_  rE  r   r   r   rB  softplusclampr;  r   r   r   r   bmmrH  repeat_interleaver  r  r  permuter  r"  
zeros_likerh   rG  rI  )5rW   input_statesrO  r   r   rX   rb  r\   rB   ra  r  re  rf  rd  rH   rl  r   rg  rh  rN  cache_devicerB  dAdBdBxrI   ssm_states_reshaped
C_reshapedyrH  r  
D_residualtA_cumsumLG_intermediateGM_intermediateMY_diagdecay_statesB_decaystatesprevious_statesdecay_chunk
new_statesrn  state_decay_outC_times_statesstate_decay_out_permutedY_offrm  contextualized_statess5                                                        r8   torch_forwardzBambaMixer.torch_forward  sU    ".!3!3
GQ"" 4L.Q<<5&6&<&<''GR '= '
#
 $ &//&1& ((8>>qA&&t~~6<<Q?& d*& q!A% 	 "7C7O7OPTP^P^7_7d7dlnuw7d7xL$$T^^4ARSTVWYZSZA[A^A^_k_w_wx|  yG  yG  `H  `O  `O  BPL$$T^^4Q2X> '224>>BEET[[M_M_MfMfEgK %		dkk0088;;! !!$58H8H$H! $): ; '/@/J/J1a/P, mm//043H3HKgKmKmnpKq3qst2u ((8>>{K $5F5P5PQRTU5V)WX[]e^e]eXe)f)p)pqrtu)v w89JN[#kk##T]]T5H5H%H$--Z^ZmZmJmn
q! YYtzz'')**!'224>>BIIL Aq!GQc\*Ba#**:rxx|T]]SBll9-44T\\5G5G5JDMMZG$$--b7::bhh3G.GHBR!5!5a!8$:N:Nq:QRB/"))$..$--I\I\]``glgtgt`uA))ByMA-.22,2GB
 		*dmmR8dAFAT]]DNNdmm4SUVU\U\]_U`allnA		*b!''"+6AI3a<0B *11*b$--PMi0044L4IC ##DNN399''7"<sB 		*dmmR8dAFAT]]DNNdmm4SUVU\U\]_U`allnA		*b!''"+6A &00@CC188[\[b[bCcJ",//*t~~2Mt}}^b^q^q"r
T^^ ;T=P=PRSTJ		-z:Az4>>4==AA y!((a$--HA]Q&&**1773A 		*b)!T3,7A ''T\\(9:BR!5!5a!8$:N:Nq:QRB)11*gr4==Y__aM		*gr43F3FGMMOA		*gr43F3FGMMOA##DNNdmm$CX\XfXf#gA##DNNdmm$CX\XfXf#gA'DOO*CCtVH	*-?x-XXJ *ByM9M](()B.A cpqrtuwxay%z\]&9!Xt&W%z"M1a 		!Q1%A||A2.H 		+a.)A q!Qa23a1dAq!8K6LLN""r"*A y\AIIaAq!,DY,OON""r"*A 	l]1a:%>>CCCJF !99XaArsl%;h%FGL,..q"b!<YGGGc4l+mI.FFKKPQKRF &"."9"9$.."I!TSV,"W"Z"Zbhbobo"Z"p"'"2"26!RaR%="AYY8a@F))K0A0A(1aQRTV;BWY_0`$abK%//15K%o61dC9PPUUZ[U\J *1crc6 2Jq"u4EIF $ii1OT1oq!T30GGN'6'>'>q!Q'J$#''+.Fy.QQE A		*b$..$--HAJA!|a'1a'(		*gr2A $)A''7==iHii4(
 !%knnU.C D$$G &{s   v	c                 r   t         rAd| j                  j                  j                  j                  v r| j                  |||||      S |t        d      |j                  }|B|j                  d   dkD  r0|j                  d   dkD  r||d d d d d f   z  j                  |      }| j                  ||||      S )Ncudaz\`seq_idx` support requires fast path support. Please install `mamba_ssm` and `causal_conv1d`r   r   )rJ  rA  r   rA   r   ro  NotImplementedErrorrB   rg   rn   r  )rW   r   rO  r   r   r,   r   rB   s           r8   r   zBambaMixer.forward  s     "f0C0C0J0J0O0O&O,,]L.Zhjqrr%n  ##%.*>*>q*AA*E.J^J^_`JadeJe*^Aq$J-GGKKERM!!-~~^^r7   )NNNN)NNN)r-   r.   r/   r0   r   r4   r]   r1   rv   r   r;   r2   r5   ro  r  r   r   r   s   @r8   r&  r&    sI   Ah{ Ahs AhL DH5915-1g||g ?@g !!1!12	g
 !.g %//*gZ DH5915L% ?@L% !!1!12	L%
 !.L%d DH5915-1_ ?@_ !!1!12	_
 !._ %//*_r7   r&  c                   $     e Zd Z fdZd Z xZS )BambaMLPc                    t         |           || _        |j                  | _        |j                  | _        t        j                  | j                  | j                  |j                        | _        t        j                  | j                  | j                  |j                        | _	        t        j                  | j                  | j                  |j                        | _
        t        |j                     | _        y )Nr   )r   r]   r<   rO   r/  r   r   mlp_bias	gate_projup_proj	down_projr	   r2  act_fnrW   r<   r   s     r8   r]   zBambaMLP.__init__  s    !--!'!9!94#3#3T5K5KRXRaRabyy!1!143I3IPVP_P_`4#9#94;K;KRXRaRabV../r7   c                     | j                  | j                  | j                  |            | j                  |      z        }|S rs   )r  r  r  r  )rW   r   r  s      r8   r   zBambaMLP.forward  s6    NN4;;t~~a/@#ADLLQRO#ST	r7   r	  r   s   @r8   r  r    s    0r7   r  RMSNormc                   ,     e Zd Zd fd	Zd Zd Z xZS )BambaRMSNormc                     t         |           t        j                  t	        j
                  |            | _        || _        y)z;
        BambaRMSNorm is equivalent to T5LayerNorm
        Nr   r   s      r8   r]   zBambaRMSNorm.__init__  s1     	ll5::k#:; #r7   c                 "   |j                   }|j                  t        j                        }|j	                  d      j                  dd      }|t        j                  || j                  z         z  }| j                  |j                  |      z  S r   )	rB   rn   r1   r   r  r  r  r   r   )rW   r   r  r  s       r8   r   zBambaRMSNorm.forward  sy    #))%((7 $$Q',,R,>%Ht?T?T4T(UU{{]--k:::r7   c                 ^    t        | j                  j                         d| j                   S )Nz, eps=)ry   r   rg   r   )rW   s    r8   
extra_reprzBambaRMSNorm.extra_repr  s*    ))*+6$2G2G1HIIr7   r  )r-   r.   r/   r]   r   r  r   r   s   @r8   r  r    s    $;Jr7   r  c                       e Zd Zddededef fdZ eddd      	 	 	 	 	 	 	 dd	ej                  d
e
ej                     de
ej                     de
e   de
e   de
e   de
ej                     de
eej                  ej                  f      dee   deej"                  e
eej"                  ej"                  f      f   fd       Z xZS )BambaDecoderLayerr<   r`   
layer_typec                 r   t         |           d}|dk(  rt        nd } ||      | _        t	        |j
                  |j                        | _        t	        |j
                  |j                        | _        || _	        |dk(  rt        ||      | _        y |dk(  rt        ||      | _        y t        d      )Nr   r-  r>   )r<   r`   	attentionzInvalid layer_type)r   r]   r  feed_forwardr  rO   r7  input_layernormpre_ff_layernormr  r&  r>   r   	self_attn
ValueError)rW   r<   r`   r  num_expertsffn_layer_classr   s         r8   r]   zBambaDecoderLayer.__init__  s    &1Q&6(D+F3+F,>,>FDWDWX ,V-?-?VEXEX Y$ #6YGDJ;&+FI>DN122r7   r   r   r   r   r   r   r   output_attentions	use_cacher   r   r   rb   c	                 J   |}
| j                  |      }| j                  dk(  r | j                  d||||d|	}d}n-| j                  dk(  r | j                  d||||||||d|	\  }}|
|z   }|}
| j	                  |      }| j                  |      }|
|z   }|f}|r|fz  }|S )a  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
                `(batch, sequence_length)` where padding elements are indicated by 0.
            past_key_values (`HybridMambaAttentionDynamicCache`, *optional*): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence.
            position_embeddings (`tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
                Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
                with `head_dim` being the embedding dimension of each attention head.
            kwargs (`dict`, *optional*):
                Arbitrary kwargs. Can be used to provide `BambaFlashAttentionKwargs` for
                padding-free training and/or improve torch.compile performance.
        r>   )r   rO  r   r   Nr  )r   r   r   r   r  r  r   r   r6   )r  r  r>   r  r  r  )rW   r   r   r   r   r  r  r   r   r   residualself_attn_weightsoutputss                r8   r   zBambaDecoderLayer.forward  s    F !,,]; ??g%&DJJ +,--	
 M !%__+/=t~~ 
0+-) /"3#-$7
0 
0,M, !=0 !--m<))-8 =0 ")++Gr7   )r>   )NNNFFNN)r-   r.   r/   r   r4   rx   r]   r   r1   rv   r   r2   r;   r  ry   r   r'   FloatTensorr   r   r   s   @r8   r  r    s;   3{ 3s 3 3" %0A6R 2637FJ,1$)59KOK||K !.K u//0	K
 ""BCK $D>K D>K !!1!12K &eELL%,,,F&GHK 23K 
u  (51B1BEDUDU1U+V"WW	XK SKr7   r  c                   H     e Zd ZU eed<   dZdZdgZdZdZ	dZ
dZ fdZ xZS )BambaPreTrainedModelr<   modelTr  r   c                 d   t         |   |       t        |t              r|j                  j
                  j                  d       t        j                  t        j                  d|j                  dz               |j                  _        |j                  j
                  j                  d       y y )Ng      ?r   )r   _init_weightsr   r&  rB  datafill_r1   rD  rC  r.  rE  rH  )rW   r   r   s     r8   r  z"BambaPreTrainedModel._init_weightsA  sx    f%fj)NN%%c* %		%,,q&:J:JQ:N*O PFLLHHMM$ *r7   )r-   r.   r/   r   r3   base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdpa_is_statefulr  r   r   s   @r8   r  r  5  s>    &*#,-"3NL% %r7   r  c                       e Zd Zdef fdZee	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     dee   deej                     dee   d	ee   d
ee   deej                     dee   defd              Zdej                  dej                  dej                  ded	ef
dZedej                  dededej*                  dej                  defd       Zd Z xZS )
BambaModelr<   c           	      Z   t         |   |       |j                  | _        |j                  | _        t        j                  |j                  |j                  | j                        | _        g }t        |j                        D ],  }|j                  t        |||j                  |                . t        j                  |      | _        |j                   | _        t#        |j                  |j$                        | _        t)        |      | _        d| _        | j/                          y )N)r`   r  r-  )r<   F)r   r]   pad_token_idpadding_idx
vocab_sizer   	EmbeddingrO   embed_tokensrK   rL   rT   r  rD   
ModuleListlayersr   r  r7  final_layernormr{   
rotary_embgradient_checkpointing	post_init)rW   r<   decoder_layersr[   r   s       r8   r]   zBambaModel.__init__K  s     !.. ++LL):):F<N<NPTP`P`av//0 	rA!!"3FaTZTlTlmnTo"pq	rmmN3$*$?$?!+F,>,>FDWDWX.f=&+#r7   	input_idsr   r   r   inputs_embedsr  r  output_hidden_statesr   r   rb   c
                    ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }|d u |d uz  rt	        d      | j
                  r%| j                  r|rt        j                  d       d}|| j                  |      }|}|r|t        j                  d       |	.t        j                  |j                  d   |j                        }	||	j                  d      }| j                  |||	||      }| j!                  ||	      }| j#                  ||      }|rdnd }|rdnd }| j$                  D ]E  }|j&                  d	k(  r|n|}|r||fz  } ||f||||||	|d
|
}|d   }|s7|d   =||d   fz  }G | j)                  |      }|r||fz  }|r|j*                  sd|_        |sd n|}t-        ||||      S )Nz:You must specify exactly one of input_ids or inputs_embedszX`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.FzBamba requires an initialized `HybridMambaAttentionDynamicCache` to return a cache. None was provided, so no cache will be returned.r   rC   r   r6   r>   )r   r   r   r  r  r   r   T)last_hidden_stater   r   
attentions)r<   r  r  r  r  r  r   rK  rL  r  r1   rC  rg   rA   r   _update_causal_mask_update_mamba_maskr  r  r  r  rE   r   )rW   r  r   r   r   r  r  r  r  r   r   r   r   
mamba_maskr   all_hidden_statesall_self_attnsdecoder_layer
layer_masklayer_outputs
next_caches                        r8   r   zBambaModel.forward^  sT    2C1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	-t";<YZZ&&4==Yj I  --i8M%0:
 !"\\-*=*=a*@I]I]^N)33A6L..M>?L]
 ,,^^L
 #oom\J"6BD0d![[ 	:M'4'?'?7'JP[J#!m%55!)
)) /"3#-$7
 
M *!,M  #/"}Q'7&99N1	:4 ,,];  -!11?#E#E15O.!*T
&+&+%	
 	
r7   r
  c           	         | j                   j                  dk(  r	|d|v r|S y ||j                         nd}| j                   j                  dk(  r&|s$t        j                  |||| j
                        ry |j                  }|j                  d   }t        |t        j                        r|j                  d   n||z   dz   }	| j                  |||	|||j                  d         }
| j                   j                  dk(  rQ|O|j                  j                  d	v r7|s5t        j                  |      j                  }t        j                   |
|      }
|
S )
Nflash_attention_2r   r   sdpa)r  past_key_values_lengthis_trainingr   rd   )sequence_lengthtarget_lengthrB   r   rX   )r  xpunpu)r<   r   rr   r   _ignore_causal_mask_sdpar   rB   rg   r   r1   rv   5_prepare_4d_causal_attention_mask_with_cache_positionrA   r   finfomin_unmask_unattended)rW   r   r
  r   r   r  past_seen_tokensrB   r  r  r   	min_dtypes               r8   r  zBambaModel._update_causal_mask  se    ;;++/BB)c^.C%%
 @O?Z?99;`a ;;++v5>O%>>*'7 MM	 ""&,,Q/ .%,,7   $!O3a7 	 PP+')#))!, Q 
 KK,,6*%%**.DD%
 E*..I0CCKQZ[Kr7   r  r  rB   rX   c                    | | j                         dk(  r| }|S t        j                  |      j                  }t        j                  ||f|||j
                        }|dk7  rt        j                  |d      }|t        j                  ||j
                        |j                  dd      kD  z  }|ddddddf   j                  |ddd      }| |j                         }| j                  d   }	| ddddddf   | ddddddf   k(  dddd| dddf   j                  |      }
|ddddddd|	f   |
z   }|dk(  }|ddddddd|	f   j                  ||      |ddddddd|	f<   |S )	aM  
        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

        Args:
            attention_mask (`torch.Tensor`):
                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
                `(batch_size, 1, query_length, key_value_length)`.
            sequence_length (`int`):
                The sequence length being processed.
            target_length (`int`):
                The target length: when generating with static cache, the mask should be as long as the static cache,
                to account for the 0 padding, the part of the cache that is not filled yet.
            dtype (`torch.dtype`):
                The dtype to use for the 4D attention mask.
            cache_position (`torch.Tensor`):
                Indices depicting the position of the input sequence tokens in the sequence.
            batch_size (`torch.Tensor`):
                Batch size.
        Nr  )
fill_valuerB   rA   r   r  rC   rd   r   )rf   r1   r  r  fullrA   triurC  r   r   clonerg   rn   r  )r   r  r  rB   r   rX   r   r   r  mask_lengthpadding_attention_maskpadding_masks               r8   r  z@BambaModel._prepare_4d_causal_attention_mask_with_cache_position  s   < %.*<*<*>!*C(K, ) E*..I** -0Ye\j\q\qK !##jjqA5<<n>S>STWeWmWmnprsWtttK%dD!Q&67>>z1bRTUK))//1,2226*8D$9I*Jn]^`dfgim]mNn*nq?*+Q.*"U) '  +1aL[L+@ADZZ+q05@Aq,;,AV5W5c5c )6Aq!\k\12 r7   c                 R    |}|d   dkD  s|t        j                  |dk(        rd}|S )zv
        No need for zeroing states when
            1. Cached forward
            2. Attending to all inputs
        r   Nr   )r1   all)rW   r   r   r  s       r8   r  zBambaModel._update_mamba_mask6  s7     $
!q ^%?EIIn`aNaDbJr7   )	NNNNNNNNN)r-   r.   r/   r   r]   r   r   r   r1   r2   rv   r;   r  r  r   r'   r   r   r  staticmethodr4   rB   r  r  r   r   s   @r8   r  r  I  s   { &  151537FJ59$(,0/359`
E,,-`
 !.`
 u//0	`

 ""BC`
   1 12`
 D>`
 $D>`
 'tn`
 !!1!12`
 23`
 
!`
  `
D:: ll: 	:
 ::  :x 555 5 {{	5
 5 5 5n	r7   r  c                       e Zd ZdgZddiZddgdgfiZ fdZee	 	 	 	 	 	 	 	 	 	 	 dde	e
j                     de	e
j                     d	e	e
j                     d
e	e   de	e
j                     de	e
j                     de	e   de	e   de	e   de	e
j                     deee
j                  f   defd              Z	 	 	 	 	 	 ddZ xZS )BambaForCausalLMzlm_head.weightlm_headcolwise_repr   logitsc                 
   t         |   |       t        |      | _        |j                  | _        t        j                  |j                  |j                  d      | _        |j                  | _	        | j                          y )NFr   )r   r]   r  r  r  r   r   rO   r	  z_loss_coefficientr  r  s     r8   r]   zBambaForCausalLM.__init__H  sc     '
 ++yy!3!3V5F5FUS"(";"; 	r7   r  r   r   r   r  labelsr  r  r  r   logits_to_keeprb   c                    ||n| j                   j                  }|	|	n| j                   j                  }	 | j                  d
||||||||	|
d	|}|j                  }t        |t              rt        | d      n|}| j                  |dd|ddf         }d}| | j                  d
||| j                   j                  d|}| j                  dkD  r[|j                  d      j                  |j                        j                  d      j!                         }|| j                  |z  z   }t#        |||j$                  |j&                  |j(                  	      S )aJ  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, BambaForCausalLM

        >>> model = BambaForCausalLM.from_pretrained("...")
        >>> tokenizer = AutoTokenizer.from_pretrained("...")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```N)	r  r   r   r   r  r  r  r  r   )r  r  r  r   rd   re   r   r?   )lossr  r   r   r  r6   )r<   r  r  r  r  r   r4   slicer	  loss_functionr  r  	logsumexprn   rB   r  r  r   r   r   r  )rW   r  r   r   r   r  r  r  r  r  r   r  r   r  r   slice_indicesr  r  z_losss                      r8   r   zBambaForCausalLM.forwardR  sw   N 2C1N-TXT_T_TqTq$8$D $++JjJj 	
 ,64:: ,
)%+'/!5),
 ,
  118B>SV8W~ot4]kmA}a,?@A%4%%pVFt{{OeOepiopD&&*))b)1444::4FJJ1MRRTd55>>%#33!//))
 	
r7   c           	      t   |d u }	|	sZ||d   |j                   d   k\  r|d d |j                   d    d f   }nc|j                   d   |j                   d   k7  rD|d d |f   }n:t        | j                  |j                   d   | j                  | j                        }|T|R|j                         j                  d      dz
  }|j                  |dk(  d       |	s|d d |j                   d    d f   }||	rd|i}
nd|j                         i}
|
j                  ||||| j                  j                  |d       |
S )Nrd   r   r   rC   r  r  )r   r   r  r   r  r   )rg   r;   r<   rB   rA   longr  masked_fill_r   ri   num_logits_to_keep)rW   r  r   r   r  r   r   r  r   empty_past_kvmodel_inputss              r8   prepare_inputs_for_generationz.BambaForCausalLM.prepare_inputs_for_generation  sa    (4/ )!"%);;%a.*>*>q*A)A)C&CD	#~';';A'>>%a&78	>Y__Q/DKKO %,*>)..077;a?L%%n&91= +A	0B/B/D,DE $+];L')=)=)?@L ,#2&"0"&++"@"@"0		
 r7   )NNNNNNNNNNr   )NNNNNT)r-   r.   r/   _tied_weights_keys_tp_plan_pp_planr]   r   r   r   r1   r2   rv   r;   r  r  r   r4   r   r   r  r   r   s   @r8   r  r  B  sl   *+=)H_-z:;H  151537FJ59-1$(,0/35934K
E,,-K
 !.K
 u//0	K

 ""BCK
   1 12K
 ))*K
 D>K
 $D>K
 'tnK
 !!1!12K
 c5<</0K
 
 K
  K
` 8r7   r  )r  r  r  )r   )Nr   )Qtypingr   r   r   r   r   r1   r   transformers.activationsr	   cache_utilsr   
generationr   integrationsr   modeling_attn_mask_utilsr   modeling_layersr   modeling_outputsr   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   utils.deprecationr   utils.import_utilsr   r   configuration_bambar   +mamba_ssm.ops.triton.selective_state_updater    !mamba_ssm.ops.triton.ssd_combinedr!   r"   causal_conv1dr#   r$   
get_loggerr-   rK  r'   r;   Moduler{   r   rv   r4   r   r   r   r   r   r   r  r  r"  r  rJ  r$  r&  r  r  r  r  r  r  __all__r6   r7   r8   <module>r6     s}  6 = <   +   ) 7 > 9 O K F & R R 0 V , Rmm!DD-7** 
		H	%	 2Z3 Z3z!<299 !<H(	UU\\ 	U# 	U%,, 	U& %II%<<% 
% <<	%
 U\\*% % % '(%6%PD)RYY D)N; ;*VU\\ VS V
(( 46FH\]^ ^_ ^_Bryy   Y'J299 J (J(^2 ^B %? % %& u% u up V+_ V Vr Er7   