
    h2                     F   d dl Z d dlZd dlmZ d dlmZmZmZ d dlZd dl	Zd dlm
Z
 ddlmZ ddlmZ ddlmZ dd	lmZmZ dd
lmZ ddlmZ ddlmZ ddlmZmZ ddlmZmZ ddl m!Z!m"Z"m#Z# ddl$m%Z%m&Z&m'Z'm(Z(m)Z)m*Z*m+Z+m,Z,m-Z-m.Z. ddl/m0Z0  e       rd dl1m2Z2 d dl3m4Z4m5Z5 nd\  Z2Z4Z5 e       r	d dl6m7Z7m8Z8 nd\  Z8Z7 e9e2e7e8f      Z:dZ; ejx                  e=      Z> G d dej                  j~                        Z@ G d de-      ZA G d de)      ZB G d  d!e      ZC G d" d#e%      ZD G d$ d%e
j~                        ZE G d& d'e
j~                        ZF G d( d)e&      ZG G d* d+e+      ZH G d, d-e*      ZI G d. d/e      ZJ G d0 d1e,eJ      ZK G d2 d3e'      ZL G d4 d5e(      ZMg d6ZNy)7    N)cycle)CallableOptionalUnion)nn   )ACT2FN)FlashAttentionKwargs)BaseModelOutputWithPast)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)logging)deprecate_kwarg)is_causal_conv1d_availableis_mamba_ssm_available   )LlamaRotaryEmbeddingapply_rotary_pos_emb)pad_tensor_by_sizereshape_into_chunkssegment_sum)
ZambaAttentionZambaAttentionDecoderLayerZambaForCausalLMZambaForSequenceClassificationZambaHybridDynamicCacheZambaHybridLayerZambaMambaDecoderLayer
ZambaModelZambaRMSNormeager_attention_forward   )Zamba2Config)selective_state_update)mamba_chunk_scan_combined mamba_split_conv1d_scan_combinedNNN)causal_conv1d_fncausal_conv1d_updateNNzZyphra/Zamba2-2.7Bc                   (     e Zd Zd fd	ZddZ xZS )Zamba2RMSNormGatedc                     t         |           t        j                  t	        j
                  |            | _        || _        || _        y N)	super__init__r   	Parametertorchonesweightvariance_epsilon
group_size)selfhidden_sizer7   eps	__class__s       g/var/www/html/eduruby.in/venv/lib/python3.12/site-packages/transformers/models/zamba2/modular_zamba2.pyr1   zZamba2RMSNormGated.__init__K   s6    ll5::k#:; #$    c                 b   |j                   }|j                  t        j                        }|?|t        j
                  j                  |j                  t        j                              z  }|j                  ^ }}|| j                  z  } |j                  g ||| j                   }|j                  d      j                  dd      }|t        j                  || j                  z         z  } |j                  g ||| j                  z   }| j                  |j                  |      z  S )Nr   T)keepdim)dtypetor3   float32r   
functionalsilushaper7   viewpowmeanrsqrtr6   r5   )	r8   hidden_statesgateinput_dtypeprefix_dimslast_dimgroup_counthidden_states_groupvariances	            r<   forwardzZamba2RMSNormGated.forwardQ   s   #))%((7)BMM,>,>twwu}}?U,VVM!.!4!4h$//10m00\+\{\DOO\&**1-222t2D1EKK4K`K`@`4aa0+00]+]{T__?\]{{]--k:::r=   )gư>r/   )__name__
__module____qualname__r1   rS   __classcell__r;   s   @r<   r-   r-   J   s    %;r=   r-   c                       e Zd Zy)Zamba2RMSNormNrT   rU   rV    r=   r<   rZ   rZ   _       r=   rZ   c            
           e Zd ZdZej
                  dfdededej                  de	e
   fdZded	ej                  d
ej                  dej                  fdZd Zdde	e   defdZy)Zamba2HybridDynamicCachea  
    A dynamic cache that can handle both the attention cache (which has a seq_len dimension) and the mamba cache
    (which has a constant shape regardless of seq_len).

    This cache has two sets of lists of tensors: `key_cache` and `value_cache` for attention cache and `conv_states`
    and `ssm_states` for mamba cache. Each of these lists has `num_layers` tensors. The expected shape for each tensor
    For attention layers, `key_cache` and `value_cache` have a shape of `(batch_size, num_heads, seq_len, head_dim)`,
    while `conv_states` and `ssm_states` have a shape of `(batch_size, 0)` (empty tensors).
    For mamba layers, `key_cache` and `value_cache` have a shape of `(batch_size, 0)` (empty tensors),
    while `conv_states` represents the convolution state and has a shape of `(batch_size, d_inner, d_conv)`,
    and `ssm_states` represents the ssm state and has a shape of `(batch_size, d_inner, d_state)`.
    Nconfig
batch_sizerA   devicec           	      .   || _         |j                  | _        d| _        t        |j                  |j
                  z        | _        |j                  | _        |j                  | _
        |j                  | _        g | _        i | _        i | _        i | _        i | _        i | _        t%        |j&                        D ]  }t)        j*                  || j                  d|j,                  z  |j                  z  z   | j                  ||      | j                   |<   t)        j*                  || j                  |j.                  | j                  ||      | j"                  |<   | j                  |   dk(  s| j                  j1                  |        t%        |j&                        D cg c]  }t)        j2                  g g|z  |       c}| _        t%        |j&                        D cg c]  }t)        j2                  g g|z  |       c}| _        y c c}w c c}w )NFr   rb   rA   hybridrb   )rA   layers_block_typehas_previous_stateintmamba_expandr9   intermediate_sizemamba_d_statessm_state_sizemamba_d_convconv_kernel_sizen_mamba_headstransformer_layers_modules_parameters_buffersconv_states
ssm_statesrangenum_hidden_layersr3   zerosmamba_ngroupsmamba_headdimappendtensor	key_cachevalue_cache)r8   r`   ra   rA   rb   i_s          r<   r1   z!Zamba2HybridDynamicCache.__init__q   s    
!'!9!9"'!$V%8%86;M;M%M!N$22 & 3 3#11"$v//0 	2A"'++&&V-A-A)AFDXDX)XX%%#DQ "'D..0D0DdFYFYbhpu"DOOA %%a(H4''..q1	2 SXX^XpXpRqrQ%,,tj'8HrTYZ`ZrZrTstqELL"
):6Jt sts   !"H""H	layer_idxnew_conv_statecache_positionreturnc                 T   | j                   |   }|j                  d| j                  dz
        }|j                  dd      }|j	                  |j
                        |d d d d |f<   | j                   |   j                          | j                   |xx   |z  cc<   | j                   |   S )Nr   r#   r?   shiftsdims)ru   clampro   rollrB   rb   zero_)r8   r   r   r   
conv_states        r<   update_conv_statez*Zamba2HybridDynamicCache.update_conv_state   s     %%i0
'--a1F1F1JK__BR_8
+9+<+<Z=N=N+O
1a'(#))+#z1#	**r=   c                 l    | j                   j                          | j                  j                          y r/   )ru   r   rv   )r8   s    r<   resetzZamba2HybridDynamicCache.reset   s$     r=   c                     || j                   vr| j                   d   n|}t        | j                        |k  s | j                  |   j                         dk(  ry| j                  |   j                  d   S )zYReturns the sequence length of the cached states. A layer index can be optionally passed.r   )rq   lenr~   numelrF   )r8   r   s     r<   get_seq_lengthz'Zamba2HybridDynamicCache.get_seq_length   sl     3<4CZCZ2ZD++A.`i	t~~)+t~~i/H/N/N/PTU/U~~i(..r22r=   )r   )rT   rU   rV   __doc__r3   float16r$   ri   rA   r   strr1   Tensor
LongTensorr   r   r   r\   r=   r<   r_   r_   c   s     KP--quu"u03u<AKKuaijmanu@
+
+.3ll
+LQL\L\
+	
+ 3 3c 3r=   r_   c                       e Zd Zy)Zamba2RotaryEmbeddingNr[   r\   r=   r<   r   r      r]   r=   r   c                   b    e Zd ZdZ	 	 	 ddedee   dee   dee   f fdZ eddd	
      	 	 	 dde	j                  dedee	j                     dee   deee	j                  e	j                  f      dee   dee	j                  ee	j                     eee	j                        f   fd       Z xZS )Zamba2AttentionaZ  
    Multi-headed attention from 'Attention Is All You Need' paper.

    Adapted from transformers.models.mistral.modeling_mistral.MistralAttention:
    The input dimension here is attention_hidden_size = 2 * hidden_size, and head_dim = attention_hidden_size // num_heads.
    The extra factor of 2 comes from the input being the concatenation of original_hidden_states with the output of the previous (mamba) layer
    (see fig. 2 in https://huggingface.co/papers/2405.16712).
    Additionally, replaced
    attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) with
    attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim/2)
    Finally, this attention layer contributes to tied transformer blocks aimed to increasing compute without increasing model size. Because this
    layer is tied, un-tied adapters (formally the same as LoRA but used in the base model) modules are added to the q, k, v projectors to increase
    expressivity with a small memory overhead (see Fig. 2 of https://huggingface.co/papers/2411.15242).
    r`   r   num_fwd_mem_blocksblock_idc           	         t         |   ||       || _        |j                  | _        || _        |j                  rt        j                  g       | _	        t        j                  g       | _
        t        j                  g       | _        t        | j                        D ]  }||j                  z  |k(  r{t        j                  t        j                  | j                   | j"                  j$                  d      t        j                  | j"                  j$                  | j                   d            }t        j                  t        j                  | j                   | j"                  j$                  d      t        j                  | j"                  j$                  | j                   d            }t        j                  t        j                  | j                   | j"                  j$                  d      t        j                  | j"                  j$                  | j                   d            }n<t        j&                         }t        j&                         }t        j&                         }| j                  j)                  |       | j                  j)                  |       | j                  j)                  |       ! t+        | j                        D 	
ci c]  \  }	}
|
|	
 c}
}	| _        y c c}
}	w )NFbias)r0   r1   r   hybrid_layer_idslayer_block_mapr   use_shared_attention_adapterr   
ModuleListlinear_q_adapter_listlinear_k_adapter_listlinear_v_adapter_listrw   num_mem_blocks
SequentialLinearattention_hidden_sizer`   adapter_rankIdentityr|   	enumerate	layer_dic)r8   r`   r   r   r   r   linear_q_adapterlinear_k_adapterlinear_v_adapterindexvaluer;   s              r<   r1   zZamba2Attention.__init__   s#    	+"4%66 ..)+r):D&)+r):D&)+r):D&4223 Dv,,,8')}}		$"<"<dkk>V>V]bc		$++":":D<V<V]bc($ (*}}		$"<"<dkk>V>V]bc		$++":":D<V<V]bc($ (*}}		$"<"<dkk>V>V]bc		$++":":D<V<V]bc($
 (*{{}$'){{}$'){{}$**112BC**112BC**112BC)D, <ETEYEY;Z[<5%%,[[s   K2past_key_valuepast_key_values4.58new_nameversionrK   attention_maskposition_embeddingskwargsr   c                 $   |j                   d d }g |d| j                  }| j                  |      }	| j                  |      }
| j	                  |      }| j
                  j                  rW| j                  |   }|	 | j                  |   |      z   }	|
 | j                  |   |      z   }
| | j                  |   |      z   }|	j                  |      j                  dd      }	|
j                  |      j                  dd      }
|j                  |      j                  dd      }| j
                  j                  r|\  }}t        |	|
||      \  }	}
||j                  |
||      \  }
}t         }| j
                  j"                  dk7  rt$        | j
                  j"                     } || |	|
||f| j&                  sdn| j(                  | j*                  d|\  }} |j,                  g |d j/                         }| j1                  |      }||fS )Nr?   r#   r   eagerg        )dropoutscaling)rF   head_dimq_projk_projv_projr`   r   r   r   r   r   rG   	transposeuse_mem_roper   updater"   _attn_implementationr   trainingattention_dropoutr   reshape
contiguouso_proj)r8   rK   r   r   r   r   r   input_shapehidden_shapequery_states
key_statesvalue_statesadapter_layer_idxcossinattention_interfaceattn_outputattn_weightss                     r<   rS   zZamba2Attention.forward   s%    $))#2.88b8$--8{{=1[[/
{{=1;;33 $y 9'*W$*D*DEV*WXe*ffL#&Sd&@&@AR&STa&bbJ'*W$*D*DEV*WXe*ffL#((6@@AF__\2<<QB
#((6@@AF;;##*HC';L*VY[^'_$L*&'6'='=j,Xa'b$J(?;;++w6"9$++:Z:Z"[$7	%
  $}}C$2H2HLL	%
 	%
!\ *k));;;;FFHkk+.L((r=   r(   )rT   rU   rV   r   r$   r   ri   r1   r   r3   r   r_   tupler   r
   rS   rW   rX   s   @r<   r   r      s   $ $(,0"&'\'\ C='\ %SM	'\
 3-'\R %0A6R
 26>BKO1)||1) 1) !.	1)
 "":;1) &eELL%,,,F&GH1) -.1) 
u||Xell3XeELL>Q5RR	S1) S1)r=   r   c                        e Zd ZdZddedee   f fdZ	 	 ddej                  dee
   deej                     fdZddee
   deej                     fd	Z	 	 ddee
   deej                     fd
Z xZS )Zamba2MambaMixeru  
    Compute ∆, A, B, C, and D the state space parameters and compute the `contextualized_states`.
    A, D are input independent (see Mamba paper [1] Section 3.5.2 "Interpretation of A" for why A isn't selective)
    ∆, B, C are input-dependent (this is a key difference between Mamba and the linear time invariant S4,
    and is why Mamba is called **selective** state spaces)
    r`   r   c           	         t         |           || _        |j                  | _        |j                  | _        |j                  | _        t        |j                  | j                  z        | _
        || _        |j                  | _        d| _        t        j                         | _        |j"                  | _        |j$                  | _        |j(                  | _        | j                  j,                  | _        |j0                  | _        |j2                  | _        |j4                  | _        |j6                  | _        | j                  d| j&                  z  | j
                  z  z   | _        t        j:                  | j8                  | j8                  d|j                  | j8                  |j                  dz
        | _        | j                  | j8                  z   | j.                  z   }t        j>                  | j                  ||j@                        | _!        t        jD                  tG        jH                  | j.                              | _%        tG        jL                  d| j.                  dz         }t        jD                  tG        jN                  |            | _(        d| jP                  _)        tU        | j                  | j                  | j&                  z  d      | _+        t        jD                  tG        jH                  | j.                              | _,        d| jX                  _)        t        j>                  | j                  | j                  |j@                        | _-        t\        st^        ja                  d	       y y )
NrE   r   Tr#   )in_channelsout_channelsr   kernel_sizegroupspaddingr   gh㈵>)r7   r:   a  The fast path is not available because on of `(selective_state_update, causal_conv1d_fn, causal_conv1d_update)` is None. Falling back to the naive implementation. To install follow https://github.com/state-spaces/mamba/#installation and https://github.com/Dao-AILab/causal-conv1d)1r0   r1   r`   r9   rl   rm   rn   ro   ri   rj   rk   r   use_conv_bias
activationr   SiLUactuse_mem_eff_pathrz   n_groupsr{   r   rp   	num_heads
chunk_sizetime_step_limittime_step_mintime_step_maxconv_dimConv1dconv1dr   add_bias_linearin_projr2   r3   r4   dt_biasarangelogA_log_no_weight_decayr-   normDout_projis_fast_path_availableloggerwarning_once)r8   r`   r   projection_sizeAr;   s        r<   r1   zZamba2MambaMixer.__init__$  s   !--$22 & 3 3!$V%8%84;K;K%K!L"#11 779 & 7 7,,,,22 ++%55#11#11..T]]1BTEXEX1XXii++==''!+
 004==@4>>Qyy''
 ||EJJt~~$>? LLDNNQ./\\%))A,/
&*

#&""t/E/E/V\`
	 ejj89"&		$"8"8$:J:JQWQgQgh%> &r=   rK   cache_paramsr   c                    |j                   \  }}}| j                  | j                  z  }d| j                  z  d| j                  z  | j                  z  z   | j                  z   }|4|j
                  r'| j                  |j                  d            }	|	j                   d   |z
  dz  }
|
|
| j                  | j                  | j                  g}t        j                  |	|d      \  }}}}}t        ||j                  | j                     | j                  j                  j                  d      | j                  j                   | j"                        }t        j                  || j                  ||gd      \  }}}t        j$                  | j&                  j)                                }|d d d df   d d d d d f   j+                  d| j,                  | j                        j/                  t        j0                        }|d d d d d f   j+                  dd| j,                        }| j2                  d d d df   j+                  d| j,                        }| j4                  d d d df   j+                  d| j,                        }|j7                  || j                  |j                   d   | j                  z        }|j7                  || j                  |j                   d   | j                  z        }|j7                  || j                  | j,                        }t9        |j:                  | j                     ||||||d |d
      }|j7                  || j                  | j,                  z        }| j=                  ||      }| j?                  |      d d d df   }|S |Bt        j@                  |dk(        s*|jB                  }||d d d d d f   z  j/                  |      }| j                  |      }t        j$                  | j&                  j)                                }| jD                  i nd	| jD                  i}|t        j@                  |dk(        }nd}| jF                  r| jH                  r||rtK        || j                  j                  j                  d      | j                  j                   | j2                  |f| j4                  | jL                  d | j"                  | j<                  j                  | j<                  jN                  | j>                  j                  | j>                  j                   | j,                  | j                  d
dd|\  }}|S t        j                  || j                  | j                  | j                  gd      \  }}}|v|jQ                  dd      }tR        jT                  jW                  || jX                  |j                   d   z
  df      }|j                  | j                     j[                  |       t\        | j"                  dvrJ| j_                  | j                  |jQ                  dd            jQ                  dd      d d d |f         }nyt]        |jQ                  dd      | j                  j                  j                  d      | j                  j                   | j"                        jQ                  dd      d d d |f   }t        j                  || j                  ||gd      \  }}}|Bt        j@                  |dk(        s*|jB                  }||d d d d d f   z  j/                  |      }ta        |j7                  ||d| j,                        |||j7                  ||| j                  d      |j7                  ||| j                  d      f| jL                  | j4                  d d d| j2                  dd|\  }}|*|(|j:                  | j                     j[                  |       |j7                  ||d      }| j=                  ||      }| j?                  |      }|S )Nr   r#   r?   dim.rA   T)zr   dt_softplusdt_limitF)r   r   seq_idxr   rmsnorm_weightrmsnorm_epsoutproj_weightoutproj_biasheaddimngroupsnorm_before_gatereturn_final_statesr   )rE   swish)xr5   r   r   )r   r   r  r  r  r   r	  )1rF   r   rm   rk   r   rh   r   squeezer   r3   splitr*   ru   r   r   r5   r   r   expr   floatexpandr   rB   rC   r   r   rG   r%   rv   r   r   allrA   r   r   r   r'   r   r6   r   r   rD   padro   copy_r)   r   r&   )r8   rK   r  r   ra   seq_lenr   groups_time_state_sized_to_removein_projected_statesd_mlpsplit_projection_dimrL   hidden_states_B_CdtBCr  r   r   hidden_states_reshapedoutrA   projected_statesdt_limit_kwargsinput_not_masked	ssm_state	time_stephidden_states_B_C_tr   scan_outputs                                  r<   cuda_kernels_forwardz%Zamba2MambaMixer.cuda_kernels_forwarde  sv    "/!4!4
GQ!%1D1D!D$0001t}}3DtGZGZ3ZZ]a]k]kk #(G(G"&,,}/D/DQ/G"H(..r2[@QFE$)5$2H2H$--Y]YgYg#h 05<OQekm0n-Aq$)2 4!((8""**1-  ! #(++!'')?AWX#M1a
 4::++-..A!T3,1d
+222t}}dFYFYZ]]didqdq]rAAq$J&&r2t}}=Bll1dC<077DMMJGq$|$++B>Az4==!''!*2MNAz4==!''!*2MNA%2%7%7
DNNTXTaTa%b"2''7& M *..z4>>DMM;YZM IImT:M--.q$|<Cz 
u )%))Na<O2P%++!.1d
1K!K O OPU V#||M:4::++-..A$($8$8$@bzSWSgSgFhO)#(99^q-@#A #' $$<;OTd!A$KK&&..q1KK$$LL" ff# ##'99#3#3 $		 : :#'==#7#7!%!3!3 MM MM%*(,#"$ &%"YX 
m 6;[[$++T]]DNNK62'  +*;*E*Ea*K'!#!2!2+d.C.CFYF_F_`bFc.cef-g"J !,,T^^<BB:N#+tFW/W(,$5$?$?1$EFPPQRTUVWXZb[bZbWbc)% )9+55a;#{{1199!<![[--#'??	)
  i1oa'k)3% ',kk%++-CE[\'#q!
 "-eiiRS@S6T)//E%2^Aq$J5O%O$S$STY$ZM)B!&&z7BNFF:wrBFF:wrB*  $ff (, LL $* &*&Y (\-E ++DNN;AA)L)..z7BG"iiT:mmK0
r=   c                    |j                   \  }}}|j                  }|-|j                  r!| j                  |j	                  d            }nI|6t        j                  |dk(        s||d d d d d f   z  j                  |      }| j                  |      }|j                   d   d| j                  z  z
  d| j                  z  | j                  z  z
  | j                  z
  dz  }	|j                  |	|	| j                  | j                  | j                  gd      \  }}}
}}|w|j                  | j                     j!                         }|j                  |j"                        }|j                  r1|
j%                  d      }
|j&                  | j                     }t        j(                  |dd      }|j*                  dk(  r|d d dd d f   n||d d d d df<   |j&                  | j                     j-                  |       t        j.                  |j                  |j"                        | j0                  j2                  d d dd d f   z  d      }| j4                  r|| j0                  j6                  z  }| j9                  |      j                  |      d d d df   }n|j;                  dd      }t<        j>                  jA                  || jB                  |j                   d   z
  df      }|j&                  | j                     j-                  |       | j9                  | j1                  |      j;                  dd            d d d |d d f   }|t        j                  |dk(        s|j                  }||d d d d d f   z  j                  |      }nt        jD                  || j                  | jF                  | j                  f|j"                  |	      }| j9                  | j1                  |j;                  dd            dd |f   j;                  dd            }t        j                  || j                  | j                  | j                  z  | j                  | j                  z  gd      \  }}}t        jH                  | jJ                  jM                                }|t|j                  rg|j*                  dk(  r
|d d d df   n|d d dd d f   d d d df   }|j;                  dd      jO                  ||j                   d   | jF                        }| jP                  d
   jO                  | jP                  j                   d   | jF                        }t
        j<                  j>                  jS                  ||j                  |j                        z         }t        jT                  || jV                        }|d   jO                  | j                  | jF                  | j                        j                  t
        jX                        }t        jH                  |d
   |z        }|j[                  || j                  d      dd d d f   }|jO                  || j                  | j                  | j                  z  |j                   d         j]                         }|j[                  |d|j                   d         }|d
   |dd d d f   z  }|j[                  |d| jF                        }||d
   z  }|j                  | j                     j-                  |j                  | j                     |z  |z          |j[                  || j                  d      dd d d f   }|jO                  || j                  | j                  | j                  z  |j                   d         j]                         }|j[                  |d|j                   d         }|j                  | j                     j                  |j                        }|j_                  || j                  z  | jF                  | j                        }|j_                  || j                  z  | j                  d      }t        j`                  ||      }|j_                  || j                  | jF                        }| jb                  d
   jO                  | jb                  j                   d   | jF                        }|||z  z   j                  |j                        }|j[                  |d      d d d df   }n4t<        j>                  jS                  || jP                  z         }t        jT                  || jV                        }|j[                  ||d| jF                        jM                         }|j[                  ||d| j                        jM                         }|j[                  ||d| j                        jM                         }|je                  | j                  | j                  z  d| j                        }|je                  | j                  | j                  z  d| j                        }| jf                  || jf                  z  z
  | jf                  z  }| jb                  d
   ti        ||      z  }||d
   z  }|j                  |j                        |z  }||||fD cg c]  }tk        ||| jf                         c}\  }}}}|jm                  dddd      }t        jn                  |d      }t        jH                  tq        |            }|d d d d d d d d d d d f   |d d d d d d d d d d d f   z  } | j/                  d      }!|!d
   |jm                  ddddd      d
   z  }"|"j/                  d      }#|#d
   |d d d d d f   z  j/                  d      }$t        jH                  |d d d d d d dd f   |z
        }%||%jm                  dddd      d
   z  }&|&jm                  ddddd      d
   |jm                  ddddd      dd d d f   z  j/                  d      jm                  ddddd      }'|.|j                  r"|j                  | j                     d d d df   }(nt        jr                  |'d d d df         }(t        jt                  |(|'gd      }'t        jH                  tq        t<        j>                  jA                  |d d d d d d df   d                  })|'jm                  ddddd      }*|)d   |*d d d d d df   z  j/                  d      }+|+jm                  ddddd      },|,d d d df   |,d d df   }}'t        jH                  |      }-|dd d d f   |'d d d d d df   z  }.|-jm                  dddd      }/|.j/                  d      |/d
   z  }0|$|0z   }|j[                  |d| j                  | jF                        }||z   }|dkD  r|d d d |d d d d f   }|j[                  ||d      }|*|(|j                  | j                     j-                  |       | jw                  ||
      }1| jy                  |1j                  |            }2|2S c c}w )Nr#   r?   r   r  r   r   r   .rd   ).N).NNr  )r  output_size   )r#   r   )=rF   rA   rh   r   r  r3   r  rB   rk   r   rm   r   r  r   rv   r   clonerb   	unsqueezeru   r   ndimr  sumr   r5   r   r   r   r   r   rD   r  ro   ry   r   r  r   r  r  r   softplusr   r   rC   r   r   rG   bmmr   repeat_interleaver   r   r   permutecumsumr   
zeros_likecatr   r   )3r8   input_statesr  r   ra   r  r   rA   r*  r"  rL   rK   r%  r-  r   r&  r'  r  r   dAdBdBxrv   ssm_states_reshaped
C_reshapedyr   pad_size
D_residualtA_cumsumLG_intermediateGM_intermediateMY_diagdecay_statesB_decay_contractionstatesprevious_statesdecay_chunkstates_permutedresult
new_statesstate_decay_outC_times_statesstate_decay_out_permutedY_offr0  contextualized_statess3                                                      r<   torch_forwardzZamba2MambaMixer.torch_forward  sz   !-!3!3
GQ""#(G(G#||L,@,@,CD)%))NA<M2N ,~aDj/I IMMeT#||L9!''+a$2H2H.HH1t}}K\_c_r_rKrrtx  uC  uC  C  HI  I(8(>(>t55t~~V\^ )? )
%1dM2
 #$//?EEGI!]%9%9:I..~~a()55dnnE
"ZZ
2BG
ANASASWXAX}Q1W'=^k
1a8$((8>>zJ %		*--8H8O8O*PSWS^S^SeSefgijlmfmSn*ntv w%%!T[[%5%55M $ 7 : :5 A!T3, O - 7 7! <]]..!**]-@-@-DDaH
 ((8>>zJ $])C)M)MaPQ)R STUW_X_W_abTb c!-eiiPQ@Q6R)//E%2^Aq$J5O%O$S$STY$ZMT^^T]]D<O<OP$++5I !HHT[[1H1HA1N%OPSU]V]U]P]%^%h%hijlm%noM#kk-$:P:PRVR_R_bfbubuRuw{  xE  xE  HL  H[  H[  x[  :\  bd  eq!YYtzz'')**#(G(G &(WW\AtSL!r!Q'{1dC<7PBa#**:rxx|T]]SBll9-44T\\5G5G5JDMMZG$$--b7::bhh3G.GHBR!3!34B/"))$..$--I\I\]``glgtgt`uA2i=1,-B
 		*dmmR8dAFAT]]DNNdmm4SUVU\U\]_U`allnA		*b!''"+6AI3a<0B *11*b$--PM}Y//C ##DNN399''7"<sB 		*dmmR8dAFAT]]DNNdmm4SUVU\U\]_U`allnA		*b!''"+6A &00@CCAGGLJ",//*t~~2Mt}}^b^q^q"r
T^^ ;T=P=PRSTJ		-z:Az4>>4==AA y!((a$--HA]Q&&**1773A 		*b)!T3,7A ''T\\(9:BR!3!34B)11*gr4==Y__aM		*gD4G4GHNNPA		*gr43F3FGMMOA##DNNdmm$CX\XfXf#gA##DNNdmm$CX\XfXf#gA'DOO*CCtVH	*-?x-XXJ *ByM9M](()B.A cpqrtuwxay%z\]&9!Xt&W%z"M1a 		!Q1%A||A2.H 		+a.)A q!Qa23a1dAq!8K6LLN""r"*A y\AIIaAq!,DY,OON""r"*A 	l]1a:%>>CCAFF !99XaArsl%;h%FGL"#l&:&:1aA&Fy&Q"Q)11!Q1a@K}OdOdefhiklnoqrOstwy}  @A  uA  PB  B  G  G  LM  G  N  V  V  WX  Z[  ]^  `a  cd  eF'L,K,K"."9"9$.."I!TSV,"W"'"2"26!RaR%="AYY8a@F))K0A0A(1aQRTV;BWY_0`$abK$nnQ1a;O!/2_Q4QT_5UUZZ_`ZaF1aA6J *1crc6 2Jq"u4EIF $ii1OT1oq!T30GGN'6'>'>q!Q'J$#''+.Fy.QQE A		*b$..$--HAJA!|a'1a'(		*gr2A$)A''7==iHii4(
 !%knnU.C D$$I &{s   zc                     t         r?d| j                  j                  j                  j                  v r| j                  |||      S | j                  |||      S )Ncuda)r   r   r5   rb   typer1  r^  )r8   rK   r  r   s       r<   rS   zZamba2MambaMixer.forward  sN     "f0C0C0J0J0O0O&O,,]L.YY!!-~NNr=   r/   r+   )rT   rU   rV   r   r$   r   ri   r1   r3   r   r_   r1  r^  rS   rW   rX   s   @r<   r   r     s    ?| ? ?H <@15	T||T 78T !.	Tn%AY8Z %qyz  {G  {G  rH %J <@15		O 78	O !.		Or=   r   c                   8     e Zd Zddedee   f fdZddZ xZS )	Zamba2MLPr`   r   c           	          t         	|           || _        |j                  | _        |j                  | _        || _        || _        t        j                  | j                  d| j                  z  |j                        | _
        t        j                  | j                  | j                  |j                        | _        t        |j                     | _        t        j                  g       | _        t#        | j
                        D ]  }||j$                  z  |k(  rt        j&                  t        j                  | j                  j                  | j                  j(                  d      t        j                  | j                  j(                  d| j                  z  d            }nt        j*                         }| j                   j-                  |        |j.                  }t1        |      D ci c]  \  }}||
 c}}| _        yc c}}w )aQ  
        This MLP layer contributes to tied transformer blocks aimed to increasing compute without increasing model size. Because this layer
        is tied, un-tied adapter modules (formally same as LoRA, but used in the base model) are added to the up and gate projectors to increase expressivity with a small memory overhead.
        r   r   FN)r0   r1   r`   r9   rk   r   r   r   r   r   gate_up_proj	down_projr	   
hidden_actact_fnr   gate_up_proj_adapter_listrw   r   r   r   r   r|   r   r   r   )
r8   r`   r   r   r   gate_up_proj_adapterr   r   r   r;   s
            r<   r1   zZamba2MLP.__init__  s   
 	!--!'!9!9"4 IId&6&6D<R<R8RY_YoYop4#9#94;K;KRXRhRhiV../)+r):&t../ 	HA6(((H4')}}IIdkk55t{{7O7OV[\IIdkk66D<R<R8RY^_($
 (*{{}$**112FG	H !11;D_;UV<5%%,VVs   3H
c                     | j                  |      }| j                  |   }| | j                  |   |      z   }t        j                  |dd      }| j                  |d         |d   z  }| j                  |      }|S )Nr   r?   r  r   r#   )re  r   ri  r3   chunkrh  rf  )r8   hidden_stater   gate_up_stateoutputs        r<   rS   zZamba2MLP.forward  s    )),7NN9-	%(Q(F(Fy(QR^(__M1"={{=#34}Q7GG-r=   r+   r/   )	rT   rU   rV   r$   r   ri   r1   rS   rW   rX   s   @r<   rc  rc    s%    W| WPXY\P] W<r=   rc  c                   P    e Zd Zddedee   dee   f fdZ eddd      	 	 	 	 dd	ej                  d
ej                  dedeej                     dee
   dee   deej                     dee   deej                   eeej                   ej                   f      f   fd       Z xZS )Zamba2AttentionDecoderLayerr`   r   r   c                     || _         t        |j                        }t        |   ||       t        |d||      | _        t        |||      | _        y )Nr?   )r   r   r   )r   r   )	r   r   r   r0   r1   r   	self_attnrc  feed_forward)r8   r`   r   r   num_gsr;   s        r<   r1   z$Zamba2AttentionDecoderLayer.__init__  sO     V,,-+(2RXckl%fRZ[r=   r   r   r   r   rK   original_hidden_statesr   output_attentionsr   r   r   c           
          t        j                  ||gd      }| j                  |      } | j                  d||||||d|\  }}	| j	                  |      }| j                  ||      }|f}
|r|
|	fz  }
|
S )a  
        Args:
            hidden_states (`torch.FloatTensor`): output of previous Mamba layer of shape `(batch, seq_len, embed_dim)`
            original_hidden_states (`torch.FloatTensor`): word embedding output of shape `(batch, seq_len, embed_dim)`.
                This is concatenated with `hidden_states` (which is the output of the previous (mamba) layer). The
                concatenated tensor is then used as input of the pre-attention RMSNorm
                (see fig. 2 in https://huggingface.co/papers/2405.16712).
            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
                `(batch, sequence_length)` where padding elements are indicated by 0.
            past_key_values (`Zamba2HybridDynamicCache`, *optional*): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            position_embeddings (`tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
                Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
                with `head_dim` being the embedding dimension of each attention head.
        r?   r  )rK   r   r   r   rw  r   r\   )r3   concatenateinput_layernormrs  pre_ff_layernormrt  )r8   rK   rv  r   r   r   rw  r   r   self_attn_weightsoutputss              r<   rS   z#Zamba2AttentionDecoderLayer.forward  s    @ ))=:P*QWYZ,,];+94>> ,
')+/ 3,
 ,
(( --m<))-C ")++Gr=   r+   )NNFN)rT   rU   rV   r$   r   ri   r1   r   r3   r   r_   boolr   r   r
   r   FloatTensorrS   rW   rX   s   @r<   rq  rq    s	   \| \x} \X`adXe \ %0A6R 26>B,1:>3||3 !&3 	3
 !.3 "":;3 $D>3 &e&6&673 -.3 
u  (51B1BEDUDU1U+V"WW	X3 S3r=   rq  c                   (     e Zd Zdedef fdZ xZS )Zamba2MambaDecoderLayerr`   r   c                     t         |   ||       t        ||      | _        t	        |j
                  |j                        | _        y )N)r`   r   r:   )r0   r1   r   mambarZ   r9   rms_norm_epsrz  )r8   r`   r   r;   s      r<   r1   z Zamba2MambaDecoderLayer.__init__4  s;    +%VyI
,V-?-?VEXEXYr=   )rT   rU   rV   r$   ri   r1   rW   rX   s   @r<   r  r  3  s    Z| Z Z Zr=   r  c                       e Zd Zdedej
                  def fdZ eddd      	 	 	 	 	 	 	 	 dd	e	j                  d
ee	j                     dee   dee	j                     dee	j                     dee   dee   dee   dee	j                     dee	j"                  eee	j"                  e	j"                  f      f   fd       Z xZS )Zamba2HybridLayershared_transformerlinearr  c                 :    t         |   |||       | `|| _        y r/   )r0   r1   shared_transfr  )r8   r  r  r  r;   s       r<   r1   zZamba2HybridLayer.__init__;  s%     	+VU;"4r=   r   r   r   r   rK   rv  r   r   causal_maskrw  	use_cacher   r   c
           	          | j                  |||||||	      }
|
d   }|r|
d   }| j                  |      }| j                  |||||||	      }
|r|
d   f|
dd z   }
|
S )aY  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            original_hidden_states (`torch.FloatTensor`): word embedding output that will be concatenated with
            hidden activations to form the input of the shared transformer layer.
            layer_idx (`int`): layer number.
            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
                `(batch, sequence_length)` where padding elements are indicated by 0.
            past_key_values (`Zamba2HybridDynamicCache`, *optional*): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            position_embeddings (`tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
                Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
                with `head_dim` being the embedding dimension of each attention head.
        )rv  r   r   r   rw  r   r   r#   )transformer_hidden_statesr   r   rw  r  r   r   N)r  r  mamba_decoder)r8   rK   rv  r   r   r  r   rw  r  r   layer_outputsr  r|  s                r<   rS   zZamba2HybridLayer.forwardB  s    B //#9&+/ 3 0 
 %2!$4! -a 0$(KK0I$J!**&?)+/ 3 + 
 *1-/@AMRSRTDUUMr=   )NNNNNFFN)rT   rU   rV   rq  r   r   r  r1   r   r3   r   r   ri   r_   r~  r   r   r  rS   rW   rX   s   @r<   r  r  :  s'   5"=5GIyy5Yp5 %0A6R :>#'15.2>B,1$):>>||> !) 6> C=	>
 !.> ell+> "":;> $D>> D>> &e&6&67> 
u  (51B1BEDUDU1U+V"WW	X> S>r=   r  c                   N     e Zd ZU eed<   dZdZddgZdZdZ	dZ
dZdZ fdZ xZS )Zamba2PreTrainedModelr`   modelTrq  r  r   c                    t         |   |       t        |t              rt	        j
                  t	        j                  | j                  j                        t        j                  | j                  j                        t        j                  | j                  j                        z
  z  t        j                  | j                  j                        z         j                  | j                  j                        }|t	        j                  t	        j                  |              z   }|j                   j"                  j%                  |       t	        j&                  d|j(                  dz         }|j*                  j"                  j%                  t	        j                  |             |j,                  j"                  j/                  d       y y )N)minr#   g      ?)r0   _init_weights
isinstancer   r3   r  randr`   rp   mathr   r   r   r   time_step_floorexpm1r   datar  r   r   r   r   fill_)r8   moduler%  inv_dtr  r;   s        r<   r  z#Zamba2PreTrainedModel._init_weights  s:   f%f./

4;;44588DKK556$++B[B[9\\^((4;;4456 e33e4	  %))U[["%5$566FNN%%f-Q 0 01 45ALL##EIIaL1HHMM$ 0r=   )rT   rU   rV   r$   __annotations__base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_flex_attn_supports_sdpa_is_statefulr  rW   rX   s   @r<   r  r    sG    &*#68QR"3NL% %r=   r  c                      e Zd ZdZdefdZd Z	 	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     d	ee   d
eej                     dee   dee   dee   dee   deej                     deeef   fdZy)Zamba2Modelzh
    Model consisting of *config.num_hidden_layers* layers.

    Args:
        config: Zamba2Config
    r`   c                    t         j                  | |       || _        |j                  | _        |j
                  | _        t        j                  |j
                  |j                  | j                        | _	        t        |j                        D cg c]  }t        ||       }}g }g }|j                  | _        t        |j                        D ]  }|j                  |   dk(  r|j                  t!        ||             2|j                  |   dk(  sE|j                  t        j"                  | j                  j                  | j                  j                  d             |j                  t!        ||              t%        |      }t%        |      }t'        |      }| j)                  |||      }t        j*                  |      | _        |j.                  | _        t1        |j                  |j2                        | _        |j6                  r1|j8                  rt:        j=                  d       t?        |      | _         d| _!        | jE                          y c c}w )	N)r   r  r   re   Fr   r  ze`use_long_context` set to `True`: using rescaled `rope_theta` and extended `max_position_embeddings`.)#r  r1   r`   pad_token_idpadding_idx
vocab_sizer   	Embeddingr9   embed_tokensrw   r   rq  rg   rx   r|   r  r   iterr   
get_layersr   layersr   rZ   r  final_layernormr   use_long_contextr   r   r   
rotary_embgradient_checkpointing	post_init)r8   r`   kblocksmamba_layerslinear_layersr   r  s           r<   r1   zZamba2Model.__init__  s   &&tV4!.. ++LL):):F<N<NPTP`P`aKPQWQfQfKgha-fqAhh!'!9!9v//0 	RA''*g5##$;Fa$PQ))!,8$$RYYt{{/F/FH_H_fk%lm##$;Fa$PQ	R L)]+vEmmF+$*$?$?!,V-?-?VEXEXY&&##{ 4F;DO&+# 	7 is   Ic           
      x   g }g | _         d| _        t        | j                        D ]  \  }}|dk(  r| j                  dk(  r|| _        t	        |      }| j
                  j                  t        | j
                  j                        z  dkD  r_d| d}t        j                  |dz   dz   dz   d	z   d
z         }	| j                   j                  |	       d}
| j                  D ]q  }|dk(  re|
| j
                  j                  z  |j                  k(  r?t        j                  dt        |
      z   dz         }| j                   j                  |       |
dz  }
s | j
                  j                  rd}
| j                  D ]q  }|dk(  re|
| j
                  j                  z  |j                  k(  r?t        j                  dt        |
      z   dz         }| j                   j                  |       |
dz  }
s |j                  t        |t	        |      t	        |                   |j                  t	        |              |S )Nr   re   r#   z	^layers\.z\.shared_transformer\.z(?:z3self_attn\.(?:q_proj|k_proj|v_proj|o_proj)\.weight|z1feed_forward\.(?:gate_up_proj|down_proj)\.weight|z,(?:input_layernorm|pre_ff_layernorm)\.weightz)$z>^shared_transformer\.feed_forward\.gate_up_proj_adapter_list\.z\.(?:0|1)\.weight$zg^shared_transformer\.self_attn\.(?:linear_q_adapter_list|linear_k_adapter_list|linear_v_adapter_list)\.)_tied_weights_keysfirst_transformer_layer_idr   rg   nextr`   r   r   r   recompiler|   r   r   r   r  )r8   r  r  r  r  layer_id
layer_typeblockprefix_patternmain_keys_pattern
adapter_id_layer_typeadapter_patternattn_adapter_patterns                 r<   r  zZamba2Model.get_layers  sH   "$*+'$-d.D.D$E )	2 HjX%22a76>D3V;;--DKK4P4P0QQTUU(1(;Q%RN(*

& !PQ OO J	J
   )% ++223DE!"J'+'='= (&(2zDKKD^D^7^bgbpbp7p.0jj a"%j/!2"7!8/O
 !33::?K"a
( {{??%&
+/+A+A 	,K*h6:HbHb;bfkftft;t79zz%q&)*o%6 '<%<8" 4 !% 7 7 > >?S T&!OJ	, /tM7JDQ]L^_`d<01S)	2T r=   N	input_idsr   position_idsr   inputs_embedsr  rw  output_hidden_statesreturn_dictr   r   c                 r   ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }|	|	n| j                   j                  }	|d u |d uz  rt        d      | j                  r%| j                  r|rt        j                  d       d}|| j                  |      }|}t        j                  |      }|rO|M||j                  d   n|j                  d   }t        | j                   || j                  | j                         }|
R||j#                  | j$                        nd}t        j&                  |||j                  d   z   |j                         }
||
j)                  d      }| j+                  |||
      }| j                   j,                  r| j/                  ||      }nd }|rd	nd }|rd	nd }t1        | j2                        D ]r  \  }}|r||fz  }| j                  r1| j                  r%| j5                  |j6                  |||||||||
      }n ||||||||||
	      }|d   }|sd|d   j||d   fz  }t | j9                  |      }|r||fz  }||j:                  sd|_        t=        ||r|nd ||      }|	r|S |j?                         S )NzaYou cannot specify both input_ids and inputs_embeds at the same time, and must specify either onezX`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.Fr   )rA   rb   r  r#   rf   r\   )rv  r   r   r  r   rw  r  r   T)last_hidden_stater   rK   
attentions) r`   rw  r  r  use_return_dict
ValueErrorr  r   r   r   r  r3   r5  rF   r_   rA   rb   r   r  r   r6  _update_causal_maskr   r  r   r  _gradient_checkpointing_func__call__r  rh   r   to_tuple)r8   r  r   r  r   r  r  rw  r  r  r   rK   rv  ra   past_seen_tokensr  r   all_hidden_statesall_self_attnsr   layerr  ro  s                          r<   rS   zZamba2Model.forward  s    2C1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	%0%<k$++B]B]-t";<s  &&4==Yj I  --i8M%!&]!; 0/8/D+-J]J]^_J`J6t{{JVZV`V`imitituO! #.  ..9X9X.Y 
 #\\ "2]5H5H5K"KTaThThN )33A6L..~}n] ;;##"&//-"N"&"6BD0d )$++ 6 "	:Iu#!m%55!**t}} $ A ANN!*"#%'! !&!+A'#1 +$3&7'(;
! *!,M  #/"}Q'7&99NE"	:H ,,];  -!11&/Q/Q15O.(+/8Od+%	
 %v;&//*;;r=   )
NNNNNNNNNN)rT   rU   rV   r   r$   r1   r  r   r3   r   r   r_   r  r~  r   r   r   rS   r\   r=   r<   r  r    s   "| "H.d 151537>B59$(,0/3&*59v<E,,-v< !.v< u//0	v<
 "":;v<   1 12v< D>v< $D>v< 'tnv< d^v< !!1!12v< 
u--	.v<r=   r  c                       e Zd Zy)Zamba2ForCausalLMNr[   r\   r=   r<   r  r  v  r]   r=   r  c                       e Zd Zy)Zamba2ForSequenceClassificationNr[   r\   r=   r<   r  r  z  r]   r=   r  )r  r  r  r  )Or  r  	itertoolsr   typingr   r   r   r3   torch.utils.checkpointr   activationsr	   modeling_flash_attention_utilsr
   modeling_outputsr   modeling_utilsr   r   processing_utilsr   utilsr   utils.deprecationr   utils.import_utilsr   r   llama.modeling_llamar   r   mamba2.modeling_mamba2r   r   r   zamba.modeling_zambar   r   r   r   r   r   r   r    r!   r"   configuration_zamba2r$   +mamba_ssm.ops.triton.selective_state_updater%   !mamba_ssm.ops.triton.ssd_combinedr&   r'   causal_conv1dr)   r*   r  r   _CONFIG_FOR_DOC
get_loggerrT   r   Moduler-   rZ   r_   r   r   r   rc  rq  r  r  r  r  r  r  __all__r\   r=   r<   <module>r     s     	  , ,    ! B 7 F & 1 N Y Y   / RmmZjW57WDD-7**46FH\]^  '			H	%; ;*	L 	D36 D3N	0 	k)n k)\kOryy kO\'		 'T<"< <~Z4 ZG( GT%O %:R<*3 R<j	( 		&D 	r=   