
    h
                       d dl mZmZ d dlmZ d dlmZmZ d dlZddl	m
Z
 ddlmZmZmZmZmZ  e       rd dlmZ  ed	d
      Z ej*                  e      Z G d de      Z G d de      Z G d de      Z G d de      Z G d de      Z G d de      Z G d de      Z G d de      Z G d de      Z  G d d      Z! G d  d!e!      Z" G d" d#e!      Z# G d$ d%e!      Z$ G d& d'e!      Z% G d( d)e"      Z& G d* d+e#      Z' G d, d-e#      Z( G d. d/e#      Z) G d0 d1e#      Z* G d2 d3e#      Z+ G d4 d5e$      Z, G d6 d7e$      Z- G d8 d9e!      Z.y):    )ABCabstractmethod)Iterable)AnyOptionalN   )PretrainedConfig)is_hqq_availableis_quanto_greateris_torch_greater_or_equalis_torchdynamo_compilinglogging)	Quantizerz2.7T
accept_devc                   v   e Zd ZdZdZd Zd Zedej                  fd       Z
e	 ddej                  dej                  d	eeeef      d
eej                  ej                  f   fd       Zedej                  d
eeef   fd       Zed
efd       Zed
efd       Zd Zd ZddZdej0                  d
dfdZy)CacheLayerMixinz0Base, abstract class for a single layer's cache.Fc                 "    d\  | _         | _        y )N)NN)keysvaluesselfs    V/var/www/html/eduruby.in/venv/lib/python3.12/site-packages/transformers/cache_utils.py__init__zCacheLayerMixin.__init__   s    !+	4;    c                 0    | j                   j                   S N)	__class____name__r   s    r   __repr__zCacheLayerMixin.__repr__"   s    ..))*+r   
key_statesc                      y r    r   r!   s     r   lazy_initializationz#CacheLayerMixin.lazy_initialization%   s    =@r   Nvalue_statescache_kwargsreturnc                      y r   r#   r   r!   r&   r'   s       r   updatezCacheLayerMixin.update(   s     -0r   cache_positionc                      y r   r#   )r   r,   s     r   get_mask_sizeszCacheLayerMixin.get_mask_sizes-   s    ORr   c                      y r   r#   r   s    r   get_seq_lengthzCacheLayerMixin.get_seq_length0       %(r   c                      y r   r#   r   s    r   get_max_cache_shapez#CacheLayerMixin.get_max_cache_shape3   s    *-r   c                     | j                   E| j                   j                  dd      | _         | j                  j                  dd      | _        yy)z(Offload this layer's data to CPU device.NcpuTnon_blocking)r   tor   r   s    r   offloadzCacheLayerMixin.offload6   sA    99 		U>DI++..T.BDK !r   c                    | j                   }| j                   j                  | j                  k7  rY| j                   j                  | j                  d      | _         | j                  j                  | j                  d      | _        yyy)zcIn case of layer offloading, this allows to move the data back to the layer's device ahead of time.NTr6   )r   devicer8   r   r   s    r   prefetchzCacheLayerMixin.prefetch<   sb    99 TYY%5%5%D		T[[tDDI++..4.HDK &E r   c                     | j                   4| j                   j                          | j                  j                          t        | d      rd| _        yy)z4Resets the cache values while preserving the objectsNcumulative_lengthr   )r   zero_r   hasattrr>   r   s    r   resetzCacheLayerMixin.resetB   sB    99 IIOOKK4,-%&D" .r   beam_idxc                 <   | j                         dkD  r| j                  j                  d|j                  | j                  j                              | _        | j
                  j                  d|j                  | j
                  j                              | _        yy)z,Reorders this layer's cache for beam search.r   N)r0   r   index_selectr8   r;   r   r   rB   s     r   reorder_cachezCacheLayerMixin.reorder_cacheK   sn     1$		..q(++dii>N>N2OPDI++221hkk$++BTBT6UVDK %r   r   r(   N)r   
__module____qualname____doc__is_compileabler   r    r   torchTensorr%   r   dictstrr   tupler+   intr.   r0   r3   r9   r<   rA   
LongTensorrF   r#   r   r   r   r      s   :N,, @ell@ @mq0,,06;ll0RZ[_`ceh`h[iRj0	u||U\\)	*0 0 RU\\ReCHoR R(( (-S- -CI'We&6&6 W4 Wr   r   c                   D   e Zd ZdZdZdej                  fdZ	 ddej                  dej                  dee	e
ef      deej                  ej                  f   fd	Zd
ej                  deeef   fdZdefdZdefdZdeddfdZdeddfdZdej                  ddfdZy)DynamicLayerz
    A cache layer that grows dynamically as more tokens are generated. This is the default for generative models.
    It stores the key and value states as tensors of shape `[batch_size, num_heads, seq_len, head_dim]`.
    Fr!   c                    |j                   |j                  c| _         | _        t        j                  g | j                   | j                        | _        t        j                  g | j                   | j                        | _        y )Ndtyper;   )rW   r;   rL   tensorr   r   r$   s     r   r%   z DynamicLayer.lazy_initializationZ   sV    ","2"2J4E4E
DKLL4::dkkJ	ll2TZZLr   Nr&   r'   r(   c                    | j                   | j                  |       t        j                  | j                   |gd      | _         t        j                  | j                  |gd      | _        | j                   | j                  fS )  
        Update the key and value caches in-place, and return the necessary keys and value states.

        Args:
            key_states (`torch.Tensor`): The new key states to cache.
            value_states (`torch.Tensor`): The new value states to cache.
            cache_kwargs (`dict[str, Any]`, *optional*): Additional arguments for the cache.

        Returns:
            tuple[`torch.Tensor`, `torch.Tensor`]: The key and value states.
        dim)r   r%   rL   catr   r*   s       r   r+   zDynamicLayer.update_   se    $ 99$$Z0IItyy*52>	iil ;Dyy$++%%r   r,   c                 V    d}|j                   d   }| j                         }||z   }||fS )zDReturn the length and offset of the cache, used to generate the maskr   )shaper0   )r   r,   	kv_offsetquery_lengthpast_seen_tokens	kv_lengths         r   r.   zDynamicLayer.get_mask_sizesx   s<    	%++A...0 #33	)##r   c                     | j                   | j                   j                         dk(  ry| j                   j                  d   S )1Returns the sequence length of the cached states.r   r[   )r   numelr`   r   s    r   r0   zDynamicLayer.get_seq_length   s4    99		 1Q 6yyr""r   c                      y)zeReturns the maximum sequence length of the cache object. DynamicLayer does not have a maximum length.r#   r   s    r   r3   z DynamicLayer.get_max_cache_shape   s    r   
max_lengthc                     |dk  r| j                         t        |      z
  }| j                         |k  ry| j                  dd|ddf   | _        | j                  dd|ddf   | _        y)z
        Crop the past key values up to a new `max_length` in terms of tokens. `max_length` can also be negative
        to remove `max_length` tokens.
        r   N.)r0   absr   r   )r   rj   s     r   cropzDynamicLayer.crop   sl    
 >,,.Z@J J.IIc;J;12	kk#{
{A"56r   repeatsc                     | j                         dkD  rE| j                  j                  |d      | _        | j                  j                  |d      | _        yy)z8Repeat the cache `repeats` times in the batch dimension.r   r\   N)r0   r   repeat_interleaver   r   rn   s     r   batch_repeat_interleavez$DynamicLayer.batch_repeat_interleave   sN     1$		33G3CDI++77Q7GDK %r   indicesc                     | j                         dkD  r-| j                  |df   | _        | j                  |df   | _        yy)z<Only keep the `indices` in the batch dimension of the cache.r   .N)r0   r   r   r   rs   s     r   batch_select_indicesz!DynamicLayer.batch_select_indices   s@     1$		'3,/DI++gsl3DK %r   r   )r   rH   rI   rJ   
is_slidingrL   rM   r%   r   rN   rO   r   rP   r+   rQ   r.   r0   r3   rm   rr   rv   r#   r   r   rT   rT   R   s    
 JMell M 26	&LL& ll& tCH~.	&
 
u||U\\)	*&2$U\\ $eCHo $# #S 7s 7t 7Hs Ht H4ELL 4T 4r   rT   c                       e Zd ZdZdZdef fdZ	 ddej                  dej                  de	e
eef      d	eej                  ej                  f   fd
Zdej                  d	eeef   fdZd	efdZd	efdZded	df fdZ xZS )DynamicSlidingWindowLayerz
    A cache layer that grows dynamically as more tokens are generated, up until the sliding window size.
    It stores the key and value states as tensors of shape `[batch_size, num_heads, min(seq_len, sliding_window), head_dim]`.
    Tsliding_windowc                 >    t         |           || _        d| _        y Nr   )superr   rz   r>   )r   rz   r   s     r   r   z"DynamicSlidingWindowLayer.__init__   s    ,!"r   Nr!   r&   r'   r(   c                    | j                   | j                  |       | xj                  |j                  d   z  c_        t	        j
                  | j                   |gd      }t	        j
                  | j                  |gd      }|dddd| j                   dz   dddf   | _         |dddd| j                   dz   dddf   | _        ||fS )rZ   Nr[   r\   r   )r   r%   r>   r`   rL   r^   r   rz   )r   r!   r&   r'   full_key_statesfull_value_statess         r   r+   z DynamicSlidingWindowLayer.update   s    $ 99$$Z0*"2"22"66  ))TYY
$;D!IIt{{L&ArJ#Aq4+>+>*>*B*Da$GH	'1t/B/B.BQ.F.H!(KL  111r   r,   c                    |j                   d   }|d   }t        j                  || j                  z
  dz   d      }| j	                         | j                  k\  r| j                  dz
  |z   }||fS | j	                         |z   }||fS NReturn the length and offset of the cache, used to generate the attention maskr   r   )min)r`   rL   clamprz   r0   r   r,   rb   first_cache_positionra   rd   s         r   r.   z(DynamicSlidingWindowLayer.get_mask_sizes   s    %++A.-a0KK 4t7J7J JQ NTUV	 D$7$77++a/,>I )## ++-<I)##r   c                     | j                   S rf   r>   r   s    r   r0   z(DynamicSlidingWindowLayer.get_seq_length       %%%r   c                     | j                   S z+Return the maximum cache shape of the cacherz   r   s    r   r3   z-DynamicSlidingWindowLayer.get_max_cache_shape   s    """r   rj   c                     | j                         | j                  k\  rt        d      t        |   |       | j
                  j                  d   | _        y)z
        Crop the past key values up to a new `max_length` in terms of tokens. `max_length` can also be
        negative to remove `max_length` tokens.
        zCannot `crop` a `DynamicSlidingWindowLayer` after it has seen more tokens than itssliding window (otherwise some states are lost)r[   N)r0   rz   
ValueErrorr}   rm   r   r`   r>   )r   rj   r   s     r   rm   zDynamicSlidingWindowLayer.crop   sR    
  D$7$77B  	Z !%!4r   r   )r   rH   rI   rJ   rw   rQ   r   rL   rM   r   rN   rO   r   rP   r+   r.   r0   r3   rm   __classcell__r   s   @r   ry   ry      s    
 J#s # 26	2LL2 ll2 tCH~.	2
 
u||U\\)	*2B$U\\ $eCHo $& &#S #5s 5t 5 5r   ry   c                       e Zd ZdZdZdZdef fdZdej                  fdZ
	 ddej                  dej                  d	eeeef      d
eej                  ej                  f   fdZdej                  d
eeef   fdZd
efdZd
efdZ xZS )StaticLayera  
    A static cache layer that stores the key and value states as static tensors of shape `[batch_size, num_heads, max_cache_len), head_dim]`.
    It lazily allocates its full backing tensors, and then mutates them in-place. Built for `torch.compile` support.

    Args:
        max_cache_len (`int`):
            Maximum number of tokens that can be stored, used for tensor preallocation.
    TFmax_cache_lenc                 0    t         |           || _        y r   )r}   r   r   )r   r   r   s     r   r   zStaticLayer.__init__  s    *r   r!   c                    |j                   \  | _        | _        }| _        |j                  |j
                  c| _        | _        t        j                  | j                  | j                  | j                  | j                  f| j                  | j
                        | _	        t        j                  | j                  | j                  | j                  | j                  f| j                  | j
                        | _
        t               sSt        j                  j                  | j                         t        j                  j                  | j                         yy)a6  
        Lazy initialization of the keys and values tensors. This allows to get all properties (dtype, device,
        num_heads in case of TP etc...) at runtime directly, which is extremely practical as it avoids moving
        devices, dtypes etc later on for each `update` (which could break the static dynamo addresses as well).

        If this is unwanted, one can call `early_initialization(...)` on the Cache directly, which will call this
        function ahead-of-time (this is required for `torch.export` for example). Note that for `compile`, as we
        internally don't compile the prefill, this is guaranteed to have been called already when compiling.
        If compiling the prefill as well, e.g. calling `model.compile(...)` before `generate` with a static cache,
        it is still supported in general, but without guarantees depending on the compilation options (e.g. cuda graphs,
        i.e. `mode="reduce-overhead"` is known to fail). But it will in general work correctly, and prefill should
        not be compiled anyway for performances!
        rV   N)r`   max_batch_size	num_headshead_dimrW   r;   rL   zerosr   r   r   r   _dynamomark_static_address)r   r!   _s      r   r%   zStaticLayer.lazy_initialization  s     AK@P@P=T^Q","2"2J4E4E
DKKK  $..$2D2DdmmT**;;
	
 kk  $..$2D2DdmmT**;;
 ()MM--dii8MM--dkk: *r   r&   r'   r(   c                    | j                   | j                  |       ||j                  d      nd}||n-t        j                  |j
                  d   | j                        }	 | j                   j                  d||       | j                  j                  d||       | j                   | j                  fS # t        $ r/ || j                   dddd|f<   || j                  dddd|f<   Y Ow xY w)rZ   Nr,   r[   )r;      )
r   r%   getrL   aranger`   r;   index_copy_r   NotImplementedError)r   r!   r&   r'   r,   s        r   r+   zStaticLayer.update+  s    $ 99$$Z0 @L?W))*:;]a,8Nell:K[K[\^K_hlhshs>t 	
	=II!!!^Z@KK##A~|D
 yy$++%%	 # 	=.8DIIaN*+0<DKK1n,-	=s   &:B8 85C0/C0r,   c                 &    d}| j                   }||fS )r   r   r   )r   r,   ra   rd   s       r   r.   zStaticLayer.get_mask_sizesQ  s    	&&	)##r   c                 x    | j                   -| j                   d   j                  d      j                         S dS )rf   )r   r   ri   r\   r   )r   anysumr   s    r   r0   zStaticLayer.get_seq_lengthW  s8     7;ii6K		$###+002RQRRr   c                     | j                   S r   r   r   s    r   r3   zStaticLayer.get_max_cache_shape]  s    !!!r   r   )r   rH   rI   rJ   rK   rw   rQ   r   rL   rM   r%   r   rN   rO   r   rP   r+   r.   r0   r3   r   r   s   @r   r   r      s     NJ+c +!;ell !;N 26	$&LL$& ll$& tCH~.	$&
 
u||U\\)	*$&L$U\\ $eCHo $S S"S "r   r   c                        e Zd ZdZdZdedef fdZ	 ddej                  dej                  de	e
eef      d	eej                  ej                  f   fd
Zdej                  d	eeef   fdZd	efdZ xZS )SlidingWindowLayera  
    A static cache layer that stores the key and value states as static tensors of shape
    `[batch_size, num_heads, min(max_cache_len, sliding_window), head_dim]`. It lazily allocates its full backing
    tensors, and then mutates them in-place. Built for `torch.compile` support.

    Args:
        max_cache_len (`int`):
            Maximum number of tokens that can be stored, used for tensor preallocation.
        sliding_window (`int`):
            The size of the sliding window.
    Tr   rz   c                 L    t        ||      }t        | 	  |       d| _        y )Nr   r   )r   r}   r   r>   )r   r   rz   effective_max_cache_lenr   s       r   r   zSlidingWindowLayer.__init__q  s)    "%nm"D'>?!"r   r!   r&   r'   r(   c                 h   | j                   | j                  |       |j                  d      }| j                  | j                  k\  }| xj                  |j
                  d   z  c_        |j
                  d   | j                  kD  rn| j                   j                  |dddd| j                   dddf          | j                  j                  |dddd| j                   dddf          ||fS |r| j                   j                  dd      }| j                  j                  dd      }t        j                  dgt        | j                        }||dddd|f<   ||dddd|f<   | j                   j                  |       | j                  j                  |       n;	 | j                   j                  d||       | j                  j                  d||       | j                   | j                  fS # t        $ r/ || j                   dddd|f<   || j                  dddd|f<   Y Ow xY w)	rZ   Nr,   r[   r   ri   )dimsrV   r   )r   r%   r   r>   r   r`   copy_r   rollrL   rX   rQ   r;   r   r   )	r   r!   r&   r'   r,   is_fullnew_keys
new_valuesindexs	            r   r+   zSlidingWindowLayer.updatev  s   $ 99$$Z0%))*:;((D,>,>>*"2"22"66 "T%7%77IIOOJq!d.@.@-@-BA'EFGKKl1a$2D2D1D1F+IJK|++ yy~~br~2H))"2)6J LL"SEE$.HQ5[!&2Jq!U{# IIOOH%KKj)A		%%aD''><H
 yy$++%%	 ' A2<		!Q./4@Aq.01As   ':G9 95H10H1r,   c                     |j                   d   }|d   }t        j                  || j                  z
  dz   d      }t	        || j                        }||fS r   )r`   rL   r   r   maxr   s         r   r.   z!SlidingWindowLayer.get_mask_sizes  s\    %++A.-a0KK 4t7I7I IA MSTU	d&8&89	)##r   c                     | j                   S r   r   r   s    r   r0   z!SlidingWindowLayer.get_seq_length  r   r   r   )r   rH   rI   rJ   rw   rQ   r   rL   rM   r   rN   rO   r   rP   r+   r.   r0   r   r   s   @r   r   r   b  s    
 J#c #3 # 26	9&LL9& ll9& tCH~.	9&
 
u||U\\)	*9&v$U\\ $eCHo $& &r   r   c                       e Zd ZdZ	 d
dej
                  dej
                  deeee	f      de
ej
                  ej
                  f   fdZdej
                  de
eef   fd	Zy)ChunkedSlidingLayerzl
    An extended SlidingWindowLayer that supports prefill chunking, originally implemented for Llama 4.
    Nr!   r&   r'   r(   c                 ~   | j                   | j                  |       |j                  d      }| j                  }|| j                  k\  }| xj                  |j
                  d   z  c_        |rt        j                  | j                   ddddddddf   |fd      }t        j                  | j                  ddddddddf   |fd      }|j
                  d   dk(  r1| j                   j                  |       | j                  j                  |       | j                   | j                  fS |s||j
                  d   z   | j                  kD  ro|dk(  r|}|}nt        j                  | j                   ddddd|ddf   |fd      }t        j                  | j                  ddddd|ddf   |fd      }nS	 | j                   j                  d||       | j                  j                  d||       | j                   | j                  fS | j                   j                  |dddd| j                   dddf          | j                  j                  |dddd| j                   dddf          ||fS # t        $ r/ || j                   dddd|f<   || j                  dddd|f<   Y w xY w)rZ   Nr,   r[   r   r\   r   r   )r   r%   r   r>   r   r`   rL   r^   r   r   r   r   )	r   r!   r&   r'   r,   r>   r   r   r   s	            r   r+   zChunkedSlidingLayer.update  s   $ 99$$Z0%))*:; 22#t'9'99*"2"22"66#ii1aQ;)?(LRTUO %		4;;q!QR{+C\*RXZ [ #q(		0!!"34yy$++--.1A1A!1DDtGYGYY A%",$0!"'))TYYq!=O>O=OQR7R-SU_,`fh"i$)IIt{{1aASBSASUV;V/WYe.fln$o!A		%%aD''><H 99dkk))		1t/A/A.A.CQ(FGH+Aq43E3E2E2G,JKL  111 ' A2<		!Q./4@Aq.01As   :J 5J<;J<r,   c                     |j                   d   }|d   }| j                  }t        j                  ||z
  dz   d      }||k\  r||z   dz
  }||fS ||k  r||z   |kD  r	||z   }||fS |}||fS r   )r`   r   rL   r   )r   r,   rb   r   rz   ra   rd   s          r   r.   z"ChunkedSlidingLayer.get_mask_sizes  s    %++A.-a0++KK 4~ E IqQ	>1&59I )## "N27Kl7Z]k7k,|;I )## 'I)##r   r   )r   rH   rI   rJ   rL   rM   r   rN   rO   r   rP   r+   rQ   r.   r#   r   r   r   r     s~     26	;2LL;2 ll;2 tCH~.	;2
 
u||U\\)	*;2z$U\\ $eCHo $r   r   c                        e Zd ZdZ	 	 	 	 	 ddededededef
 fdZ	 ddej                  d	ej                  d
ee	e
ef      deej                  ej                  f   fdZed        Zed        ZdefdZ xZS )QuantizedLayera  
    A quantized layer similar to what is described in the [KIVI: A Tuning-Free Asymmetric 2bit Quantization for KV Cache paper](https://huggingface.co/papers/2402.02750).
    It allows the model to generate longer sequence length without allocating too much memory for the key and value caches by
    applying quantization.

    The cache has two types of storage, one for original precision and one for the quantized cache. A `residual length`
    is set as a maximum capacity for the original precision cache. When the length goes beyond maximum capacity, the original
    precision cache is discarded and moved into the quantized cache. The quantization is done per-channel with a set `q_group_size`
    for both Keys and Values, in contrast to what was described in the paper.
    nbitsaxis_key
axis_valueq_group_sizeresidual_lengthc                 v    t         |           || _        || _        || _        || _        || _        d| _        y r|   )r}   r   r   r   r   r   r   r>   r   r   r   r   r   r   r   s         r   r   zQuantizedLayer.__init__"  s=     	
 $(.!"r   r!   r&   r'   r(   c                    | xj                   |j                  d   z  c_         | j                  u| j                  |       | j	                  |j                         | j                        | _        | j	                  |j                         | j                        | _	        ||fS | j                  | j                        }| j                  | j                        }t        j                  || j                  |gd      }t        j                  || j                  |gd      }| j                  j                         dk(  r| j                  j                  d   dz   | j                  k\  r| j	                  |j                         | j                        | _        | j	                  |j                         | j                        | _	        t        j                   g |j"                  |j$                        | _        t        j                   g |j"                  |j$                        | _        ||fS t        j                  | j                  |gd      | _        t        j                  | j                  |gd      | _        ||fS )rZ   r[   )axisr\      r   rV   )r>   r`   r   r%   	_quantize
contiguousr   _quantized_keysr   _quantized_values_dequantizerL   r^   r   r]   r   rX   rW   r;   )r   r!   r&   r'   dequant_keysdequant_valueskeys_to_returnvalues_to_returns           r   r+   zQuantizedLayer.update2  s   " 	*"2"22"66 99$$Z0#'>>*2G2G2IPTP]P]>#^D %)^^L4K4K4MTXTcTc^%dD"|++''(<(<=))$*@*@AL$))Z#HbQ 99ndkk<%PVXY99==?aDIIOOB$7!$;t?S?S$S#'>>.2K2K2MTXTaTa>#bD %)^^4D4O4O4QX\XgXg^%hD"Rz/?/?
HYHYZDI,,r1A1A*J[J[\DK
 /// 		499j"9rBDI))T[[,$?RHDK///r   c                      y r   r#   )r   rX   r   s      r   r   zQuantizedLayer._quantize[  s    '*r   c                      y r   r#   )r   q_tensors     r   r   zQuantizedLayer._dequantize^  r1   r   c                     | j                   S r   r   r   s    r   r0   zQuantizedLayer.get_seq_lengtha  r   r   r   r   r   @      r   )r   rH   rI   rJ   rQ   r   rL   rM   r   rN   rO   r   rP   r+   r   r   r   r0   r   r   s   @r   r   r     s    	 "## # 	#
 # #( 26	'0LL'0 ll'0 tCH~.	'0
 
u||U\\)	*'0R * *( (& &r   r   c                   L     e Zd Z	 	 	 	 	 d	dededededef
 fdZd Zd Z xZS )
QuantoQuantizedLayerr   r   r   r   r   c                    t         	|   |||||       t        dd      rddlm}m}m} nt        d      | j                  dvrt        d	| j                         | j                  d
vrt        d| j                         | j                  d
vrt        d| j                         | j                  dk(  r|n|| _         |       | _        y )Nr   r   r   r   r   z0.2.5Tr   r   )MaxOptimizerqint2qint4ziYou need optimum-quanto package version to be greater or equal than 0.2.5 to use `QuantoQuantizedCache`. )r   r   zA`nbits` for `quanto` backend has to be one of [`2`, `4`] but got )r   ri   zE`axis_key` for `quanto` backend has to be one of [`0`, `-1`] but got zG`axis_value` for `quanto` backend has to be one of [`0`, `-1`] but got r   )r}   r   r   optimum.quantor   r   r   ImportErrorr   r   r   r   qtype	optimizer)
r   r   r   r   r   r   r   r   r   r   s
            r   r   zQuantoQuantizedLayer.__init__g  s     	!%+ 	 	
 W6AA{  ::V#`aeakak`lmnn=='deiererdstuu??')YZ^ZiZiYjk  #jjAoU5
%r   c                     ddl m} | j                  || j                  || j                        \  }} ||| j                  |||| j                        }|S )Nr   )quantize_weight)r   r   r   r   r   )r   rX   r   r   scale	zeropointqtensors          r   r   zQuantoQuantizedLayer._quantize  sK    2>>&$**dDDUDUVy!&$**dE9dN_N_`r   c                 "    |j                         S r   )
dequantize)r   r   s     r   r   z QuantoQuantizedLayer._dequantize  s    !!##r   r   r   rH   rI   rQ   r   r   r   r   r   s   @r   r   r   f  sT     "$($( $( 	$(
 $( $(L$r   r   c                   L     e Zd Z	 	 	 	 	 d	dededededef
 fdZd Zd Z xZS )
HQQQuantizedLayerr   r   r   r   r   c                 R   t         |   |||||       t               st        d      | j                  dvrt        d| j                         | j                  dvrt        d| j                         | j                  dvrt        d| j                         t        | _	        y )Nr   z4You need to install `hqq` to use `HQQQuantizedLayer`)r   r      r      zM`nbits` for `HQQ` backend has to be one of [`1`, `2`, `3`, `4`, `8`] but got )r   r   zA`axis_key` for `HQQ` backend has to be one of [`0`, `1`] but got zC`axis_value` for `HQQ` backend has to be one of [`0`, `1`] but got )
r}   r   r
   r   r   r   r   r   HQQQuantizer	quantizerr   s         r   r   zHQQQuantizedLayer.__init__  s     	!%+ 	 	
  !TUU::_,_`d`j`j_kl  ==&`aeanan`opqq??&(bcgcrcrbstuu%r   c                    | j                   j                  ||| j                  j                  | j                  j                  | j
                  | j                        \  }}| j                  j                  |d<   | j                   j                  ||| j                  j                         |d   j                  |j                        |d<   |d   j                  |j                        |d<   ||fS )N)r   r;   compute_dtyper   
group_sizer   )metar;   r   zero)	r   quantizer   r;   rW   r   r   cudar8   )r   rX   r   r   r   s        r   r   zHQQQuantizedLayer._quantize  s    //99##))//**(( 0 
 !%		_G$tyy7G7GHW((8WF|w~~6V}r   c                 H    |\  }}| j                   j                  ||      }|S r   )r   r   )r   r   quant_tensorr   rX   s        r   r   zHQQQuantizedLayer._dequantize  s'    $d**<>r   r   r   r   s   @r   r   r     sT     "&& & 	&
 & &@r   r   c                      e Zd ZdZ	 	 	 	 d,deee      deee      dedefdZ	d Z
d-d	ed
efdZd-d	ed
efdZ	 d.dej                  dej                  d	edeeeef      deej                  ej                  f   f
dZdedededej*                  dej,                  f
dZd/d	ee   defdZdej                  d	edeeef   fdZd/d	edefdZd Zdej8                  fdZdefd Zd!efd"Zd#ej                  fd$Z e!defd%       Z"e!defd&       Z#e!defd'       Z$e!dee   fd(       Z%d	edeej                  ej                  f   fd)Z&d* Z'd+ Z(y)0Cachean  
    A `Cache` is mostly a list of `CacheLayerMixin` objects, one per model layer. It serves as a container for
    the Cache of each layer.

    Args:
        layers (`Optional`, *optional*):
            A list of pre-created `CacheLayerMixin`. If omitted (`None`), then `layer_class_to_replicate` will
            be used.
        layer_class_to_replicate (`type[CacheLayerMixin]`, *optional*):
            Only used if `layers` is omitted (`None`), in which case it will be used as the base class for each layer,
            and the layers will be added lazily as soon as `update` is called with a `layer_idx` greater than the current
            list of layers.
        offloading (`bool`, *optional*, defaults to `False`):
            Whether to perform offloading of the layers to `cpu`, to save GPU memory.
        offload_only_non_sliding (`bool`, *optional*, defaults to `True`):
            If `offloading` is `True`, this further decides if only the non-sliding layers will be offloaded (because
            usually the sliding layers are small in size, so there is no need to offload them, and skipping it is faster).
    Nlayerslayer_class_to_replicate
offloadingoffload_only_non_slidingc                    ||t        d      ||t        d      ||ng | _        || _        || _        | j                  rE|| _        t
        rt        j                         nt        j                  j                         | _	        y y )Na  You can construct a Cache either from a list `layers` of all the predefined `CacheLayer`, or from a `layer_class_to_replicate`, in which case the Cache will append a new layer corresponding to `layer_class_to_replicate` for each new call to `update` with an idx not already in the Cache.z_You should provide exactly one of `layers` or `layer_class_to_replicate` to initialize a Cache.)
r   r   r   r  only_non_sliding#_is_torch_greater_or_equal_than_2_7rL   Streamr   prefetch_stream)r   r   r   r  r  s        r   r   zCache.__init__  s     ":"Fq 
 >6>q  !' 2f(@%$??$<D!5X5<<>^c^h^h^o^o^qD  r   c                 N    | j                   j                   d| j                   dS )Nz(layers=))r   r   r   r   s    r   r    zCache.__repr__  s$    ..))*(4;;-qAAr   	layer_idxr  c                    |r#	 || j                   |d j                  d      z   }n|t        | j                        k  r|nd}t
        r| j                  n(t        j                  j                  | j                        5  | j                  |   j                          ddd       y# t        $ r | j                   j                  d      }Y w xY w# 1 sw Y   yxY w)a<  
        Prefetch a given layer on its device. If `only_non_sliding` is True, it will try to prefetch only the layers
        which are non-sliding. If the `layer_idx` is outside the range, this will circle back to the first layers.
        Note that we use a non-default stream for this, to avoid blocking.
        NFr   )rw   r   r   lenr   r  r  rL   r   streamr<   r   r
  r  s      r   r<   zCache.prefetch  s     9%	
(C(I(I%(PP	
 &/T[[1A%A	qI &IT!!ejjN_N_`d`t`tNu 	.KK	"++-	. 	.  9 OO11%8	9	. 	.s   !B$ =C$$C
CCc                 b    |r| j                   |   s| j                  |   j                          yy)a  
        Offload a given `layer_idx`. If `only_non_sliding` is True, it will offload `layer_idx` only if it is a
        non-sliding layer. Note that we do it on the default stream, so that we ensure all earlier
        computation in the layer's `update` methods are finished.
        N)rw   r   r9   r  s      r   r9   zCache.offload  s-     !T__Y%?KK	"**, &@r   r!   r&   r'   r(   c                 F   | j                   Zt        | j                        |k  rB| j                  j                  | j                                t        | j                        |k  rB| j                  rat
        j                  j                  |j                        j                  | j                         | j                  |dz   | j                         | j                  |   j                  |||      \  }}| j                  r| j                  || j                         ||fS )a  
        Updates the cache with the new `key_states` and `value_states` for the layer `layer_idx`.

        Parameters:
            key_states (`torch.Tensor`):
                The new key states to cache.
            value_states (`torch.Tensor`):
                The new value states to cache.
            layer_idx (`int`):
                The index of the layer to cache the states for.
            cache_kwargs (`dict[str, Any]`, *optional*):
                Additional arguments for the cache subclass. These are specific to each subclass and allow new types of
                cache to be created.

        Return:
            A tuple containing the updated key and value states.
        r   )r   r  r   appendr  rL   r   default_streamr;   wait_streamr  r<   r  r+   r9   )r   r!   r&   r
  r'   r   r   s          r   r+   zCache.update  s    2 ((4dkk"i/""4#@#@#BC dkk"i/ ??JJ%%j&7&78DDTEYEYZMM)a-)>)>?{{9-44Z|\f??LLD$9$9:V|r   
batch_sizer   r   rW   r;   c                     t        j                  ||d|f||      }| j                  D ]  }|j                  |        y)z
        Initialize all the layers in advance (it's otherwise lazily initialized on the first `update` call).
        This is useful for our `export` recipes, as `export` needs everything in advance.
        r   rV   N)rL   r   r   r%   )r   r  r   r   rW   r;   fake_keys_tensorlayers           r   early_initializationzCache.early_initializationC  sD     !;;
Iq('KSXagh[[ 	8E%%&67	8r   c                 n    |t        | j                        k\  ry| j                  |   j                         S )z=Returns the sequence length of the cache for the given layer.r   )r  r   r0   r   r
  s     r   r0   zCache.get_seq_lengthR  s.    DKK(({{9%4466r   r,   c                     |t        | j                        k\  r|j                  d   dfS | j                  |   j                  |      S )a  
        Return a tuple (kv_length, kv_offset) corresponding to the length and offset that will be returned for
        the given layer at `layer_idx`.
        The masks are then prepared according to the given lengths (kv_length, kv_offset) and patterns for each layer.
        r   )r  r   r`   r.   r   r,   r
  s      r   r.   zCache.get_mask_sizesX  sE     DKK((!''*A--{{9%44^DDr   c                 n    |t        | j                        k\  ry| j                  |   j                         S )zaReturns maximum sequence length of the cache object. Dynamic caches do not have a maximum length.ri   )r  r   r3   r  s     r   r3   zCache.get_max_cache_shaped  s0     DKK(({{9%99;;r   c                     t        t        | j                              D ]  }| j                  |   j                          ! y)z$Recursively reset all layers tensorsN)ranger  r   rA   r  s     r   rA   zCache.resetl  s4    s4;;/0 	+IKK	"((*	+r   rB   c                     t        t        | j                              D ]   }| j                  |   j                  |       " y)z!Reorder the cache for beam searchN)r  r  r   rF   )r   rB   r
  s      r   rF   zCache.reorder_cacheq  s6    s4;;/0 	;IKK	"00:	;r   rj   c                     t        t        | j                              D ]   }| j                  |   j                  |       " y)z"Crop the cache to the given lengthN)r  r  r   rm   )r   rj   r
  s      r   rm   z
Cache.cropv  s6    s4;;/0 	4IKK	"''
3	4r   rn   c                     t        t        | j                              D ]   }| j                  |   j                  |       " y)zRepeat and interleave the cacheN)r  r  r   rr   )r   rn   r
  s      r   rr   zCache.batch_repeat_interleave{  s8    s4;;/0 	DIKK	"::7C	Dr   rs   c                     t        t        | j                              D ]   }| j                  |   j                  |       " y)zSelect indices from the cacheN)r  r  r   rv   )r   rs   r
  s      r   rv   zCache.batch_select_indices  s8    s4;;/0 	AIKK	"77@	Ar   c                     | j                   D cg c]  }|j                   }}t        t        |            dkD  rt	        d|       |d   S c c}w )z*Return the maximum batch size of the cacher   z0Max batch size is not consistent across layers: r   )r   r   r  setr   r   r  r   s      r   r   zCache.max_batch_size  sV     59KK@5%&&@@s6{aOPVxXYYay As   Ac                 h    | j                   D cg c]  }|j                   }}t        |      S c c}w )z,Return the maximum cache length of the cache)r   r   r   r&  s      r   r   zCache.max_cache_len  s1     48;;?%%%%??6{ @s   /c                 l    t        | j                        dk(  ryt        d | j                  D              S )z'Return whether the cache is compileabler   Fc              3   4   K   | ]  }|j                     y wr   )rK   ).0r  s     r   	<genexpr>z'Cache.is_compileable.<locals>.<genexpr>  s     AE5''As   )r  r   allr   s    r   rK   zCache.is_compileable  s-     t{{q AT[[AAAr   c                 V    | j                   D cg c]  }t        |dd       c}S c c}w )z9Return whether the layers of the cache are sliding windowrw   F)r   getattr)r   r  s     r   rw   zCache.is_sliding  s'     BFM|U3MMMs   &c                     |t        | j                        k  r2| j                  |   j                  | j                  |   j                  fS t	        dt        | j                         d|       z
        Support for backwards-compatible `past_key_values` indexing, e.g. `past_key_values[0][0].shape[2]` to get the
        sequence length.
        zCache only has z. layers, attempted to access layer with index )r  r   r   r   KeyErrorr  s     r   __getitem__zCache.__getitem__  sj    
 s4;;'';;y)..I0F0M0MMM !#dkk"2!33abkalm r   c              #      K   t        t        |             D ]6  }| j                  |   j                  | j                  |   j                  f 8 ywz
        Support for backwards-compatible `past_key_values` iteration, e.g. `for x in past_key_values:` to iterate over
        keys and values
        N)r  r  r   r   r   r  s     r   __iter__zCache.__iter__  sK     
 s4y) 	OI;;y)..I0F0M0MNN	Os   AAc                 ,    t        | j                        S )zN
        This value corresponds to the number of layers in the model.
        )r  r   r   s    r   __len__zCache.__len__  s     4;;r   )NNFT)Tr   r   ))r   rH   rI   rJ   r   listr   typeboolr   r    rQ   r<   r9   rL   rM   rN   rO   r   rP   r+   rW   r;   r  r0   r.   r3   rA   rR   rF   rm   rr   rv   propertyr   r   rK   rw   r2  r5  r7  r#   r   r   r   r     s^   * 37DH )-ro./r #+4+@"Ar 	r
 #'r0B.# . .(- - - 26'LL' ll' 	'
 tCH~.' 
u||U\\)	*'R88*-89<8EJ[[8Z_ZfZf87 7c 7
EU\\ 
Ec 
EeTWY\T\o 
E<S < <+
;e&6&6 ;
4s 4
Ds D
AELL A
    s  
 B B B NDJ N NS U5<<3M-N O r   r   c            	           e Zd ZdZ	 	 	 	 ddeeeej                  ej                  f         dee	   de
de
f fdZdeeej                  ej                  f      fdZed	eeej                  ej                  f      dd fd
       Z xZS )DynamicCachea*
  
    A cache that grows dynamically as more tokens are generated. This is the default for generative models.
    It stores the key and value states as a list of `CacheLayer`, one for each layer. The expected shape for each tensor
    in the `CacheLayer`s is `[batch_size, num_heads, seq_len, head_dim]`.
    If a config is passed, it will additionally check for sliding or hybrid cache structure, greatly reducing the
    memory requirement of the cached tensors to `[batch_size, num_heads, min(seq_len, sliding_window), head_dim]`.

    See `Cache` for details on common methods that are implemented by all cache classes.

    Args:
        ddp_cache_data (`Iterable[tuple[torch.Tensor, torch.Tensor]]`, *optional*):
            It was originally added for compatibility with `torch.distributed` (DDP). In a nutshell, it is
            `map(gather_map, zip(*caches))`, i.e. each item in the iterable contains the key and value states
            for a layer gathered across replicas by torch.distributed (shape=[global batch size, num_heads, seq_len, head_dim]).
            Note: it needs to be the 1st arg as well to work correctly
        config (`PretrainedConfig`, *optional*):
            The config of the model for which this Cache will be used. If passed, it will be used to check for sliding
            or hybrid layer structure, greatly reducing the memory requirement of the cached tensors to
            `[batch_size, num_heads, min(seq_len, sliding_window), head_dim]`.
        offloading (`bool`, *optional*, defaults to `False`):
            Whether to perform offloading of the layers to `cpu`, to save GPU memory.
        offload_only_non_sliding (`bool`, *optional*, defaults to `False`):
            If `offloading` is `True`, this further decides if only the non-sliding layers will be offloaded (because
            usually the sliding layers are small in size, so there is no need to offload them, and skipping it is faster).

    Example:

    ```python
    >>> from transformers import AutoTokenizer, AutoModelForCausalLM, DynamicCache

    >>> model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2-0.5B-Instruct")
    >>> tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B-Instruct")

    >>> inputs = tokenizer(text="My name is Qwen2", return_tensors="pt")

    >>> # Prepare a cache class and pass it to model's forward
    >>> past_key_values = DynamicCache(config=model.config)
    >>> outputs = model(**inputs, past_key_values=past_key_values, use_cache=True)
    >>> outputs.past_key_values # access cache filled with key/values from generation
    ```
    ddp_cache_dataconfigr  r  c                    g }||j                         }t        |dd       xs t        |dd       }t        |dd       }|&t        |j                        D cg c]  }|dnd
 }}t	        |d      r|d |j
                    }|D ];  }	|	dv r|j                  t        |             #|j                  t                      = |It        |      D ];  \  }
\  }}||j                  t                      ||
   j                  ||      \  }}= t        |      d	k(  rt        | 5  t        ||
       y t        | 5  |||       y c c}w )Nrz   attention_chunk_sizelayer_typessliding_attentionfull_attentionnum_kv_shared_layers)rD  chunked_attentionr   r   )r   r  r  r   r  r  )get_text_configr.  r  num_hidden_layersr@   rF  r  ry   rT   	enumerater+   r  r}   r   )r   r?  r@  r  r  r   rz   rC  r   
layer_typer
  r!   r&   r   s                r   r   zDynamicCache.__init__  s    ++-F$V-=tDuPVXnptHuN!&->K" #6#;#;< ,:+E'K[[ 
 v56)*HV-H-H,HI) 2
!KKMM";>"Z[MM,.1	2 %9B>9R J5	5J>MM,.1i(//
LI1J v;!G)5%)A   GFz\tuAs   Er(   c                 d    d}| j                   D ]  }||j                  |j                  ffz  }  |S )z
        Converts the `Cache` instance into the its equivalent in the legacy cache format. Used for
        backward compatibility.
        r#   )r   r   r   )r   legacy_cacher  s      r   to_legacy_cachezDynamicCache.to_legacy_cache  s<    
 [[ 	:Eejj%,,799L	:r   past_key_valuesc                      |        }|t         j                  d       |4t        t        |            D ]  }||   \  }}|j	                  |||        |S )z
        Converts a cache in the legacy cache format into an equivalent `Cache`. Used for
        backward compatibility.
        9past_key_values should not be None in from_legacy_cache())loggerwarning_oncer  r  r+   )clsrP  cacher
  r!   r&   s         r   from_legacy_cachezDynamicCache.from_legacy_cache$  sg     " [\&"3#78 B	+:9+E(
LZyAB r   )NNFF)r   rH   rI   rJ   r   r   rP   rL   rM   r	   r;  r   rO  classmethodrW  r   r   s   @r   r>  r>    s    (X QU-1 )..v %ell0J*K!LM.v )*.v 	.v
 #'.v`uU\\5<<-G'H!I  eELL%,,<V6W0X ]k  r   r>  c            	       :     e Zd ZdZ	 	 ddedededef fdZ xZS )StaticCachea  
    Static Cache class to be used with `torch.compile(model)` and `torch.export()`. It will check the `config`
    for potential hybrid cache structure, and initialize each layer accordingly.

    See `Cache` for details on common methods that are implemented by all cache classes.

    Args:
        config (`PretrainedConfig`):
            The config of the model for which this Cache will be used. It will be used to check for sliding
            or hybrid layer structure, and initialize each layer accordingly.
        max_cache_len (`int`):
            The maximum number of tokens that this Cache should hold.
        offloading (`bool`, *optional*, defaults to `False`):
            Whether to perform offloading of the layers to `cpu`, to save GPU memory.
        offload_only_non_sliding (`bool`, *optional*, defaults to `True`):
            If `offloading` is `True`, this further decides if only the non-sliding layers will be offloaded (because
            usually the sliding layers are small in size, so there is no need to offload them, and skipping it is faster).

    Example:

    ```python
    >>> from transformers import AutoTokenizer, AutoModelForCausalLM, StaticCache

    >>> model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
    >>> tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")

    >>> inputs = tokenizer(text="My name is Llama", return_tensors="pt")

    >>> # Prepare a cache class and pass it to model's forward
    >>> # Leave empty space for 10 new tokens, which can be used when calling forward iteratively 10 times to generate
    >>> max_generated_length = inputs.input_ids.shape[1] + 10
    >>> past_key_values = StaticCache(config=model.config, max_cache_len=max_generated_length)
    >>> outputs = model(**inputs, past_key_values=past_key_values, use_cache=True)
    >>> outputs.past_key_values # access cache filled with key/values from generation
    StaticCache()
    ```
    r@  r   r  r  c                    |j                         }t        |dd       }|t        |dd       #t        |j                        D cg c]  }d }}nRt        |dd       #t        |j                        D cg c]  }d }}n"t        |j                        D cg c]  }d }}t	        |d      r|d |j
                    }g }|D ]Y  }	|	dk(  rt        ||j                        }
n)|	dk(  rt        ||j                        }
nt        |	      }
|j                  |
       [ t        | 5  |||
       y c c}w c c}w c c}w )NrC  rz   rD  rB  rG  rE  rF  )r   rz   r   rH  )rI  r.  r  rJ  r@   rF  r   rz   r   rB  r   r  r}   r   )r   r@  r   r  r  kwargsrC  r   r   rL  r  r   s              r   r   zStaticCache.__init__\  sJ    '')fmT:v/6B<A&BZBZ<[\q2\\!7>J<A&BZBZ<[\q2\\9>v?W?W9XYA/YY612%&D)D)D(DEK% 	!J00*W]WlWlm22+-X^XsXst#-@MM% 	! 	:Xpq' ]\Ys   	D25	D7	D<)FT)	r   rH   rI   rJ   r	   rQ   r;  r   r   r   s   @r   rZ  rZ  4  sG    $V !)- r  r  r 	 r
 #' r  rr   rZ  c                   L     e Zd ZdZ	 	 	 	 	 d
dededededededef fd	Z xZS )QuantizedCachea  
    A quantizer cache similar to what is described in the
    [KIVI: A Tuning-Free Asymmetric 2bit Quantization for KV Cache paper](https://huggingface.co/papers/2402.02750).
    It allows the model to generate longer sequence length without allocating too much memory for keys and values
    by applying quantization.
    The cache has two types of storage, one for original precision and one for the
    quantized cache. A `residual length` is set as a maximum capacity for the original precision cache. When the
    length goes beyond maximum capacity, the original precision cache is discarded and moved into the quantized cache.
    The quantization is done per-channel with a set `q_group_size` for both keys and values, in contrast to what was
    described in the paper.

    See `Cache` for details on common methods that are implemented by all cache classes.

    Args:
        backend (`str`):
            The quantization backend to use. One of `("quanto", "hqq").
        config (`PretrainedConfig`):
            The config of the model for which this Cache will be used.
        nbits (`int`, *optional*, defaults to 4):
            The number of bits for quantization.
        axis_key (`int`, *optional*, defaults to 0):
            The axis on which to quantize the keys.
        axis_value (`int`, *optional*, defaults to 0):
            The axis on which to quantize the values.
        q_group_size (`int`, *optional*, defaults to 64):
            Quantization is done per-channel according to a set `q_group_size` for both keys and values.
        residual_length (`int`, *optional*, defaults to 128):
            Maximum capacity for the original precision cache
    backendr@  r   r   r   r   r   c           
          |dk(  rt         }n|dk(  rt        }nt        d| d      |j                  d      }t	        |j
                        D 	cg c]  }	 ||||||       }
}	t        |   |
       y c c}	w )NquantohqqzUnknown quantization backend ``T)decoder)r   )r   r   r   rI  r  rJ  r}   r   )r   r_  r@  r   r   r   r   r   layer_classr   r   r   s              r   r   zQuantizedCache.__init__  s     h.K+K=gYaHII'''5 6334
 x\?S
 
 	'	
s   A8r   )	r   rH   rI   rJ   rO   r	   rQ   r   r   r   s   @r   r^  r^    sh    D "(( !( 	(
 ( ( ( ( (r   r^  c                      e Zd ZdZd#dZdefdZd Zdede	e
j                  e
j                  e
j                  e
j                  f   fdZd	 Zde	e	e
j                        fd
Zedeee	e
j$                  df         dd fd       Zd$dee   defdZd Zde
j,                  fdZdefdZdefdZdededdfdZdefdZde
j                  fdZdefdZde
j                  dede	eef   fd Zed!        Z ede!fd"       Z"y)%EncoderDecoderCachea  
    Base, abstract class for all encoder-decoder caches. Can be used to hold combinations of self-attention and
    cross-attention caches.

    See `Cache` for details on common methods that are implemented by all cache classes.

    Args:
        caches (`Iterable`):
            Usually an iterable of length 2, containing 2 `Cache` objects, the first one for self-attention, the
            second one for cross-attention. Can optionally also be an iterable of length 1, containing a
            `tuple[tuple[torch.Tensor]]` (usually used for compatibility with torch dp and ddp).

    Example:

    ```python
    >>> from transformers import AutoProcessor, AutoModelForCausalLM, DynamicCache, EncoderDecoderCache

    >>> model = AutoModelForCausalLM.from_pretrained("openai/whisper-small")
    >>> processor = AutoProcessor.from_pretrained("openai/whisper-small")

    >>> inputs = processor(audio=YOUR-AUDIO, return_tensors="pt")

    >>> # Prepare cache classes for encoder and decoder and pass it to model's forward
    >>> self_attention_cache = DynamicCache(config=self.config)
    >>> cross_attention_cache = DynamicCache(config=self.config)
    >>> past_key_values = EncoderDecoderCache(self_attention_cache, cross_attention_cache)
    >>> outputs = model(**inputs, past_key_values=past_key_values, use_cache=True)
    >>> outputs.past_key_values # access cache filled with key/values from generation
    EncoderDecoderCache()
    ```
    r(   Nc           	      
   t        |      dk(  rt               | _        t               | _        t	        |d         D ]^  \  }}|d d \  }}| j                  j                  |||       t        |      dkD  s:|dd  \  }}| j                  j                  |||       ` nt        |      dk(  rdt        |d   t              rt        |d   t              s)t        dt        |d         dt        |d               |d   | _        |d   | _        nt        dt        |             i | _        t        t        | j                              D ]6  }t        | j                  j                  |      dkD        | j                  |<   8 y )Nr   r   r   z;One of the two arguments is not a Cache: type(caches[0]) = z, type(caches[1]) = zExpected 1 or 2 arguments, got )r  r>  self_attention_cachecross_attention_cacherK  r+   
isinstancer   	TypeErrorr:  r   
is_updatedr  r;  r0   )r   cachesr
  key_value_statesr!   r&   s         r   r   zEncoderDecoderCache.__init__  s   v;!(4D%)5D&/8/C [+	++;BQ+?(
L))00\9U'(1,/?/C,J..55j,PYZ[ [AfQi/z&)U7S"^DQWXYQZOK__tbfgmnogpbqau vww(.q	D%)/D& >s6{mLMMs4#=#=>? 	hI)-d.H.H.W.WXa.bef.f)gDOOI&	hr   c                 h    | j                   j                   d| j                   d| j                   dS )Nz(self_attention_cache=z, cross_attention_cache=r	  )r   r   ri  rj  r   s    r   r    zEncoderDecoderCache.__repr__  s;    ~~&&''=d>W>W=XXp))*!-	
r   c              #   V  K   t        t        |             D ]  }| j                  j                  |   j                  | j                  j                  |   j
                  | j                  j                  |   j                  | j                  j                  |   j
                  f  ywr4  )r  r  ri  r   r   r   rj  r  s     r   r5  zEncoderDecoderCache.__iter__  s     
 s4y) 	I))00;@@))00;BB**11)<AA**11)<CC	 	s   B'B)r
  c                 f   |t        |       k  r| j                  j                  |   j                  | j                  j                  |   j                  | j
                  j                  |   j                  | j
                  j                  |   j                  fS t        dt        |        d|       r0  )r  ri  r   r   r   rj  r1  r  s     r   r2  zEncoderDecoderCache.__getitem__  s    
 s4y ))00;@@))00;BB**11)<AA**11)<CC	  _SYK7efoepqrrr   c                 ,    t        | j                        S )z
        Support for backwards-compatible `past_key_values` length, e.g. `len(past_key_values)`. This value corresponds
        to the number of layers in the model.
        )r  ri  r   s    r   r7  zEncoderDecoderCache.__len__  s    
 4,,--r   c                    d}t        | j                        dkD  rOt        | j                  j	                         | j                  j	                               D ]  \  }}|||z   fz  } |S | j                  j	                         }|S )z[Converts the `EncoderDecoderCache` instance into its equivalent in the legacy cache format.r#   r   )r  rj  zipri  rO  )r   rN  	self_attn
cross_attns       r   rO  z#EncoderDecoderCache.to_legacy_cache  s    t))*Q.),))99;T=W=W=g=g=i* :%	: Z!7 99:   44DDFLr   rP  .c                 `    | t               t                     }|t        j                  d       |S t        |      D ]m  \  }}|dd \  }}|j                  j                  |||       t        |      dkD  s:|dd \  }}|j                  j                  |||       d|j                  |<   o |S )zUConverts a cache in the legacy cache format into an equivalent `EncoderDecoderCache`.NrR  r   T)	r>  rS  rT  rK  ri  r+   r  rj  rm  )rU  rP  rV  r
  ro  r!   r&   s          r   rW  z%EncoderDecoderCache.from_legacy_cache'  s    
 LNLN3" [\  09/I 7+	++;BQ+?(
L**11*lIV'(1,/?/C,J//66z<QZ[26E$$Y/7 r   c                 8    | j                   j                  |      S )zYReturns the sequence length of the cached states. A layer index can be optionally passed.)ri  r0   r  s     r   r0   z"EncoderDecoderCache.get_seq_length9  s    ((77	BBr   c                     | j                   j                          | j                  j                          | j                  D ]  }d| j                  |<    y )NF)ri  rA   rj  rm  r  s     r   rA   zEncoderDecoderCache.reset=  sG    !!'')""((* 	/I).DOOI&	/r   rB   c                 p    | j                   j                  |       | j                  j                  |       y)zDReorders the cache for beam search, given the selected beam indices.N)ri  rF   rj  rE   s     r   rF   z!EncoderDecoderCache.reorder_cacheC  s*    !!//9""00:r   methodc           	          t        | j                  t              rt        | j                  t              sEt	        d| d| j                  j                          d| j                  j                          d      y )Nrc  z)` is only defined for dynamic cache, got z" for the self attention cache and z for the cross attention cache.)rk  ri  r>  rj  r   __str__)r   r|  s     r   check_dynamic_cachez'EncoderDecoderCache.check_dynamic_cacheH  sw    t00,?455|DF8DTE^E^EfEfEhDi j''+'A'A'I'I'K&LLkm  Er   maximum_lengthc                     | j                  | j                  j                         | j                  j                  |       y)z
        Crop the past key values up to a new `maximum_length` in terms of tokens. `maximum_length` can also be
        negative to remove `maximum_length` tokens. This is used in assisted decoding and contrastive search (on the Hub).
        N)r  rm   r   ri  )r   r  s     r   rm   zEncoderDecoderCache.cropS  s0    
 	  !3!34!!&&~6r   full_batch_size
split_sizezlist[EncoderDecoderCache]c                 "   | j                  | j                  j                         | j                  j                  ||      }| j                  j                  ||      }g }t        ||      D ]   \  }}|j                  t        ||             " |S )z
        Split the current instance into a list of `DynamicCache` by the batch size. This will be used by
        `_split_model_inputs()` in `generation.utils`
        )r  batch_splitr   ri  rj  ru  r  rg  )r   r  r  ri  rj  outrv  rw  s           r   r  zEncoderDecoderCache.batch_split[  s    
 	  !1!1!:!:;#88DD_V`a $ : : F FXb c%()=?T%U 	C!IzJJ*9jAB	C
r   rn   c                     | j                  | j                  j                         | j                  j                  |       | j                  j                  |       y)zaRepeat the cache `repeats` times in the batch dimension. Used in contrastive search (on the Hub).N)r  rr   r   ri  rj  rq   s     r   rr   z+EncoderDecoderCache.batch_repeat_interleavei  sD      !=!=!F!FG!!99'B""::7Cr   rs   c                     | j                  | j                  j                         | j                  j                  |       | j                  j                  |       y)zeOnly keep the `indices` in the batch dimension of the cache. Used in contrastive search (on the Hub).N)r  rv   r   ri  rj  ru   s     r   rv   z(EncoderDecoderCache.batch_select_indiceso  sD      !:!:!C!CD!!66w?""77@r   c                 6    | j                   j                         S )zKReturns the maximum sequence length (i.e. max capacity) of the cache object)ri  r3   r   s    r   r3   z'EncoderDecoderCache.get_max_cache_shapeu  s    ((<<>>r   r,   c                 :    | j                   j                  ||      S r   )ri  r.   r  s      r   r.   z"EncoderDecoderCache.get_mask_sizesy  s    ((77	RRr   c                 .    | j                   j                  S r   )ri  rw   r   s    r   rw   zEncoderDecoderCache.is_sliding|  s    ((333r   c                 .    | j                   j                  S r   )ri  rK   r   s    r   rK   z"EncoderDecoderCache.is_compileable  s    ((777r   rG   r8  )#r   rH   rI   rJ   r   rO   r    r5  rQ   rP   rL   rM   r2  r7  rO  rX  r   r   FloatTensorrW  r0   rA   rR   rF   r  rm   r  rr   rv   r3   r.   r<  rw   r;  rK   r#   r   r   rg  rg    s   @h4
# 
sS sU5<<u||]b]i]i3i-j s.
uU\\':!; 
 &xe6G6G6L0M'NO	 "C Cc C/;e&6&6 ;
# 73 73 C D_ Ds DAELL A?S ?SU\\ Sc SeTWY\T\o S 4 4 8 8 8r   rg  c                         e Zd Zd fdZ xZS )OffloadedCachec                 P    t         j                  d       t        |   d       y )Nzo`OffloadedCache` is deprecated and will be removed in version v4.59 Use `DynamicCache(offloading=True)` insteadT)r  rS  rT  r}   r   )r   r   s    r   r   zOffloadedCache.__init__  s(    :	
 	D)r   rG   )r   rH   rI   r   r   r   s   @r   r  r    s    * *r   r  c                   (     e Zd Zdedef fdZ xZS )OffloadedStaticCacher@  r   c                 T    t         j                  d       t        |   ||d       y )Nzy`OffloadedStaticCache` is deprecated and will be removed in version v4.59 Use `StaticCache(..., offloading=True)` insteadTr@  r   r  r  r   r@  r   argsr\  r   s        r   r   zOffloadedStaticCache.__init__  s-    >	
 	mPTUr   r   rH   rI   r	   rQ   r   r   r   s   @r   r  r         V/ V V Vr   r  c                   (     e Zd Zdedef fdZ xZS )SlidingWindowCacher@  r   c                 R    t         j                  d       t        |   ||       y )Nz`SlidingWindowCache` is deprecated and will be removed in version v4.59 Use `StaticCache(...)` instead which will correctly infer the type of each layer.r@  r   r  r  s        r   r   zSlidingWindowCache.__init__  +    `	
 	mDr   r  r   s   @r   r  r         E/ E E Er   r  c                   (     e Zd Zdedef fdZ xZS )HybridCacher@  r   c                 R    t         j                  d       t        |   ||       y )Nz`HybridCache` is deprecated and will be removed in version v4.59 Use `StaticCache(...)` instead which will correctly infer the type of each layer.r  r  r  s        r   r   zHybridCache.__init__  r  r   r  r   s   @r   r  r    r  r   r  c                   (     e Zd Zdedef fdZ xZS )HybridChunkedCacher@  r   c                 R    t         j                  d       t        |   ||       y )Nz`HybridChunkedCache` is deprecated and will be removed in version v4.59 Use `StaticCache(...)` instead which will correctly infer the type of each layer.r  r  r  s        r   r   zHybridChunkedCache.__init__  r  r   r  r   s   @r   r  r    r  r   r  c                   (     e Zd Zdedef fdZ xZS )OffloadedHybridCacher@  r   c                 T    t         j                  d       t        |   ||d       y )Nz`OffloadedHybridCache` is deprecated and will be removed in version v4.59 Use `StaticCache(..., offload=True)` instead which will correctly infer the type of each layer.Tr  r  r  s        r   r   zOffloadedHybridCache.__init__  s.    n	
 	mPTUr   r  r   s   @r   r  r    r  r   r  c                   D     e Zd Z	 	 	 	 	 ddedededededef fdZ xZS )	QuantoQuantizedCacher@  r   r   r   r   r   c           	      Z    t         j                  d       t        |   d||||||       y )Nz~`QuantoQuantizedCache` is deprecated and will be removed in version v4.59 Use `QuantizedCache(backend='quanto', ...)` instead.ra  r  r   r@  r   r   r   r   r   r   s          r   r   zQuantoQuantizedCache.__init__  s5     	C	
 	65(JVefr   r   r  r   s   @r   r  r    s`     "g g g 	g
 g g g gr   r  c                   D     e Zd Z	 	 	 	 	 ddedededededef fdZ xZS )	HQQQuantizedCacher@  r   r   r   r   r   c           	      Z    t         j                  d       t        |   d||||||       y )Nzx`HQQQuantizedCache` is deprecated and will be removed in version v4.59 Use `QuantizedCache(backend='hqq', ...)` instead.rb  r  r  s          r   r   zHQQQuantizedCache.__init__  s5     	@	
 	x\Sbcr   r   r  r   s   @r   r  r    s`     "d d d 	d
 d d d dr   r  c                       e Zd ZdZddZy)	SinkCachea  
    It is now a `custom_generate` repository on the Hub: https://huggingface.co/transformers-community/sink_cache.
    See [these docs](https://huggingface.co/docs/transformers/generation_strategies#custom-decoding-methods) for
    general `custom_generate`usage.
    Nc                     t        d      )Nz`SinkCache` has been moved as a `custom_generate` repository on the Hub: https://huggingface.co/transformers-community/sink_cache. See the repository for usage examples.)r   )r   r\  s     r   r   zSinkCache.__init__  s    !o
 	
r   rG   )r   rH   rI   rJ   r   r#   r   r   r  r    s    
r   r  )/abcr   r   collections.abcr   typingr   r   rL   configuration_utilsr	   utilsr
   r   r   r   r   hqq.core.quantizer   r   r  
get_loggerr   rS  r   rT   ry   r   r   r   r   r   r   r   r>  rZ  r^  rg  r  r  r  r  r  r  r  r  r  r#   r   r   <module>r     s   # $    1  ;&?RV&W # 
		H	%5Wc 5WpP4? P4fO5 O5dh"/ h"V[& [&|S$, S$lM&\ M&`/$> /$d3 3ln  n br5 rjHr% HrV5(U 5(pK8% K8b*\ *V; VE EE+ EE EV; Vg> g"d d"
 
r   