
    hG                         d dl mZ d dlmZmZ d dlmZmZ d dlZddl	m
Z
 ddlmZ ddlmZmZ d	d
lmZmZmZ  e        G d d             Z G d d      Zy)    )deque)floorsqrt)OptionalUnionN   )PretrainedConfig)GenerationConfig)attach_tracertraced   )RequestStateget_device_and_memory_breakdownloggerc                      e Zd Zej                  dddfdededej                  dej                  de	de
ee	eeej                  e	f   f      d	e
e	   d
dfdZede	ded
ee	   fd       Zeded
dfd       Zd
e	fdZded
ee	   fdZededee	   d
ee	   fd       Zedej.                  dej.                  de	d
eej.                  ej.                  f   fd       Zy)PagedAttentionCached   Nconfiggeneration_configdevicedtypenum_requestslayer_device_maptp_sizereturnc           	         || _         || _        t        |dd      }||n|j                  | _        t        |dd      }	|	|	n|j
                  |j                  z  | _        |j                  | _        t        |dd      | _        |3|dkD  r.| j                  |z  dk7  rt        d| j                   d	| d
      t        | j                  | j                  | j                  | j                  |j
                  |j                        }
|
j                  t        |dd      t        |dd      t        |dd      | j                         \  }}|| _        || _        t        j                   d| j                  d| j                  d       | j                  || j                  | j                  f| _        g | _        g | _        t)        |j                        D ]  }|||   n|}t+        j,                  | j"                  | j                   |      }t+        j,                  | j"                  | j                   |      }t*        j.                  j1                  |       t*        j.                  j1                  |       | j$                  j3                  |       | j&                  j3                  |        t5        t)        |            | _        i | _        y)a  Initialize a paged attention cache for efficient memory usage.

        Args:
            config: Model configuration
            generation_config: Generation configuration containing cache parameters
            device: Device for the cache tensors
            dtype: Data type for the cache tensors
            layer_device_map: Optional mapping of layer indices to devices
            initial_prompt_shapes: Optional sample prompts to help calculate optimal cache size
        num_key_value_headsNhead_dim
block_size    r   r   zNumber of key value heads z+ must be divisible by tensor parallel size .r   r   	num_heads
num_layershidden_size
vocab_size
num_blocksmax_batch_tokens
max_memory?)r'   r(   max_memory_percentcache_dtypez7PagedAttentionCache initialized with self.num_blocks = z and self.max_batch_tokens =  )r   r   )r   r   getattrnum_attention_headsr   r%   r   num_hidden_layersr   
ValueErrorPagedAttentionMemoryHandlerr&   %infer_num_blocks_and_max_batch_tokensr'   r(   r   warningcache_shape	key_cachevalue_cacherangetorchzeros_dynamomark_static_addressappendr   _free_blocks_block_tables)selfr   r   r   r   r   r   r   kv_headsr   memory_handlerr'   r(   idxlayer_devicenew_layer_key_cachenew_layer_value_caches                    o/var/www/html/eduruby.in/venv/lib/python3.12/site-packages/transformers/generation/continuous_batching/cache.py__init__zPagedAttentionCache.__init__   s   ( 
 6#8$?4<4HfNhNh 6:t4)1)=X6CUCUY_YsYsCs!'!9!9!"3\2F 7Q;'''1Q6 01I1I0JJuv}u~~  A  5]]..--**((
 (6'['[0,E$%68JDQ&'8,L

	 (\ (
$
$ % 0Qt>RRpX\XmXmWqqrst !44j$//SWS`S`a-//1112 		;C4D4P+C0V\L"'++d.>.>djjYe"f$)KK0@0@

[g$h! MM--.ABMM--.CDNN!!"56##$9:		; "%
"3435    n_blocks
request_idc                 $   t        | j                        |k  ryg }t        |      D ]+  }|j                  | j                  j	                                - || j
                  vrg | j
                  |<   | j
                  |   j                  |       |S )z*Allocates n_blocks for a given request_id.F)lenr>   r8   r=   popleftr?   extend)r@   rJ   rK   	allocated_s        rG   allocate_blocksz#PagedAttentionCache.allocate_blocksn   s     t  !H,	x 	:AT..6689	: T///-/Dz*:&--i8rI   c                     || j                   v r7| j                   j                  |      }| j                  j                  |       yt	        j
                  d|        y)z.Frees all blocks associated with a request_id.z6Attempted to free blocks for non-existent request_id: N)r?   popr>   rO   r   info)r@   rK   blocks_to_frees      rG   free_blockszPagedAttentionCache.free_blocks}   sR     +++!//33J?N$$^4KKPQ[P\]^rI   c                 ,    t        | j                        S )z,Returns the number of free blocks available.)rM   r>   )r@   s    rG   get_num_free_blocksz'PagedAttentionCache.get_num_free_blocks   s    4$$%%rI   c                 :    | j                   j                  |g       S )z&Returns the block table for a request.)r?   get)r@   rK   s     rG   get_block_tablez#PagedAttentionCache.get_block_table   s    !!%%j"55rI   statelogical_indicesc           	      2   |j                   }| j                  j                  |      }|st        d|       | j                  }g }|D ]L  }||z  }||z  }	|t        |      k\  rt        d| d| d|       ||   }
|
|z  |	z   }|j                  |       N |S )a  
        Maps logical sequence indices to physical cache indices using the block table, using PyTorch.

        Args:
            request_id: The request ID.
            logical_indices: A list of logical indices.

        Returns:
            A list of physical indices.

        Raises:
            ValueError: If no block table is found for the request ID.
            IndexError: If a logical index maps to a block index that is out of bounds.
        z!No block table found for request zLogical index z maps to block index z$ which is out of bounds for request )rK   r?   r[   r1   r   rM   
IndexErrorr=   )r@   r]   r^   rK   block_tabler   physical_indicesrC   	block_idxblock_offsetphysical_block_numphysical_indexs               rG   _get_physical_indicesz)PagedAttentionCache._get_physical_indices   s      %%
((,,Z8@MNN__
" 	4Cz)I+LC,, $SE)>yk J##-,0 
 "-Y!7/*<|KN##N3	4  rI   
key_statesvalue_states	layer_idxc                 t   | j                   | j                  z  }| j                  |   j                  | j                  || j
                        }| j                  |   j                  | j                  || j
                        }	|d   |d d |d d f<   |d   |	d d |d d f<   |d d d |d d f   |	d d d |d d f   fS )Nr   )r'   r   r6   viewr   r   r7   )
r@   rh   ri   rj   
read_indexwrite_indexkwargstotal_slotsk_cache_flatv_cache_flats
             rG   updatezPagedAttentionCache.update   s     oo7~~i055d6N6NP[]a]j]jk''	2778P8PR]_c_l_lm*4Q-QQ&'*6q/QQ&'D!Z23\$:WXBX5YYYrI   )__name__
__module____qualname__r9   float16r	   r
   r   r   intr   dictr   strrH   r   listrR   rW   rY   r\   r   rg   Tensortuplers    rI   rG   r   r      s    #]]OS!%O6 O6 ,O6 	O6
 {{O6 O6 #4U3c3I-J(J#KLO6 #O6 
O6b   c   _c _d _ _&S &6# 6$s) 6 % < % $s) % X\]`Xa %  % N ZLLZ llZ 	Z 
u||U\\)	*Z ZrI   r   c                   *   e Zd Zej                  ZdZej                  ZdZ	dZ
dededededed	ed
dfdZedded
efd       Zdddej                   fdee   dee   dedej$                  d
eeef   f
dZdej                   dfdedej$                  ded
eeef   fdZdej                   fdededej$                  d
efdZdej                   fdededej$                  d
efdZddej                   fdee   dee   dej$                  d
eeeef   fdZy)r2         i   r   r   r#   r$   r%   r&   r   Nc                 X    || _         || _        || _        || _        || _        || _        y Nr"   )r@   r   r   r#   r$   r%   r&   s          rG   rH   z$PagedAttentionMemoryHandler.__init__   s/     % "$&$rI   r+   c                 ^    t               \  }}}}|t        ||      z
  }t        || z        }|S r   )r   maxrx   )r+   rQ   totalreservedrP   available_memorys         rG   get_available_memoryz0PagedAttentionMemoryHandler.get_available_memory   s=    (G(I%5(I 3y(#;;/2DDErI   r*   r'   r(   r,   c                 $   ||| j                  ||      \  }}n/||| j                  |||      }n||| j                  |||      }| j                  |      }| j	                  |||      }t        |      |kD  rt        d| d|       ||fS )a  
        The memory footprint depends on the cache size C and the max batch tokens M in the following way:
            Mem = Mem(cache) + Mem(activation) + Mem(static_tensors)
        where:
            Mem(cache) = 2 * num_heads * head_dim * num_layers * cache_dtype.itemsize * C
            Mem(activation) = M * (hidden_size + vocab_size) * activation_dtype.itemsize
            Mem(static_tensors) ~= 8M * input_dtype.itemsize + M * C * activation_dtype.itemsize

        Depending on if C or M is given, we use different methods to infer the values (C = num_blocks * block_size) and
        since block_size is fixed, num_blocks is the true variable to find.
        )r(   r'   r,   zMemory footprint z is more than available memory )'compute_num_blocks_and_max_batch_tokenscompute_max_batch_tokenscompute_num_blocksr   compute_memory_footprintsumMemoryError)r@   r'   r(   r+   r,   r   memory_footprints          rG   r3   zAPagedAttentionMemoryHandler.infer_num_blocks_and_max_batch_tokens   s    & "2":+/+W+W"K,(J( #(8(@#<<ZI[]hi)j.@001ACUWbcJ  445GH88-!# 9 

  #33 12B1CCbcsbtuvv+++rI   g{Gz?mc                    | j                  |      }t        j                  d|        || j                  j                  z  | j
                  | j                  z   z  }d| j                  z  | j                  z  | j                  z  |j                  z  }d|z  | j                  j                  z  }t        j                  d|        t        j                  d|        t        j                  d|        || j                  j                  z  }||z   |z   }	| }
|	dz  d|z  |
z  z
  }|dk  rt        d	|      |	 t        |      z   d|z  z  }|dk  rt        d
|      t        |      | j                  z  }|| j                  kD  r1t        j                   d|d| j                         | j                  }t        ||z        }|| j"                  kD  r1t        j                   d|d| j"                         | j"                  }||fS )a  
        If neither M nor C is given, we assume M = m*C so we have to solve a second-order polynomial in C:
            Mem = C * 2 * self.num_heads * self.head_dim * self.num_layers * cache_dtype.itemsize
                + C * m * (hidden_size + vocab_size) * activation_dtype.itemsize
                + C * m * 8 * input_dtype.itemsize + C^2 * m * activation_dtype.itemsize

        We solve for C and then M = m*C.
        zCache memory: r      zMemory per activation token: zMemory per cache token: zMemory per input token:    r   z)Discriminant is negative: discriminant = z3Greatest solution is negative: greatest_solution = znum_blocks = z9 is too large, setting to self._upper_bound_num_blocks = zmax_batch_tokens = z? is too large, setting to self._upper_bound_max_batch_tokens = )r   r   rU   _activation_dtypeitemsizer%   r&   r#   r   r$   _input_dtyper1   r   rx   r   _upper_bound_num_blocksr4   _upper_bound_max_batch_tokens)r@   r+   r,   r   cache_memorymem_per_activation_tokenmem_per_cache_tokenmem_per_input_tokenabcdiscriminantgreatest_solutionr'   r(   s                  rG   r   zCPagedAttentionMemoryHandler.compute_num_blocks_and_max_batch_tokens  s)    001CDn\N34 $%t'='='F'F#F$JZJZ]a]l]lJl#m $..04==@4??RU`UiUii!ed&7&7&@&@@34L3MNO./B.CDE./B.CDE &&///"558PPM !ta!eai'!I,9JKLLR$|"44Q?q S?P>TUVV *+t>
444NNmj_,ftGcGcFghi55J0145d@@@NN1.22rTMoMoLstu#AA+++rI   c                    | j                  |      }|| j                  z  }|}||dz  | j                  z  | j                  z  | j                  z  |j
                  z  z  }d| j                  j
                  z  || j                  j
                  z  z   }|| j                  | j                  z   | j                  j
                  z  z  }t        ||z        S )a4  
        If C is given, we have a formula for M:
            num = (Mem - C * 2 * num_heads * head_dim * num_layers * cache_dtype.itemsize)
            denum = (8 * input_dtype.itemsize + C * activation_dtype.itemsize + (hidden_size + vocab_size) * activation_dtype.itemsize)
        M = num / denum
        r   r   )r   r   r#   r   r$   r   r   r   r%   r&   rx   )r@   r'   r+   r,   r   
cache_sizenumdenums           rG   r   z4PagedAttentionMemoryHandler.compute_max_batch_tokensA  s     001CD$//1
zA~.>PS^SgSgggD%%...d>T>T>]>]1]]$""T__48N8N8W8WWW3;rI   c                    | j                  |      }|}|| j                  j                  | j                  | j                  z   z  |z  z  }|d|z  | j
                  j                  z  z  }d| j                  z  | j                  z  | j                  z  |j                  z  }||| j                  j                  z  z  }t        ||z        }t        || j                  z        S )a4  
        If M is given, we have a formula for C:
            num = Mem - M * (hidden_size + vocab_size) * activation_dtype.itemsize - 8 * M * input_dtype.itemsize
            denum = 2 * num_heads * head_dim * num_layers * cache_dtype.itemsize + M * activation_dtype.itemsize
        C = num / denum
        r   r   )r   r   r   r%   r&   r   r#   r   r$   rx   r   r   )r@   r(   r+   r,   r   r   r   r   s           rG   r   z.PagedAttentionMemoryHandler.compute_num_blocksX  s     001CDt%%..$2B2BT__2TUXhhhq##d&7&7&@&@@@DNN"T]]2T__D{G[G[[!D$:$:$C$CCCu%
Z$//122rI   c                    | j                   j                  | j                  | j                  z   z  }||z  }|K|| j                  z  }d| j
                  z  | j                  z  | j                  z  |j                  z  }||z  }nd}|z|xt        d|z  | j                  j                  z  |z  | j                   j                  z  d|z  | j                  j                  z  d|z  | j                  j                  z  g      }nd}|||fS )Nr   r   )
r   r   r%   r&   r   r#   r   r$   r   r   )	r@   r'   r(   r,   activation_memory_footprintr   bytes_per_tokencache_memory_footprintstatic_memory_footprints	            rG   r   z4PagedAttentionMemoryHandler.compute_memory_footprintp  s$    '+&<&<&E&EIYIY\`\k\kIk&l##'77#!#doo5J$..04==@4??RU`UiUiiO%//%A"%'"!&6&B&)((4+<+<+E+EE$z1D4J4J4S4SS((4+<+<+E+EE((4+<+<+E+EE	'# ')#*,BD[[[rI   )g      ?)rt   ru   rv   r9   bfloat16r   _activation_safety_factorint32r   r   r   rx   rH   staticmethodfloatr   rw   r   r   r}   r3   r   r   r   r   r~   rI   rG   r2   r2      s    !;;L$'!"%% % 	%
 % % % 
%          %)*.$'#(==',SM', #3-', "	',
 [[', 
sCx',V %(#(==	/,!/, [[/, 	/,
 
sCx/,h %(#(==	   "  [[	 
 
 4 %(#(==	33 "3 [[	3
 
34 %)*.#(==	\SM\ #3-\ [[	\
 
sC}	\rI   r2   )collectionsr   mathr   r   typingr   r   r9   configuration_utilsr	   generation.configuration_utilsr
   utils.metricsr   r   classesr   r   r   r   r2   r~   rI   rG   <module>r      sR      "  3 > 2 J J jZ jZ jZZC\ C\rI   