
    hf                        d dl Z d dlmZmZ d dlZddlmZmZmZm	Z	m
Z
 ddlmZ ddlmZmZmZmZ ddlmZmZ ddlmZmZmZ  G d	 d
      Z G d dej4                  j6                        Z G d dej4                  j6                        Z G d dej4                  j6                        Z	 	 	 	 d/dedeej>                     deej>                     dee    dee!   f
dZ" G d dej4                  j6                        Z# G d dej4                  j6                        Z$ G d dej4                  j6                        Z%	 	 d0dedeej>                     deej>                     fdZ&d Z'd efd!Z(d"ejR                  jT                  jV                  fd#Z,	 	 	 	 	 	 d1d$e-d%ej>                  d&e-d'e-d(ee   d)eej>                     d*ee-   d+e!d,e!d-eej>                     fd.Z.y)2    N)CallableOptional   )DynamicCacheDynamicLayerDynamicSlidingWindowLayerEncoderDecoderCacheStaticCache)GenerationConfig)ALL_MASK_ATTENTION_FUNCTIONS_ignore_causal_mask_sdpa#_is_torch_greater_or_equal_than_2_5prepare_padding_mask)ALL_ATTENTION_FUNCTIONSPreTrainedModel)is_torch_greater_or_equal"is_torch_greater_or_equal_than_2_3"is_torch_greater_or_equal_than_2_6c                   J    e Zd ZdZddedefdZd Zd Zd Zd Z	d	 Z
	 ddZy
)TorchExportableModuleForVLMa|  
    A wrapper class for exporting Vision-Language Models (VLMs) like SmolVLM2 for ExecuTorch.

    This class handles the export of three main components:
        1. Vision encoder (processes images to visual features)
        2. Connector/projector (maps visual features to text embedding space)
        3. Text decoder (generates text from combined visual and text tokens)
    max_batch_sizemax_cache_lenc                    || _         || _        || _        |j                  | _        |j                   j                  | _        |j                   j                  | _        |j                   j                  | _        d| _	        d| _
        d| _        y)a  
        Initialize the exportable VLM module.

        Args:
            model: The VLM (e.g. SmolVLM) model instance
            max_batch_size: Maximum batch size. Always 1 for ExecuTorch
            max_cache_len: Maximum cache length for text generation
        N)modelr   r   configvision_modelvision_encoder	connector
text_modeltext_decoderexported_vision_encoderexported_connectorexported_text_decoder)selfr   r   r   s       b/var/www/html/eduruby.in/venv/lib/python3.12/site-packages/transformers/integrations/executorch.py__init__z$TorchExportableModuleForVLM.__init__2   sw     
,*ll $kk66..!KK22 (,$"&%)"    c                    | j                   j                          t        j                  ddddt        j                        }dt        j
                  j                  j                  t        j
                  j                  j                  di}t        j
                  j                  | j                   |f|d      | _        | j                  S )	z$Export the vision encoder component.      i  dtypepixel_values)r   r*   Fargsdynamic_shapesstrict)	r   evaltorchrandnfloat32exportDimAUTOr!   )r$   r-   r0   s      r%   export_vision_encoderz1TorchExportableModuleForVLM.export_vision_encoderJ   s      " {{1acG <<##((<<##((
 (-||':':)	 (; (
$ +++r'   c                 &   | j                   j                          | j                  j                  j                  }| j                  j                  j
                  }| j                  j                  j                  }||z  }||z  }t        j                  d||t        j                        }ddt        j                  j                  j                  ii}t        j                  j                  | j                   |f|d      | _        | j                  S )zExport the connector component.r)   r+   image_hidden_statesFr.   )r   r2   r   vision_confighidden_size
image_size
patch_sizer3   r4   r5   r6   r7   r8   r"   )r$   vision_hidden_sizer>   r?   patches_per_dimnum_patchesr;   r0   s           r%   export_connectorz,TorchExportableModuleForVLM.export_connectorb   s     "[[66BB[[..99
[[..99
$
2%7#kk![:LTYTaTab 0!U\\5E5E5J5J1KL #(,,"5"5NN%')	 #6 #
 &&&r'   c                    t        | j                        | _        d}t        j                  d|ft        j
                        }t        j                  |t        j
                        }t        | j                  | j                  j                  j                        }t        j                  j                  d|dz
        }d|id|id}| j                  j                  |||d	
      | _        | j                  S )z"Export the text decoder component.)r   r*   r)   r+   seq_length_dimmaxr   	input_idscache_positionF)rI   rJ   r0   r1   )%TorchExportableModuleForDecoderOnlyLMr    exportable_text_decoderr3   zeroslongarangeminr   r   text_configmax_position_embeddingsr6   r7   r#   )r$   
seq_lengthrI   rJ   max_seq_lengthseq_len_dimr0   s          r%   export_text_decoderz/TorchExportableModuleForVLM.export_text_decoder{   s     (MSWSdSd'e$ 
KKJuzzB	j

CT//1H1H1`1`all&&'7^a=O&P [) +.

 &*%A%A%H%H))	 &I &
" )))r'   c                      | j                   di |  | j                  di |  | j                  di | | j                  | j                  | j
                  dS )z'Export all components of the VLM model.)r   r   r     )r9   rC   rV   r!   r"   r#   )r$   kwargss     r%   r6   z"TorchExportableModuleForVLM.export   s`    """,V,''   *6*"::00 66
 	
r'   c                      y)a  
        Simplified forward pass for inference with guaranteed non-null input_ids and cache_position.

        Args:
            pixel_values: Input images [1, channels, height, width] (optional)
            input_ids: Text token IDs [1, seq_len] (required - won't be None)
            cache_position: Cache positions [seq_len] (required - won't be None)

        Returns:
            Output with logits for text generation
        NrX   )r$   r-   rI   rJ   s       r%   forwardz#TorchExportableModuleForVLM.forward   s     	r'   Nc                      y)a  
        Simplified generate method with guaranteed non-null input_ids.

        Args:
            pixel_values: Input images [1, channels, height, width] (optional)
            input_ids: Initial text tokens [1, seq_len] (required - won't be None)
            max_new_tokens: Maximum number of tokens to generate
            do_sample: Whether to use sampling or greedy decoding
            temperature: Temperature for sampling

        Returns:
            Generated sequences
        NrX   )r$   r-   rI   max_new_tokens	do_sampletemperaturerY   s          r%   generatez$TorchExportableModuleForVLM.generate   s      	r'   )r)      )NN2   F      ?)__name__
__module____qualname____doc__intr&   r9   rC   rV   r6   r[   r`   rX   r'   r%   r   r   (   s?    *c *c *0,0'2*6	
 ber'   r   c                       e Zd ZdZ	 	 	 ddedee   dee   deej                     ddf
 fdZ		 	 	 dd	eej                     d
eej                     deej                     dej                  fdZ	 	 	 	 	 dd	eej                     d
eej                     deej                     dee   dee   dej                  j                  fdZe	 	 	 	 	 	 ddej                  j                  dedededededededefd       Z xZS )rK   a  
    A recipe module designed to make a `PreTrainedModel` exportable with `torch.export`,
    specifically for decoder-only LM with cache. This module ensures that the
    exported model is compatible with further lowering and execution in `ExecuTorch`.
    Nr   
batch_sizer   devicereturnc                    t         |           |j                  j                         }t	        |d      r|j
                  du rt        d      t	        |d      r!t        |dd      t        ||||      | _	        n(t        j                  d       t        ||||      | _	        t        j                  dt               t!        j                  dt         d	          d| j                  j                  j                  _        y)
z
        Initializes the exportable module.

        Args:
            model (`PreTrainedModel`): The pretrained model to wrap.

        Raises:
            ValueError: If the model is configured with a unsupported cache implementation.
        	use_cacheFz5The model must have caching enabled to be performant.layer_typessliding_windowNzmUsing `StaticCache` for export as `layer_types` is not specified or `sliding_window` is `null` in the config.sdpa_without_vmapsdpa)superr&   r   get_text_confighasattrrn   
ValueErrorgetattr$TorchExportableModuleWithHybridCacher   logginginfo$TorchExportableModuleWithStaticCacher   registersdpa_mask_without_vmapr   _attn_implementation)r$   r   rj   r   rk   r   	__class__s         r%   r&   z.TorchExportableModuleForDecoderOnlyLM.__init__   s      	--/v{+v/?/?5/HTUU6=)gf>NPT.U.a=eZQ^`fgDJ LL >eZQ^`fgDJ$--.ACYZ(()<>UV\>]^7J

4r'   rI   inputs_embedsrJ   c                 >    | j                   j                  |||      S )a  
        Forward pass of the module, which is compatible with the ExecuTorch llm runner.

        Args:
            input_ids (`torch.Tensor`): Tensor representing current input token id to the module.
            inputs_embeds (`torch.Tensor`): Tensor representing current input embeddings to the module.
            cache_position (`torch.Tensor`): Tensor representing current input position in the cache.

        Returns:
            torch.Tensor: Logits output from the model.
        )rI   r   rJ   )r   r[   )r$   rI   r   rJ   s       r%   r[   z-TorchExportableModuleForDecoderOnlyLM.forward   s)    " zz!!') " 
 	
r'   r0   r1   c                    |du |du z  st        d      t        | j                  d      rBt        | j                  | j                  j                  | j                        }|j
                  }nNt        | j                  d      r!| j                  j                  j
                  }nd}t        j                  d       |;|||n2t        j                  |j                  d   t        j                  |      d	}n:|||n2t        j                  |j                  d
   t        j                  |      d}t        j                  j                  | j                  d||||nd      }	|	S )aw  
        Export the wrapped module using `torch.export`.

        Args:
            input_ids (`Optional[torch.Tensor]`):
                Tensor representing current input token id to the module. Must specify either this or inputs_embeds.
            inputs_embeds (`Optional[torch.Tensor]`):
                Tensor representing current input embeddings to the module. Must specify either this or input_ids.
            cache_position (`Optional[torch.Tensor]`):
                Tensor representing current input position in the cache. If not provided, a default tensor will be used.
            dynamic_shapes (`Optional[dict]`):
                Dynamic shapes to use for export if specified.
            strict(`Optional[bool]`):
                Flag to instruct `torch.export` to use `torchdynamo`.

        Returns:
            torch.export.ExportedProgram: The exported program that can be used for inference.

        Examples:
            Export with input_ids:
            ```python
            # Prepare inputs
            input_ids = torch.tensor([[1, 2, 3]], dtype=torch.long, device=model.device)
            cache_position = torch.arange(input_ids.shape[-1], dtype=torch.long, device=model.device)

            # Export
            exported = exportable_module.export(
                input_ids=input_ids,
                cache_position=cache_position
            )
            ```

            Export with inputs_embeds:
            ```python
            # Prepare embeddings
            inputs_embeds = torch.randn(1, 3, 768, device=model.device)  # batch_size=1, seq_len=3, hidden_size=768
            cache_position = torch.arange(inputs_embeds.shape[1], dtype=torch.long, device=model.device)

            # Export
            exported = exportable_module.export(
                inputs_embeds=inputs_embeds,
                cache_position=cache_position
            )
            ```
        Nz2Need to specify either input_ids or inputs_embeds.base_model_prefixr   cpuzfTorchExportableModuleForDecoderOnlyLM.export Can't infer device from the model. Set to CPU by default.r,   rk   rH   r)   )r   rJ   rX   Tr/   rY   r0   r1   )rv   ru   r   rw   r   rk   ry   warningr3   rO   shaperN   r6   )
r$   rI   r   rJ   r0   r1   basemodel_deviceinput_kwargsexported_programs
             r%   r6   z,TorchExportableModuleForDecoderOnlyLM.export  sB   j T!mt&;<QRR4::234::tzz'C'CTZZPD;;LTZZ)::++22L LOOx  &!- #1\\)//""5UZZP\]	L "/!- #1\\-"5"5a"8

S_`	L !<<..JJ)#/6T / 
  r'   r   promptr]   r^   r_   top_ktop_pc	                    | j                         }	 ||d      j                  j                  |      }
|
j                         }d}t	        |
j
                  d         D ]F  }|
dd||dz   f   }t        j                  |gt        j                  |      } |	||      }|dz  }H t	        |      D ]  }|ddddf   }t        j                  |gt        j                  |      } |	||      }|r|dkD  r||z  }n|}|dkD  r-|t        j                  ||      d   d	   k  }t        d
      ||<   |dk  rt        j                  |d      \  }}t        j                  t        j                  |d      d      }||kD  }|dddf   j                         |dddf<   d|d<   |j                  d||      }t        d
      ||<   t        j                  |d      }t        j                  |d      }n|j!                  dd      }|j#                         dkD  r|j%                  d      }t        j&                  ||gd      }|dz  }|j)                         |j*                  k(  s n |j-                  |d   d      S )a   
        Generate a sequence of tokens using an exported program.

        Args:
            exported_program (`torch.export.ExportedProgram`): The exported model being used for generate.
            tokenizer: The tokenizer to use.
            prompt (str): The input prompt.
            max_new_tokens (int): Maximum number of new tokens to generate.
            do_sample (bool): Whether to use sampling or greedy decoding.
            temperature (float): The temperature for sampling.
            top_k (int): The number of highest probability tokens to keep for top-k sampling.
            top_p (float): The cumulative probability for nucleus sampling.
            device (str): The device to use.

        Returns:
            str: The generated text.
        pt)return_tensorsr   r)   Nr   rH   r   ).r   Nz-infrc   T)
descendingdim.).r   )num_samplesr   keepdimr   )skip_special_tokens)modulerI   tocloneranger   r3   tensorrN   topkfloatsortcumsumsoftmaxscattermultinomialargmaxr   squeezecatitemeos_token_iddecode)r   	tokenizerr   r]   r^   r_   r   r   rk   exported_modulerI   generated_idscurr_positionicurr_input_idscurr_cache_position_outputslogitsindices_to_removesorted_logitssorted_indicescumulative_probssorted_indices_to_removeprobsnext_token_ids                             r%   r`   z.TorchExportableModuleForDecoderOnlyLM.generatea  s   < +113 fT:DDGGO	 ") yq)* 	A&q!a!e)|4N"',,ejjY_"`  .I\]AQM	 ~& 5	A*1bc62N"',,ejjY_"` &ObcG ?${2F$F 19(.FE1J11Mm1\(\%05fF,- 3;49JJvRV4W1M>',||EMM-UW4X^`'a$ 0@%/G,8PQTVYWYVYQY8Z8`8`8b,S!"W578,V4 )A(H(H^]u(v%05fF,- f"5 % 1 1%Q G !(2t D   "Q& - 5 5b 9 "II}m&D"MMQM !!#y'='==k5	p a 0dKKr'   NNN)NNNNN)   Frc   rb   rc   r   )rd   re   rf   rg   r   r   rh   r3   rk   r&   Tensorr[   dictboolr6   ExportedProgramstaticmethodstrr   r`   __classcell__r   s   @r%   rK   rK      s    %)'+)-#K#K SM#K  }	#K
 &#K 
#KN -10415	
ELL)
  -
 !.	

 

2 -10415)-!%Z ELL)Z   -Z  !.	Z 
 !Z  Z  
	%	%Z x 
 ! iL,,66iL iL 	iL
 iL iL iL iL iL 
iL iLr'   rK   c                   B    e Zd ZdZ	 	 	 ddedee   dee   deej                     ddf
 fdZ		 	 	 dd	eej                     d
eej                     deej                     fdZedej                  j                  dej                  dedej                  fd       Z xZS )r{   a  
    A recipe module designed to make a `PreTrainedModel` exportable with `torch.export`,
    specifically for decoder-only LM to `StaticCache`. This module ensures that the
    exported model is compatible with further lowering and execution in `ExecuTorch`.

    Note:
        This class is specifically designed to support export process using `torch.export`
        in a way that ensures the model can be further lowered and run efficiently in `ExecuTorch`.
    Nr   rj   r   rk   rl   c                    t         |           |j                  j                         }|j                  }|t        d      |j                  st        d      |j                  dk7  rt        d      |j                  i n|j                  }||j                  dd      }|t        d      ||j                  dd      }|t        d	      ||j                  d
|j                        }|| _        t        ||      | _        t        |d|j                   |j"                  z        }t        |d|j"                        }	| j                  j$                  }
| j                  j'                  ||	||
|       t)        t+        | j                              D ]r  }| j-                  d| | j                  j.                  |   j0                  d       | j-                  d| | j                  j.                  |   j2                  d       t y)a  
        Initializes the wrapper module with the pretrained model.

        Args:
            model (`PreTrainedModel`): The pretrained model to wrap. The model must have caching
                enabled and use a 'static' caching implementation.
            batch_size (`Optional[int]`): The batch size of the model. If not provided, we check if a value can be found
                in `generation_config.cache_config` and otherwise we raise a ValueError.
            max_cache_len (`Optional[int]`): The maximum cache length for generation. Same mechanism as `batch_size` if
                not provided.
            device (`Optional[torch.device]`): The device to use. If not provided, we check if a value can be found
                in `generation_config.cache_config` and otherwise we use `model.device` (no error is raised).

        Raises:
            AssertionError: If the pretrained model does not have caching enabled or if it does
            not use a 'static' caching implementation in `model.generation_config`.
            ValueError: If `batch_size` or `max_cache_len` is not provided, either as an argument or in `cache_config`.
        NvThe model must have a generation config to be exported with static caching. Please set `generation_config` in `model`.zvThe model must have caching enabled to be exported with static caching. Please set `generation_config.use_cache=True`.staticzThe model must use a 'static' caching implementation to be exported with static caching. Please set `generation_config.cache_implementation='static'`.rj   Fbatch_size must be provided, either as an argument or in cache_config.r   Imax_cache_len must be provided, either as an argument or in cache_config.rk   )r   r   head_dimnum_key_value_heads
key_cache_F
persistentvalue_cache_)rs   r&   r   rt   generation_configAssertionErrorrn   cache_implementationcache_configgetrv   rk   r   r
   static_cacherw   r=   num_attention_headsr,   early_initializationr   lenregister_bufferlayerskeysvaluesr$   r   rj   r   rk   r   r   r   r   	num_headsr,   r   r   s               r%   r&   z-TorchExportableModuleWithStaticCache.__init__  s   2 	--/!33 $ =  !** A  11X= P 
 /;;CrIZIgIg %)),=J! !ijj (,,_dCM$ !lmm>!%%h=F 
'mFS6:v/A/AVE_E_/_`F$96;U;UV	

  ..z9hPUW]^s4,,-. 	kA  :aS!143D3D3K3KA3N3S3S`e f  <s!3T5F5F5M5Ma5P5W5Wdi j	kr'   rI   r   rJ   c                     | j                   }| j                  |||d|d      }t        |d      r|j                  S |j                  S )a8  
        Forward pass of the module, which is compatible with the ExecuTorch runtime.

        Args:
            input_ids (`torch.Tensor`): Tensor representing current input token id to the module.
            inputs_embeds (`torch.Tensor`): Tensor representing current input embeddings to the module.
            cache_position (`torch.Tensor`): Tensor representing current input position in the cache.

        Returns:
            torch.Tensor: Logits output from the model.

        This forward adapter serves two primary purposes:

        1. **Making the Model `torch.export`-Compatible**:
            The adapter hides unsupported objects, such as the `Cache`, from the graph inputs and outputs,
            enabling the model to be exportable using `torch.export` without encountering issues.

        2. **Ensuring Compatibility with `ExecuTorch` runtime**:
            The adapter matches the model's forward signature with that in `executorch/extension/llm/runner`,
            ensuring that the exported model can be executed in `ExecuTorch` out-of-the-box.
        NTrI   r   rJ   attention_maskpast_key_valuesrn   r   )r   r   ru   r   last_hidden_state)r$   rI   r   rJ   r   outss         r%   r[   z,TorchExportableModuleWithStaticCache.forward#  sX    6 ++zz')+  
 4";; )))r'   r   prompt_token_idsr]   c           	      f   |j                   }|j                  d   }||z   }| j                         D ]3  \  }}|j                  d      s|j                  d   }t	        ||      } n g }	t        t	        ||            D ]y  }
| j                         j                  |dd|
|
dz   f   t        j                  |
gt        j                  |            }|	j                  |d   |
   j                                { t        j                  dddddf   d	      j                         }|	j                  |       t        |	      |k  r| j                         j                  t        j                  |ggt        j                  |      t        j                  t        |	      gt        j                  |            }t        j                  |dddddf   d	      j                         }|	j                  |       t        |	      |k  rt        j                  |	gt        j                  |      S )
a  
        Generate a sequence of tokens using an exported program.

        This util function is designed to test exported models by simulating the generation process.
        It processes the input prompt tokens sequentially (no parallel prefill).
        This generate function is not intended to replace the original `generate` method, and the support
        for leveraging the original `generate` is potentially planned!

        Args:
            exported_program (`torch.export.ExportedProgram`): The exported program generated via `torch.export`.
            prompt_token_ids (`torch.Tensor`): Tensor representing the input prompt token IDs.
            max_new_tokens (`int`): Maximum number of new tokens to generate. Note that the total generation
                length is limited by both `max_new_tokens` and the model's cache size.

        Returns:
            torch.Tensor: A tensor containing the generated sequence of token IDs, including the original prompt tokens.
        r   	key_cacher   Nr)   r   rH   r   r   )rk   r   named_buffers
startswithrP   r   r   r[   r3   r   rN   appendr   r   r   )r   r   r]   rk   prompt_token_lenmax_generation_lengthbuffer_namebufferr   response_tokens	input_posresultcurrent_tokens                r%   r`   z-TorchExportableModuleWithStaticCache.generateO  s    . "((+11"5 0> A#3#A#A#C 	K%%k2 &Q(+,A=(Q%		 s#8:JKL 	JI%,,.66*1i)a-.G+GH$||YKuzzRXY 7 F ""#3A#6y#A#F#F#HI	J VAr1H%52>CCE}-/"%::%,,.66,,'8

SYZ$||S-A,B%**]cd 7 F "LL2q)9rBGGIM""=1 /"%:: ||_-UZZOOr'   r   )rd   re   rf   rg   r   r   rh   r3   rk   r&   
LongTensorr   r[   r   r6   r   r`   r   r   s   @r%   r{   r{     s     %)'+)-HkHk SMHk  }	Hk
 &Hk 
HkX 150415	**E,,-**  -** !.	**X 2P,,662P,,2P 2P 
	2P 2Pr'   r{   c                        e Zd ZdZ	 	 	 ddedee   dee   deej                     ddf
 fdZ		 	 	 dd	eej                     d
eej                     deej                     dej                  fdZ xZS )rx   a  
    A recipe module designed to make a `PreTrainedModel` exportable with `torch.export`,
    specifically for decoder-only LM to hybrid `StaticCache`. This module ensures that the
    exported model is compatible with further lowering and execution in `ExecuTorch`.
    Nr   rj   r   rk   rl   c                    t         |           || _        |j                  j	                         }|j
                  }|t        d      |j                  st        d      |j                  i n|j                  }||j                  dd      }|t        d      ||j                  dd      }|t        d      ||j                  d|j                        }t        ||	      | _        t        |d
|j                  |j                   z        }t        |d|j                         }	| j                  j"                  }
| j                  j%                  ||	||
|       t'        t)        | j                              D ]r  }| j+                  d| | j                  j,                  |   j.                  d       | j+                  d| | j                  j,                  |   j0                  d       t y)a  
        Initializes the exportable module.

        Args:
            model (`PreTrainedModel`): The pretrained model to wrap.
            batch_size (`Optional[int]`): The batch size of the model. If not provided, we check if a value can be found
                in `generation_config.cache_config` and otherwise we raise a ValueError.
            max_cache_len (`Optional[int]`): The maximum cache length for generation. Same mechanism as `batch_size` if
                not provided.
            device (`Optional[torch.device]`): The device to use. If not provided, we check if a value can be found
                in `generation_config.cache_config` and otherwise we use `model.device` (no error is raised).
        Raises:
            AssertionError: If the model doesn't have the expected configuration for hybrid StaticCache.
            ValueError: If `batch_size` or `max_cache_len` is not provided, either as an argument or in `cache_config`.
        Nr   z Model must have caching enabled.rj   r   r   r   rk   r   r   r   r   r   Fr   r   )rs   r&   r   r   rt   r   r   rn   r   r   rv   rk   r
   cacherw   r=   r   r,   r   r   r   r   r   r   r   r   s               r%   r&   z-TorchExportableModuleWithHybridCache.__init__  s   , 	
--/!33 $ =   !CDD.;;CrIZIgIg%)),=J! !ijj (,,_dCM$ !lmm>!%%h=F !mL
6:v/A/AVE_E_/_`F$96;U;UV	

  

''
IxPVW s4::' 	dA  :aS!14::3D3DQ3G3L3LY^ _  <s!3TZZ5F5Fq5I5P5P]b c	dr'   rI   r   rJ   c                 \    | j                  |||d| j                  d      }|j                  S )a  
        Forward pass of the module, which is compatible with the ExecuTorch llm runner.

        Args:
            input_ids (`torch.Tensor`): Tensor representing current input token id to the module.
            inputs_embeds (`Optional[torch.Tensor]`): Tensor representing current input embeddings to the module.
            cache_position (`torch.Tensor`): Tensor representing current input position in the cache.

        Returns:
            torch.Tensor: Logits output from the model.
        NTr   )r   r   r   )r$   rI   r   rJ   r   s        r%   r[   z,TorchExportableModuleWithHybridCache.forward  s9    $ **') JJ  
 ~~r'   r   )rd   re   rf   rg   r   r   rh   r3   rk   r&   r   r   r[   r   r   s   @r%   rx   rx     s     %)'+)-=d=d SM=d  }	=d
 &=d 
=dB 150415	E,,-  - !.	
 
r'   rx   r   example_input_idsexample_cache_positionr0   r1   c                    t         st        d      ddl}t        j                  dt
               t        j                  dt        d          d| j                  _         |j                         5  ||n* |j                  dgg|j                  | j                        }||n) |j                  dg|j                  | j                        }t        d      r1|j                  j                  t        |       d	||d
|||nd      }nd|t!        j"                  d       |t!        j"                  d       |j                  j$                  j'                  t        |       d	||d
dd      }|cddd       S # 1 sw Y   yxY w)a  
    Convert a `PreTrainedModel` into an exportable module and export it using `torch.export`,
    ensuring the exported model is compatible with `ExecuTorch`.

    Args:
        model (`PreTrainedModel`): The pretrained model to be exported.
        example_input_ids (`Optional[torch.Tensor]`): Example input token id used by `torch.export`.
        example_cache_position (`Optional[torch.Tensor]`): Example current cache position used by `torch.export`.
        dynamic_shapes(`Optional[dict]`): Dynamic shapes used by `torch.export`.
        strict(`Optional[bool]`): Flag to instruct `torch.export` to use `torchdynamo`.

    Returns:
        Exported program (`torch.export.ExportedProgram`): The exported program generated via `torch.export`.
    torch >= 2.3 is required.r   Nrq   rr   r)   r   z2.6.0rX   rH   Tr   zWDynamic shapes spec will be ignored by convert_and_export_with_cache for torch < 2.6.0.zSThe strict flag will be ignored by convert_and_export_with_cache for torch < 2.6.0.F)r/   rY   pre_dispatchr1   )r   ImportErrortorch.export._tracer   r|   r}   r   r   r~   no_gradr   rN   rk   r   r6   r{   ry   r   _trace_export)r   r   r   r0   r1   r3   r   s          r%   convert_and_export_with_cacher    s   * .566 !))*=?UV$$%8:QRX:YZ(;ELL%	 '  !, se5::ellK 	 &1 #qcELLI 	 %W-$||224U;%6J`a-!'!3v  3   )m ! uv
  %||22::4U;%6J`a"  ;    O'  '  ' s   /C?E88Fc                   (     e Zd ZdZ fdZd Z xZS ) Seq2SeqLMEncoderExportableModulez
    A wrapper module designed to make a Seq2Seq LM encoder exportable with `torch.export`.
    This module ensures that the exported encoder model is compatible with ExecuTorch.
    c                 0    t         |           || _        y N)rs   r&   encoder)r$   encoder_modelr   s     r%   r&   z)Seq2SeqLMEncoderExportableModule.__init__9  s    $r'   c                 :    | j                  |      j                  S )N)rI   )r  r   )r$   rI   s     r%   r[   z(Seq2SeqLMEncoderExportableModule.forward=  s    ||i|0BBBr'   rd   re   rf   rg   r&   r[   r   r   s   @r%   r	  r	  3  s    
%Cr'   r	  c                   (     e Zd ZdZ fdZd Z xZS )/Seq2SeqLMDecoderExportableModuleWithStaticCachez
    A wrapper module designed to make a Seq2Seq LM decoder exportable with `torch.export`,
    specifically for use with static caching. This module ensures the exported decoder
    is compatible with ExecuTorch.
    c                    t         |           |j                         | _        |j                  | _        |j
                  | _        t        |j                               j                  }t        | j
                  |      | _
        t        | j
                  d| j
                  j                  | j
                  j                  z        }t        | j
                  d| j
                  j                        }| j                  j                  |||t        j                   |       t#        | j                  t%        | j
                              | _        t)                t+        t-        | j                              D ]r  }| j/                  d| | j                  j0                  |   j2                  d       | j/                  d| | j                  j0                  |   j4                  d       t y )	Nr   r   r   r   r   Fr   r   )rs   r&   get_decoderdecoderlm_headr   next
parametersrk   r
   r   rw   r=   r   r   r3   r5   r	   r   r   %register_dynamic_cache_export_supportr   r   r   r   r   r   )	r$   r   max_static_cache_lengthrj   r   r   r   r   r   s	           r%   r&   z8Seq2SeqLMDecoderExportableModuleWithStaticCache.__init__H  s    ((*}}ll E,,./66 (t{{Jab4;;
DKK4K4Kt{{OnOn4noDKK)>@_@_`	..z9hPUP]P]_kl():):LPTP[P[<\]
-/ s4,,-. 	kA  :aS!143D3D3K3KA3N3S3S`e f  <s!3T5F5F5M5Ma5P5W5Wdi j	kr'   c                 n    | j                  ||| j                  d|      }| j                  |d         }|S )NT)rI   encoder_hidden_statesr   rn   rJ   r   )r  r   r  )r$   decoder_input_idsr  rJ   r   	lm_logitss         r%   r[   z7Seq2SeqLMDecoderExportableModuleWithStaticCache.forwardb  sB    ,,'"7 JJ)  
 LL,	r'   r  r   s   @r%   r  r  A  s    k4r'   r  c                   <     e Zd Z	 d fd	Zd Zd ZddZd Z xZS )Seq2SeqLMExportableModulec                     t         |           || _        |j                         | _        |j
                  | _        || _        t        d||||d      | _        d | _	        d | _
        y )NT)rj   r   )rn   
max_lengthr   r   )rs   r&   
full_modelget_encoderr  r   max_hidden_seq_lengthr   r   exported_encoderexported_decoder)r$   r   rj   r%  r   max_cache_lengthr   s         r%   r&   z"Seq2SeqLMExportableModule.__init__s  sm     	((*ll%:"!1'!5(!1	"
 !% $r'   c                    t        | j                        j                  | j                  j                        j                         }t        j                  j                  d| j                        }t        j                         5  t        j                  j                  ||fdd|iid      }d d d        |S # 1 sw Y   S xY w)Nencoder_seq_lengthrF   rI   r)   Tr0   r1   )r	  r  r   r#  rk   r2   r3   r6   r7   r%  r  )r$   encoder_input_idswrapped_encoderrU   r&  s        r%   _export_encoderz)Seq2SeqLMExportableModule._export_encoder  s    :4<<HKKDOOLbLbchhj ll&&';A[A[&\ ]]_ 	$||22"3!5{UVXcTdFenr  3  	
  	
  s   )B99Cc           	         | j                   j                  }t        | j                   | j                  j                  j                  d      | j                  j                  j                  d            j                  |      j                         }|j                  |      }|j                  |      }|j                  |      }t        j                  j                  d| j                        }t        j                         5  t        j                  j                  ||||fd d|id dd	      }d d d        |S # 1 sw Y   S xY w)
Nr   rj   )r   r  rj   encoder_hidden_seq_lengthrF   r)   )r  r  rJ   Tr+  )r#  rk   r  r   r   r   r   r2   r3   r6   r7   r%  r  )r$   r  r  rJ   target_devicewrapped_decoderencoder_seq_len_dimr'  s           r%   _export_decoderz)Seq2SeqLMExportableModule._export_decoder  s1   ..;oo(,(>(>(K(K(O(OP_(`11>>BB<P
 RTV 	 .00? 5 8 8 G'**=9 $ll../JPTPjPj.k ]]_ 
	$||22"$9>J)-./1D-E&* 
   3 	 
	  
	  s   -D;;Ec                 X   | j                   j                  }||n%t        j                  dt        j                  |      }||n't        j
                  dggt        j                  |      }||n&t        j
                  dgt        j                  |      }||n_t        j                  | j                  j                  j                  d      d| j                  j                  ft        j                  |      }	| j                  |      | _        | j                  ||	|      | _        | S )N)r)   
   r   r   rj   r6  )r#  rk   r3   onesrN   r   rM   r   r   r   r   d_modelr5   r.  r&  r4  r'  )
r$   r,  r  r  rJ   rk   example_encoder_input_idsexample_decoder_input_idsr   example_encoder_hidden_statess
             r%   r6   z Seq2SeqLMExportableModule.export  s   '' !, G5::fE 	" !, se5::fE 	" -8NellA3V[V`V`io>p 	
 %0 "''4488FDKKL_L_`mm 	& !% 4 45N O $ 4 4%'DF\!

 r'   c                    t        j                         5  | j                  j                  }|j                  |k7  r|j	                  |      } | j
                  j                         |      }t        j                  dggt         j                  |      }dg}t        |dz
        D ]  } | j                  j                         ||t        j                  |gt         j                  |            }t        j                  |d d dd d f   d      j                         }	|j                  |	       t        j                  |	ggt         j                  |      }|	| j                  j                  k(  s n |cd d d        S # 1 sw Y   y xY w)Nr   r   r)   r   r   )r3   r  r#  rk   r   r&  r   r   rN   r   r'  r   r   r   r   r   )
r$   r   r]   r   encoder_outputr  r   r   r   
next_tokens
             r%   r`   z"Seq2SeqLMExportableModule.generate  sR   ]]_  	!??11L  &&,6#3#6#6|#D  <T2299;<LMN !&qcU%**\ ZCM >A-. 7..557%~u||QCuzzbn7o
 #\\&B*:CHHJ
$$Z0 %*LL:,uzzZf$g! !9!99" !A 	!  	!  	!s   EE?1E??F)r)   i   r   ra   NNNN)	rd   re   rf   r&   r.  r4  r6   r`   r   r   s   @r%   r   r   r  s!    os%* ! F@!!r'   r   example_attention_maskc           
         t         st        d      t        j                  dt               t        j                  dt
        d          d| j                  _        t                t        j                         5  t        j                  j                  | d||t        | j                        ddd	      }|cd
d
d
       S # 1 sw Y   y
xY w)a  
    Export a model with DynamicCache using `torch.export`, ensuring the exported model is compatible with `ExecuTorch`.

    Args:
        model (`PreTrainedModel`): The pretrained model to be exported.
        example_input_ids (`Optional[torch.Tensor]`): Example input token id used by `torch.export`.
        example_attention_mask (`Optional[torch.Tensor]`): Example attention mask used by `torch.export`.

    Returns:
        Exported program (`torch.export.ExportedProgram`): The exported program generated via `torch.export`.
    r   rq   rr   rX   r  T)rI   r   r   rn   F)r1   N)r   r  r   r|   r}   r   r   r~   r  r3   r  r6   r   )r   r   r@  r   s       r%   export_with_dynamic_cacherB    s      .566 !))*=?UV$$%8:QRX:YZ(;ELL%)+	   <<..."8#/u||#D!	  / 

       s   8>C  C	c                  ^   	 t         j                  j                  j                  t        d t
        t        j                   dt        j                   d        t         j                  j                  j                  t        d        y# t        $ r} dt        |       vr Y d} ~ yd} ~ ww xY w)z>
    Utilities for `DynamicCache` <> torch.export support
    c                 f    t         j                  j                  j                  t	        |             S r  )r3   utils_pytree_dict_flatten_get_cache_dictdynamic_caches    r%   <lambda>z7register_dynamic_cache_export_support.<locals>.<lambda>.  s!    %++"5"5"C"COTaDb"c r'   .c                 f    t         j                  j                  j                  t	        |             S r  )r3   rE  rF  _dict_flatten_with_keysrH  rI  s    r%   rK  z7register_dynamic_cache_export_support.<locals>.<lambda>1  s#    u{{7J7J7b7b.8 r'   )serialized_type_nameflatten_with_keys_fnc                 h    t         j                  j                  j                  t	        |       |      S r  )r3   fxrF  _dict_flatten_specrH  )r   specs     r%   rK  z7register_dynamic_cache_export_support.<locals>.<lambda>8  s$     0 0 C COTYDZ\` a r'   z!already registered as pytree nodeN)r3   rE  rF  register_pytree_noder   _unflatten_dynamic_cachere   rd   rR  register_pytree_flatten_specrv   r   )es    r%   r  r  &  s    
00c$$0$;$;#<Al>S>S=T!U" 	1 	
 	55a	

  .c!f< =s   BB 	B,B''B,r   c                 `   t        d | j                  D              rt        d      t        st	        j
                  d       | j                  D cg c]  }|j                  |j                   c}| j                  D cg c]  }|j                  |j                   c}dS c c}w c c}w )z9Convert cache to dictionary format for pytree operations.c              3   J   K   | ]  }t        |t        t        f         y wr  )
isinstancer   r   ).0layers     r%   	<genexpr>z"_get_cache_dict.<locals>.<genexpr>B  s!     
fPUz%,0I!JKK
fs   !#zFThis pytree flattening function should only be applied to DynamicCachez[DynamicCache + torch.export is tested on torch 2.6.0+ and may not work on earlier versions.)r   value_cache)anyr   RuntimeErrorr   ry   r   r   r   )r   r]  s     r%   rH  rH  @  s    

fY^YeYe
ffcdd-uv /4llUUejj>TejjU27,,[%,,BZ[ U[s   B&#B&B+B+contextc                    t         j                  j                  j                  | |      }t	               }|j                  dg       }|j                  dg       }t        t        t        |      t        |                  D ]?  }|t        |      k  r||   nd }|t        |      k  r||   nd }|j                  |||       A |S )Nr   r_  )
r3   rE  rF  _dict_unflattenr   r   r   rG   r   update)	r   rb  
dictionaryr   key_list
value_listidxkeyvalues	            r%   rV  rV  N  s    $$44VWEJNE~~k2.Hr2JSXJ89 &"S]2hsm#&Z#8
3dS%%& Lr'   rj   rJ   	kv_length	kv_offsetmask_functionr   
local_sizeallow_is_causal_skipallow_torch_fixrl   c	                 &   |j                   d   }
t        |||      }|rt        ||
||      ryt        j                  ||j
                        }||z  }|j                  dd      }t        |	d   dd      }t        |	d   dd      }||t        d	      ||k  }||||z
  kD  }||z  }n|||z  ||z  k(  }||z  }|ddddddf   j                  | ddd      }|||ddddddf   z  }t        s|r|t        j                  | dd
      z  }|S )a  
    Create a 4D boolean mask of shape `(batch_size, 1, query_length, kv_length)` where a value of True indicates that
    the element should take part in the attention computation, and False that it should not.

    This is similar to `masking_utils.sdpa_mask` but does not use `vmap` which is incompatible with export.

    Args:
        batch_size (`int`):
            The batch size of the input sequence.
        cache_position (`torch.Tensor`):
            A tensor of shape (query_length,) indicating the current indices of the input sequence elements.
        kv_length (`int`):
            The size that the key and value states will have during the attention computation.
        kv_offset (`int`, optional):
            An optional offset to indicate at which first position the key and values states will refer to.
        mask_function (`Callable`):
            The mask factory function describing the mask pattern.
        attention_mask (`torch.Tensor`, optional):
            The 2D attention mask corresponding to padded tokens of shape (batch_size, number_of_seen_tokens+q_length)
        local_size (`int`, optional):
            The size of the local attention, if we do not use full attention. This is used only if `allow_is_causal_skip=True`
            to try to skip mask creation if possible.
        allow_is_causal_skip (`bool`, optional):
            Whether to allow to return `None` for the mask under conditions where we can use the `is_causal` argument in
            `torch.sdpa` instead. Default to `True`.
        allow_torch_fix (`bool`, optional):
            Whether to update the mask in case a query is not attending to any tokens, to solve a bug in torch's older
            versions. We need an arg to skip it when using eager. By default `True`.

    r   N)rk   r   r)   r   rp   attention_chunk_sizez;Cannot use both `sliding_window` and `attention_chunk_size`Tr   )r   r   r   r3   rO   rk   viewrw   rv   expandr   all)rj   rJ   rl  rm  rn  r   ro  rp  rq  rY   q_lengthpadding_mask	kv_arangereshaped_cache_positionrp   
chunk_sizecausal_masksliding_mask_overlaychunked_mask_overlays                      r%   r}   r}   [  sp   V ##A&H'	9ML  8xQZ\f g Y~/D/DEII,11"a8 VH-/?FN)+A4HJ!j&<VWW 66K!(+B^+SS++		(J6:QU_:__++dD!Q./66z2r2NK!LD$1A$BB /?uyy+2tDDr'   r?  )NN)r   NNNTT)/ry   typingr   r   r3   cache_utilsr   r   r   r	   r
   generation.configuration_utilsr   masking_utilsr   r   r   r   modeling_utilsr   r   pytorch_utilsr   r   r   r   nnModulerK   r{   rx   r   r   r   r  r	  r  r   rB  r  rH  rE  rF  ContextrV  rh   r}   rX   r'   r%   <module>r     s=    %   >  F W WtILEHHOO ILXtP588?? tPnb588?? bN 1559%)!F F -F  %U\\2F  TN	F 
 TNF RCuxx C.ehhoo .bH! H!Z 1559& & -&  %U\\2& R4< 
ekk.A.A.I.I 
" (,-1 $!% VVLLV V 	V
 H%V U\\*V V V V ellVr'   