
    ho                       d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Z	d dl
Z
d dlZd dlmZmZ d dlmZmZ d dlmZmZ d dlmZ d dl
mZ d dlmZmZ d dlmZ d d	lmZ d dlZd d
lm Z m!Z! d dl"m#Z#m$Z$m%Z%m&Z&m'Z'm(Z( ddl)m*Z*m+Z+m,Z,m-Z-m.Z. ddl/m0Z0m1Z1 ddl)m2Z2  e0       rd dl3Z3d dlm4Z4m5Z5m6Z6m7Z7 ddl8m9Z9m:Z:  e$       rd dl;Z; e(       rd dl<m=Z=  e&       xr  e#       xr  e'       xr  e%       Z>e>rd dl?Z?d dl@mAZAmBZB d dlCmDZD d dlEmFZFmGZG d dlHmIZI d dlJmKZK d dlLmMZM d dlNmOZOmPZPmQZQmRZRmSZS d dlTmUZU d dlVmWZWmXZXmYZYmZZZm[Z[m\Z\m]Z]m^Z^m_Z_m`Z`maZambZbmcZcmdZdmeZe d dlfmgZg d dlhmiZimjZjmkZk  G d d egd!"      Zl G d# d$eUd!"      Zm G d% d&eKd!"      Zn ejel      Zo ejem      Zp ejen      Zqh d'Zrh d(Zsh d)Zt e1j                  ev      Zwd*d+d,d-iZx eyexj                               Z{ G d. d/ej                        Z}d0efd1Z~d2ed3d4d5d4fd6Z G d7 d8      Z G d9 d:      Ze G d; d<             Z G d= d>e2      Zevd?k(  r e       Zej                          yy)@    N)ArgumentParser	Namespace)	GeneratorIterable)	dataclassfield)BytesIO)Thread)OptionalUnion)
model_info)HF_HUB_OFFLINE)!MODEL_FOR_CAUSAL_LM_MAPPING_NAMES*MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES)is_fastapi_availableis_librosa_availableis_openai_availableis_pydantic_availableis_uvicorn_availableis_vision_available   )
AutoConfigLogitsProcessorListPreTrainedTokenizerFastProcessorMixinTextIteratorStreamer)is_torch_availablelogging   )BaseTransformersCLICommand)AutoProcessorBitsAndBytesConfigGenerationConfigPreTrainedModel)ContinuousBatchingManagerRequestStatus)Image)FastAPIHTTPException)CORSMiddleware)JSONResponseStreamingResponse)Transcription)TranscriptionCreateParamsBase)ChatCompletionMessageParam)ChatCompletionChunkChoiceChoiceDeltaChoiceDeltaToolCallChoiceDeltaToolCallFunction)CompletionCreateParamsStreaming)ResponseResponseCompletedEventResponseContentPartAddedEventResponseContentPartDoneEventResponseCreatedEventResponseErrorResponseErrorEventResponseFailedEventResponseInProgressEventResponseOutputItemAddedEventResponseOutputItemDoneEventResponseOutputMessageResponseOutputTextResponseTextDeltaEventResponseTextDoneEvent)ResponseCreateParamsStreaming)	BaseModelTypeAdapterValidationErrorc                       e Zd ZU dZeed<   y))TransformersResponseCreateParamsStreamingz
        OpenAI's ResponseCreateParamsStreaming with an additional field for the generation config (as a json string).
        generation_configN__name__
__module____qualname____doc__str__annotations__     [/var/www/html/eduruby.in/venv/lib/python3.12/site-packages/transformers/commands/serving.pyrJ   rJ   w       	 rT   rJ   F)totalc                       e Zd ZU dZeed<   y)+TransformersCompletionCreateParamsStreamingz
        OpenAI's CompletionCreateParamsStreaming with an additional field for the generation config (as a json string).
        rK   NrL   rS   rT   rU   rY   rY   ~   rV   rT   rY   c                   :    e Zd ZU dZeed<   eed<   dZee	   ed<   y)%TransformersTranscriptionCreateParamsz
        OpenAI's TranscriptionCreateParamsBase with an additional field for the generation config (as a json string).
        filerK   FstreamN)
rM   rN   rO   rP   bytesrR   rQ   r]   r   boolrS   rT   rU   r[   r[      s"    	 !&&rT   r[   >   textuserstorepromptinclude	reasoning
background
truncationtool_choiceservice_tiertop_logprobsmax_tool_callsprevious_response_id>   nstopra   audiorb   logprobsmetadata	functions
modalities
predictionrh   ri   rj   function_callstream_optionsresponse_formatpresence_penaltyreasoning_effortweb_search_optionsparallel_tool_callsmax_completion_tokens>   rc   rd   languagerw   chunking_strategytimestamp_granularitiesqwenz<tool_call>z</tool_call>)startendc                       e Zd ZdZdZdZdZy)ModalityLLMVLMSTTTTSN)rM   rN   rO   r   r   r   r   rS   rT   rU   r   r      s    
C
C
C
CrT   r   argsc                     t        |       S )z~
    Factory function used to instantiate serving server from provided command line arguments.

    Returns: ServeCommand
    )ServeCommand)r   s    rU   serve_command_factoryr      s     rT   reqmodel_generation_configr#   returnc                 `   | j                  d      "t        di t        j                  | d         }nt	        j
                  |      } |j                  di |}|j                         D ]  \  }}|	t        |||        | j                  d      t        | d         |_
        | j                  d      t        | d         |_
        | j                  d      t        | d         |_        | j                  d      
| d   |_        | j                  d      
| d   |_        | j                  d      +t        | d         |_        t        | d         dk(  rd	|_        | j                  d
      t        | d
         |_        | j                  d      t%        j&                  | d          |S )a  
    Creates a generation config from the parameters of the request. If a generation config is passed in the request,
    it will be used as a baseline for parameterization. Otherwise, we will use the model's default generation config.
    Other parameters in the request will be applied on top of the baseline.

    Args:
        req (`dict`):
            The request which may optionally contain generation parameters.
        model_generation_config (`GenerationConfig`):
            The model's default generation config.
        kwargs (`dict`):
            Additional parameters to set in the generation config.

    Returns:
        The prepared `GenerationConfig` object.
    rK   max_output_tokens
max_tokensfrequency_penalty
logit_biasrn   temperatureg        Ftop_pseedrS   )getr#   jsonloadscopydeepcopyupdateitemssetattrintmax_new_tokensfloatrepetition_penaltysequence_biasstop_stringsr   	do_sampler   torchmanual_seed)r   r   kwargsrK   non_standard_kwargskvs          rU   !create_generation_config_from_reqr      s   . ww"#/,Ttzz#>Q:R/ST MM*AB2+22<V<#))+ -1=%q!,-
 ww"#/+.s3F/G+H( ww|(+.s</@+A(
ww"#//4S9L5M/N,
ww|(*-l*;'
wwv"),V&
ww})(-c-.@(A%]#$+*/'
www#"'G"5
wwv"#f+&rT   c                       e Zd ZdZd Zd Zy)	ToolStatez7Lightweight class to keep track of the tool call state.c                 $    | j                          y N)resetselfs    rU   __init__zToolState.__init__!  s    

rT   c                 <    d| _         d| _        d| _        d| _        y)z>Reset the tool call state (assumes we're outside a tool call).Fr    N)inside_tool_callhas_tool_name_definedarg_nesting_levelbufferr   s    rU   r   zToolState.reset$  s!     %%*"!"rT   N)rM   rN   rO   rP   r   r   rS   rT   rU   r   r     s    ArT   r   c            	       F    e Zd ZdZ	 ddddedeed      fdZd	 Zd
 Z	d Z
y)
TimedModelz
    A class that holds a PreTrainedModel instance and its associated processor.
    Automatically deletes the instances after a specified timeout.
    Nmodelr$   timeout_seconds	processor)r   r   c                     || _         t        |j                        | _        || _        || _        t        j                  | j
                  | j                        | _	        | j                  j                          y r   )r   rQ   name_or_path_name_or_pathr   r   	threadingTimer_delete_model_timerr   )r   r   r   r   s       rU   r   zTimedModel.__init__2  s[     
 !3!34".ood&:&:D<N<NOrT   c                     | j                   j                          t        j                  | j                  | j
                        | _         | j                   j                          y)z2Reset the timer for the deletion of the instances.N)r   cancelr   r   r   r   r   r   s    rU   reset_timerzTimedModel.reset_timer?  s@    ood&:&:D<N<NOrT   c                 Z   t        | d      r| j                  | `| `d| _        d| _        t        j                          t
        j                  j                         rt
        j                  j                          t        j                  | j                   d| j                   d       yyy)z>Delete the wrapped model and processor and clean up resources.r   Nz was removed from memory after z seconds of inactivity)hasattrr   r   gccollectr   cudais_availableempty_cacheloggerinfor   r   r   s    rU   r   zTimedModel._delete_modelE  s    4!djj&<
DJ!DNJJL zz&&(

&&(KK%%&&EdFZFZE[[qr '=!rT   c                 <    t        | d       xs | j                  du S )z)Check if the instances have been deleted.r   N)r   r   r   s    rU   
is_deletedzTimedModel.is_deletedV  s     4))?TZZ4-??rT   r   )rM   rN   rO   rP   r   r   r   r   r   r   r   rS   rT   rU   r   r   ,  sJ     SW	   E"MNO	"@rT   r   c                      e Zd ZU dZ edddi      Zeed<    eddg d	d
      Ze	e   ed<    eddg d	d
      Z
e	e   ed<    edddi      Zeed<    edddi      Ze	e   ed<    edddi      Zeed<    edddi      Zeed<    eddddgd
      Zeed<    edddi      Zeed<    edddi      Zeed<    ed dd!i      Zeed"<    ed#dd$i      Zeed%<    ed&dd'i      Zeed(<    eddd)i      Ze	e   ed*<    eddd+i      Zeed,<    eddd-i      Zeed.<    eddd/i      Ze	e   ed0<   d1 Zy)2ServeArgumentsz
    Arguments for the serve CLI.

    See the metadata arg for each argument's description -- the metadata will be printed with
    `transformers serve --help`
    autohelpzfDevice to use for inference; will default to `auto` andplace the model on an accelerator if available.)defaultrq   deviceNzA`torch_dtype` is deprecated! Please use `dtype` argument instead.)r   bfloat16float16float32)r   choicestorch_dtypezOverride the default `torch.dtype` and load the model under this dtype. If `'auto'` is passed, the dtype will be automatically derived from the model's weights.dtypeFz2Whether to trust remote code when loading a model.trust_remote_codezWhich attention implementation to use; you can run --attn_implementation=flash_attention_2, in which case you must install this manually by running `pip install flash-attn --no-build-isolation`.attn_implementationzIWhether to use 8 bit precision for the base model - works only with LoRA.load_in_8bitzIWhether to use 4 bit precision for the base model - works only with LoRA.load_in_4bitnf4zQuantization type.fp4bnb_4bit_quant_typez#Whether to use nested quantization.use_bnb_nested_quant	localhostz$Interface the server will listen to.hosti@  zPort the server will listen to.porti,  z@Time in seconds after which a model will be removed from memory.model_timeoutr   z8Logging level as a string. Example: 'info' or 'warning'.	log_levelz1The default seed for torch, should be an integer.default_seedztWhether to enable CORS. Some apps that make requests from external domains (e.g. Cursor) require CORS to be enabled.enable_corsz+Whether to turn on strict input validation.input_validationzName of the model to be forced on all requests. This is useful for testing Apps that don't allow changing models in the request.force_modelc                 `    | j                   "| j                  dk(  r| j                   | _        yyy)z(Only used for BC `torch_dtype` argument.Nr   )r   r   r   s    rU   __post_init__zServeArguments.__post_init__  s1     'DJJ&,@))DJ -A'rT   )rM   rN   rO   rP   r   r   rQ   rR   r   r   r   r   r_   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   rS   rT   rU   r   r   [  s4     >
FC  "'WA
"K#  !PA
E8C=  $)] ^t  */ r
*#  efL$  efL$   %UFZhmotgu=vww!&uHm?n!o$o kV=c4deD#edf6W-XYD#Y\]M3  &*d!eIs  #(([\#L(3-  &
K  #B
d  "'2
"K# *rT   r   c                   D   e Zd Zedefd       ZdefdZdedddd	d
e	fdZ
defdZdefdZdefdZ	 	 	 	 	 	 d1dee   dee   dee   dee   dee   deed      defdZdddefdZd Z ej*                  d      deeeef      fd       Zdedeeddf   fd Zedd!defd"       Zed#efd$       Zdedeeddf   fd%Zdedeeddf   fd&Zdedeeddf   fd'Zdede fd(Z!ededed)   fd*       Z"d+edefd,Z#d-efd.Z$d-ede%d!e&f   fd/Z'd-ede%d!e(f   fd0Z)y)2r   parserc                 d    t         f}| j                  d|      }|j                  t               y)z
        Register this command to argparse so it's available for the transformer-cli

        Args:
            parser: Root parser to register command-specific arguments
        serve)dataclass_types)funcN)r   
add_parserset_defaultsr   )r   r   serve_parsers      rU   register_subcommandz ServeCommand.register_subcommand  s3     *+((/(R!!'<!=rT   r   c                    t         st        d      || _        | j                  j                  dk(  | _        | j                  j
                  | _        | j                  j                  )t        j                  | j                  j                         t        j                  d      }|j                  t        j                  | j                  j                  j                                   t        j                  d      }|j                  t        j                  | j                  j                  j                                   i | _        d | _        d | _        d | _        d | _        y )NzaMissing dependencies for the serving CLI. Please install with `pip install transformers[serving]`
sdpa_pagedtransformersz+transformers.generation.continuous_batching)serve_dependencies_availableImportErrorr   r   use_continuous_batchingr   r   r   r   r   
get_loggersetLevel
log_levelsr   lowerloaded_models#running_continuous_batching_managerlast_messageslast_kv_cache
last_model)r   r   transformers_logger	cb_loggers       rU   r   zServeCommand.__init__  s   +s 
 	'+yy'D'D'T$990099!!-dii445 &00@$$W%7%7		8K8K8Q8Q8S%TU&&'TU	7--dii.A.A.G.G.IJK 57X\0 "!rT   requestschema_TypedDictMeta	validatorrG   unused_fieldsc                 0   t         j                  d|        t        |j                               }|j                  }||z
  }|r(t         j                  d|        t        dd|       | j                  j                  rB	 |j                  |       ||z  }	|	r(t         j                  d|	        t        dd|	       yy# t        $ rF}t         j                  d|j                                 t        d|j                               d}~ww xY w)a  
        Validates the request against the schema, and checks for unexpected keys.

        Args:
            request (`dict`):
                The request to validate.
            schema (`_TypedDictMeta`):
                The schema of the request to validate. It is a `TypedDict` definition.
            validator (`TypeAdapter`):
                The validator to use to validate the request. Built from `schema`.
            unused_fields (`set`):
                Fields accepted by `schema`, but not used in `transformers serve`.

        Raises:
            HTTPException: If the request is invalid or contains unexpected or unused fields.
        zValidating request: z Unexpected keys in the request: i  )status_codedetailzValidation error: NzUnused fields in the request: )r   debugsetkeys__mutable_keys__errorr)   r   r   validate_pythonrH   errors)
r   r  r  r  r  
input_keyspossible_keysunexpected_keyseunused_fields_in_requests
             rU   _validate_requestzServeCommand._validate_request  s   . 	+G956 (
//$}4LL;O;LMNC:Z[jZk8lmm99%%H))'2 (2M'A$'=>V=WXY# #.LMeLf,g  ( & # H1!((*>?#AHHJGGHs   C 	DADDc                 F    | j                  |t        t        t               y N)r  r  r  r  )r$  rJ   response_validatorUNUSED_RESPONSE_FIELDSr   r  s     rU   validate_response_requestz&ServeCommand.validate_response_request!  s!    <(0	 	 	
rT   c                 F    | j                  |t        t        t               y r&  )r$  rY   completion_validatorUNUSED_CHAT_COMPLETION_FIELDSr)  s     rU    validate_chat_completion_requestz-ServeCommand.validate_chat_completion_request)  s!    >*7	 	 	
rT   c                 F    | j                  |t        t        t               y r&  )r$  r[   transcription_validatorUNUSED_TRANSCRIPTION_FIELDSr)  s     rU   validate_transcription_requestz+ServeCommand.validate_transcription_request1  s!    8-5	 	 	
rT   N
request_idcontentr   rolefinish_reason
tool_callsr3   r   c                     t        |t        t        j                               |t        t	        |||      d|      gdd      }d|j                  d	       d
S )a  
        Builds a chunk of a streaming OpenAI Chat Completion response.

        IMPORTANT: The serialized chunk won't contain empty fields (fields with `None`). Some downstream apps,
        like Cursor, assume that when the field exists, it has data.

        Args:
            request_id (`str`):
                The request ID.
            content (`str`, *optional*):
                Content of the response from the model.
            model (`str`, *optional*):
                The model that generated the content.
            role (`str`, *optional*):
                The role of the next content, until a new role is defined.
            finish_reason (`str`, *optional*):
                The reason the generation by the model has finished.
            tool_calls (`list[ChoiceDeltaToolCall]`, *optional*):
                Data about the tool calls, when they are triggered.

        Returns:
            `str`: The built chunk, a string containing a JSON string with the payload.
        )r4  r5  r7  r   )deltaindexr6  r   zchat.completion.chunk)idcreatedr   r   system_fingerprintobjectdata: Texclude_none

)r0   r   timer1   r2   model_dump_json)r   r3  r4  r   r5  r6  r7  chunks           rU   build_chat_completion_chunkz(ServeCommand.build_chat_completion_chunk9  sq    @ $		$% '!#-
 "/
  "*!
$ --4-@AFFrT   responserF   c                 .    d|j                  d       dS )a  
        Builds a event of a streaming OpenAI Response response.

        IMPORTANT: The serialized chunk won't contain empty fields (fields with `None`). Some downstream apps,
        like Cursor, assume that when the field exists, it has data.

        Args:
            response (`BaseModel`):
                The response to build an event from. One of the multiple OpenAI Response output types

        Returns:
            `str`: The built chunk, a string containing a JSON string with the payload.
        r?  Tr@  rB  )rD  )r   rG  s     rU   build_response_eventz!ServeCommand.build_response_eventm  s"     00d0CDDIIrT   c                     t               } j                  r3|j                  t        dgddgdg       t        j                  d       nt        j                  d       |j                  d      dt        f fd       }|j                  d	      dt        f fd
       }ddlm	} |j                  d      d|f fd       }|j                  d      |j                  d       fd              }t        j                  | j                  j                   j                  j                    j                  j"                         y )N*T)allow_originsallow_credentialsallow_methodsallow_headerszUCORS allow origin is set to `*`. This is not recommended for production environments.zaSome apps may require CORS. Consider launching the server with `--enable-cors` if you see errors.z/v1/chat/completionsr  c                     j                  |        j                  rj                  |       }nj                  |       }t	        |d      S Nr  text/event-stream
media_type)r.  r  #continuous_batching_chat_completiongenerate_chat_completionr,   r  outputr   s     rU   chat_completionz)ServeCommand.run.<locals>.chat_completion  sK    11'1B++AA'J66w?$V8KLLrT   z/v1/responsesc                 d    j                  |        j                  |       }t        |d      S rQ  )r*  generate_responser,   rX  s     rU   	responsesz#ServeCommand.run.<locals>.responses  s2    **7*;++G4F$V8KLLrT   r   )Requestz/v1/audio/transcriptionsc           
        K   | j                         4 d {   }t        |d   j                          d {   |d         }t        j	                  d|d   j
                   d|d   j                   d|d   j                  dz  dd	       d d d       d {    j                  
       j                  |      }t        |d      S 7 7 7 8# 1 d {  7  sw Y   HxY ww)Nr\   r   )r\   r   zReceived file: z; MIME type: z; size:    z.2fz KiBrR  rS  rT  )formr[   readr   r  filenamecontent_typesizer2  generate_transcriptionr,   )r  ra  parsed_requestrY  r   s       rU   audio_transcriptionsz.ServeCommand.run.<locals>.audio_transcriptions  s      ||~ 	 	!F#F|0022w-"
 %d6l&;&;%<M$v,JcJcId e!&\..5c:$@	 	 ///G00@F$V8KLL	2	 	 	 	sU   C+CC+CC
ACC+C5C+CC+C(CC($C+z
/v1/modelsc                  <    t        d j                         d      S )Nlist)r>  data)r+   get_gen_modelsr   s   rU   get_all_modelsz(ServeCommand.run.<locals>.get_all_models  s      64;N;N;P QRRrT   )r   r   r   )r(   r   add_middlewarer*   r   warning_oncepostdictfastapir^  optionsr   uvicornrunr   r   r   r   )r   apprZ  r]  r^  rh  rm  s   `      rU   ru  zServeCommand.run}  s;   i "e"&"e"e   g s 
(	)	MT 	M 
*	M 
/	"	Mt 	M 
#	M 	$	,	-	M 	M 
.	M" 
\	"			S 
 
#	S 	Cdiinn499>>TYYM`M`arT   )maxsizec           	         g d}t         rQ|D cg c]E  }|dt        j                  j                         j                         |j	                  d      d   dG c}S |D cg c]  }t        |       }}|D cg c]5  }|j                  d|j                  j                         |j                  d7 c}S c c}w c c}w c c}w )a.  
        This is by no means a limit to which models may be instantiated with `transformers serve`: any chat-based
        model working with generate can work.

        This is a limited list of models to ensure we have a discoverable /v1/models endpoint for third-party
        integrations.
        )zMenlo/Jan-nanozMenlo/Jan-nano-128kzQwen/Qwen2.5-0.5B-InstructzQwen/Qwen2.5-3B-InstructzQwen/Qwen2.5-7B-InstructzQwen/Qwen2.5-14B-Instructz meta-llama/Llama-3.1-8B-Instructz meta-llama/Llama-3.2-1B-Instructz!meta-llama/Llama-3.3-70B-InstructzHuggingFaceTB/SmolVLM-Instructz!ibm-granite/granite-vision-3.2-2bzQwen/Qwen2.5-VL-7B-Instructr   /r   )r;  r>  r<  owned_by)	r   datetimenow	timestampsplitr   r;  
created_atauthor)r   modelsr   model_infoss       rU   rl  zServeCommand.get_gen_models  s    
  $   %'00446@@B %C 0 3	  ;AA:e,AKA )   ((%$//99; %	  Bs   A
B5 B:8:B?r   c                    	  j                  d         		 j                  k7  }	 _        |r0 j                  $ j                  j                  dd       d _         j	                  	      \  }}t        |d      r|j                  n|}t        |j                  |j                  |j                  ddd	dd
d
       j                  K|j                  d       _        t                j                  _         j                  j                          |j                  d   dd      j!                  |j"                        }	 fd} ||d         S )a'  
        Generates an OpenAI Chat Completion using continuous batching.

        Args:
            req (`dict`): The request to generate an OpenAI Chat Completion for.

        Returns:
            `Generator[str, None, None]`: A generator that yields the OpenAI Chat Completion chunks.
        r   NTr   )blocktimeout	tokenizerFr   r`  
   fifo)	r   eos_token_idpad_token_id	use_cache
num_blocks
block_sizer   max_batch_tokens	scheduler)rK   	streamingmessagespt)return_tensorsadd_generation_promptc              3     K   	 	j                   j                  | j                  d      j                        }d}	j	                  |d       	j                   D ]  }|j
                  |k7  rj                  d      "|s |j                  t        j                  k(  rDd}|j                  t        j                  k(  rdnd }|j                  t        j                  k(  r	j	                  ||        y 	j	                  ||j                  	        y # t        $ r9}t        j                  t        |             d
t        |       d Y d }~y d }~ww xY ww)Nr3  )r3  r   F	assistantr5  r   Trn   r6  r   )r3  r4  r   data: {"error": ""})r
  add_requestr   r   rF  r3  statusr&   FINISHED
next_token	Exceptionr   r  rQ   )
_inputsr3  queue_is_flushedresultr6  r"  rK   model_id_and_revisionr   r   s
         rU   stream_chat_completionzPServeCommand.continuous_batching_chat_completion.<locals>.stream_chat_completion  s_    !7!EEQQ(=N_NnNn R 
 $)  66z[p6qq"FF F((J6 ww|,8AQ!==M,B,BB$/3,.4mm}?U?U.UF[_M}}(>(>>">>&mK` ?   ">>'16;L;LTi ?  !(  7SV$*3q6(#667s;   E$C6D ;E$<"D E$	E!(/EE$E!!E$r   )process_model_namer  r
  rn   load_model_and_processorr   r  r   rK   r  r  init_continuous_batchingr   logit_processorr   apply_chat_templatetor   )
r   r   must_discard_cacher   r   r  inputsr  rK   r  s
   ``      @@rU   rV  z0ServeCommand.continuous_batching_chat_completion  sc    !% 7 7G E2dooE/77C88==DRS=T;?8889NOy+29k+JI''PY	=$)$;$;"//"//
 33;7<7U7U"3t 8V 8D4 H[G\D44D44::< ..s:tko.pssLL
"	7H &fQi00rT   r$   c                     | j                   j                  }|t        j                         v rt        j
                  }|S |t        j                         v rt        j                  }|S t        d|       )NzUnknown modality: )		__class__rM   r   valuesr   r   r   r   
ValueError)r   model_classnamemodalitys      rU   get_model_modalityzServeCommand.get_model_modalityE  sm    //22HOOQQ||H   A H H JJ||H  1/1BCDDrT   r  c           	         g }| D ]  }|d   g d}|t         j                  k(  rmt        |d   t              r|d   }nMt        |d   t              r:g }|d   D ]  }|d   dk(  s|j                  |d          ! dj                  |      }|d<   n(|t         j                  k(  rt        |d   t              r|d   j                  d|d   d       n|d   D ]  }|d   dk(  r|d   j                  |        |d   dk(  s)d	|d   d
   v rt        j                  dd|d   d
         }t        j                  t        t        j                  |                  }t        j                   dd      }	|	j"                  }
|j%                  |	j"                         n|d   d
   }
|d   j                  d|
d        |j                  |        |S )Nr5  r5  r4  r4  typer`    )r  r`   	image_urlbase64urlz^data:image/.+;base64,r   z.pngF)suffixdeleteimage)r  r  )r   r   
isinstancerQ   rj  appendjoinr   resubr'   openr	   r  	b64decodetempfileNamedTemporaryFilenamesave)r  r  processor_inputsmessageparsed_messageparsed_contentr4  
image_datar  r\   r  s              rU   *get_processor_inputs_from_inbound_messagesz7ServeCommand.get_processor_inputs_from_inbound_messagesQ  s    '	4G&-fo"EN8<<' gi0#6%,Y%7N	 2D9%'N#*9#5 C"6?f4*11'&/BC &)XXn%=N,:y)X\\) gi0#6"9-44fgV_N`5ab#*9#5 \"6?f4*95<<WE$V_;'7;+?+FF-/VV4LbRYZeRfglRm-n
(-

76;K;KJ;W3X(Y'/'B'B&Y^'_&*ii %

499 5&-k&:5&A*95<<gVY=Z[\  ##N3O'	4P  rT   c                      j                   j                   j                   j                  |d<   |d   }|d   d   dk(  ry j                  |d          j                  k7  } _         j	                        \  } j                        } j                  ||      }dt        D ]/  }|j                  j                  d   j                         v s-| n |j                  |d|j                  d	      d
dd      }|j                  j                        }|j                  dd      d}	dj                  j                  d   j                         v rd}	t        ||	d      }
t!        |j"                        }d} j%                  |      r|s j&                  }i ||
|d|d fd} ||
      S )a  
        Generates an OpenAI Chat Completion using `generate`.

        Args:
            req (`dict`): The request to generate an OpenAI Chat Completion for.

        Returns:
            `Generator[str, None, None]`: A generator that yields the OpenAI Chat Completion chunks.
        Nr   r  r5  r  r   Ttoolsr  )r  r  r  return_dicttokenizer3  req_0gptossFskip_special_tokensskip_promptr   )streamerrK   return_dict_in_generatepast_key_valuesc              3     K   d}d }dj                   j                  d   j                         v rd}d}fd}t        |      }d}	 |j	                          t               }j                  d	
       | D ]2  }dj                   j                  d   j                         v r |j                  d      r|d t        d        }||z  }|r||v rd}[\|j                         t           d   k(  rd|_        |j                         t           d   k(  r(|j                          j                  |d d       |j                  r@|xj                  |z  c_        |j                  sYt        j                   d|j                        }	|	|	j#                  d      }	d|_        t%        t'        |	      dd|dz         }
n|dk(  rWd|j                  vrg|xj(                  |j+                  d      z  c_        |xj(                  |j+                  d      z  c_        |j(                  dk  r&dj-                  |j/                  d      d d       dz   }t%        t'        |      dd      }
j                  |d |
g       |dk7  sj                  ||       5 j                  |d       |j-                          |j-                          y # t0        $ r9}t2        j5                  t7        |             d t7        |       d! Y d }~Nd }~ww xY w# |j-                          w xY ww)"NFr  r   T<|channel|>final<|message|>c                  L     j                   di | }|j                  _        y NrS   generater  r  r   generate_outputr   r   s     rU   generate_with_cachezbServeCommand.generate_chat_completion.<locals>.stream_chat_completion.<locals>.generate_with_cache  $    "0%..":6":%4%D%D"rT   targetr   r   r  r  
<|return|>r   r   r7  )r3  r5  r6  r   z\"name\": \"(.*?)\"r   )r  function
_tool_call)r  r:  r  r;  z"arguments": {{})	arguments)r  r:  r  )r3  r5  r7  r   )r4  r   rn   r  r  r  )configarchitecturesr  r
   r   r   rF  endswithlenstrip_TOOL_CALL_TOKENSr   r   r   r   r  searchgroupr3   r4   r   countr  r~  r  r   r  rQ   )r  _request_id
filter_cotcot_trace_endr  threadresults
tool_stater  	tool_nametoolr"  generation_kwargsr   r  r3  r   tool_model_familys               rU   r  zEServeCommand.generate_chat_completion.<locals>.stream_chat_completion  s_     J M5<<55a8>>@@!
 =E #6?PQFGh&[
 66z[p6qq& WF5<<#=#=a#@#F#F#HH!??<8%+,@s</@.@%AFv%G "(G3).J$$ )4!<<>->?P-QRY-ZZ:>J7$ "<<>->?P-QRW-XX&,,."&"B"B+6%).:&;	 #C #  %%66&--7- $.#C#C,.II6LjN_N_,`	#,#4$,090BICG
 @':-Hi-X*+)3'2\'A	(" $*R<$, $4:;L;L#L$, !+ < <S@Q Q < * < <S@Q Q <#-#?#?!#C-/WWV\\#5Fs5K-Ls-RF':-HSY-Z*+)3(" #'"B"B+6Ttf\q #C #  % |">>'?T ?  kWp 66{RX`u6vv   7SV$*3q6(#667
 sC   AMIK* ?K* M*	L,3/L'"L/ 'L,,L/ /MM)r   r   r  r  r  r  r  _MODELS_WITH_TOOL_SUPPORTr  r  r  r  r   r  r   r   r   rK   is_continuationr  )r   r   r  r  r   r  r  supported_model_familiesr  r  generation_streamerrK   r  r  r  r   r  r3  r  s   `             @@@@@rU   rW  z%ServeCommand.generate_chat_completion  s    99  ,9900CL9<Z B<;. $ 7 7G E2dooE/889NOy**51JJ8U]^ !(A 	$'5<<+E+Ea+H+N+N+PP$<!	 .."&'''" / 
 5<<(WW\73
 #u||11!4::<<"'2 3

 >c[`[r[rs$-? ..M

+!2'+,
y	 y	v &&9:FFrT   c                    
  j                  d          j                  k7  } _         j                        \  }t        d   t              r'dv r	dd   dgng }|j                  dd   d       nt        d   t              r8dv r.d   d   d   dk7  rdd   dgd   }nYd   }d   |d   d	<   nHd   }nBt        d   t              r$dv r	dd   dgng }|j                  d          nt        d
      |j                  |dd      }|j                  j                        }j                  dd      d}dj                  j                  d   j                         v rd}t!        ||d      }t#        j$                        }d} j'                        r|s j(                  }|t+        j,                  |      ||d|d

 fd}	 |	|      S )a	  
        Generates an OpenAI Response using `generate`.

        Args:
            req (`dict`): The request to generate an OpenAI Response for.

        Returns:
            `Generator[str, None, None]`: A generator that yields the OpenAI Response events.
        r   inputinstructionssystemr  ra   r   r5  r4  z%inputs should be a list, dict, or strTr  )r  r  rl   r  r  Fr  r  N)r  attention_maskr  rK   r  r  c              3   L	  K   d}d }dj                   j                  d   j                         v rd}d}fd}t        |      }d}d}d}	 |j	                          t        j
                         }	t        d|t        d	 |	d
j                  d      dddiidg g j                  dd      dj                  d                  }
|dz  }j                  |
       t        d|t        d	 |	dj                  d      dddiidg g j                  dd      dj                  d                  }|dz  }j                  |       t        d||t        d dddg             }|dz  }j                  |       t        dd |||t        dd g !      "      }|dz  }j                  |       d }| D ]  }dj                   j                  d   j                         v r |j                  d#      r|d t!        d#        }||z  }|r
||v rd}d }\]t#        d$d ||||d d%d&g'      }|dz  }j                  |        t%        d(d ||d|d d%d&g)      }|dz  }j                  |       t'        d*d |||t        d|j(                  g !      "      }|dz  }|dz  }j                  |       t+        d+||t        d dd,d|j,                  gg -            }|dz  }|dz  }j                  |       t/        d.|t        d	 |	d,j                  d      dddii|j0                  gdg j                  dd      dj                  d      /            }|dz  }j                  |       |j3                          |j3                          y # t4        $ r}t6        j9                  d0t;        |              t=        d1|t;        |      2      }|dz  }j                  |       t?        d3|t        d	 	d4j                  d      dddiig dg ddj                  d      tA        d5t;        |      6      7            }|dz  }j                  |       Y d }~d }~ww xY w# |j3                          w xY ww)8NFr  r   Tr  c                  L     j                   di | }|j                  _        y r  r  r  s     rU   r  zTServeCommand.generate_response.<locals>.stream_response.<locals>.generate_with_cache  r  rT   r  zresponse.createdresp_queuedr	  formatr  r`   rG  r{   r   rq   )r;  r  r  r   r	  r`   r>  r  rY  r{   rh   rq   )r  sequence_numberrG  r   zresponse.in_progressin_progresszresponse.output_item.addedmsg_r  r  )r;  r  r  r5  r4  )r  r  output_indexitemzresponse.content_part.addedoutput_textr   )r  r`   annotations)r  item_idr  r  content_indexpartr  zresponse.output_text.deltagX@)tokenlogprob)r  r  r  r  r  r9  rp   zresponse.output_text.done)r  r  r  r  r  r`   rp   zresponse.content_part.donezresponse.output_item.done	completed)r;  r  r  r5  r4  r  zresponse.completed)r;  r  r  r   r	  r`   rY  r>  r  r{   rh   rq   z"Exception in response generation: r  )r  r  r  zresponse.failedfailedserver_error)coder  )r;  r  r  r   r	  r`   rY  r>  r  r{   rh   rq   r  )!r  r  r  r
   r   rC  r:   r6   r   rI  r>   r?   rA   r8   rB   r  r  rC   rD   r9   r`   r@   r  r7   r  r  r  r   r  rQ   r<   r=   r;   )r  r  r  r  r  r  r  r  r  r  response_createdresponse_in_progressresponse_output_item_addedresponse_content_part_addedr  r  response_output_text_deltaresponse_output_text_doneresponse_content_part_doneresponse_output_item_doneresponse_completedr"  error_eventresponse_failedr  r   r  r   r3  r   s                           rU   stream_responsez7ServeCommand.generate_response.<locals>.stream_response  s     J M5<<55a8>>@@!
 =E #6?PQFOLMN!YY[
 $8+$3%":,/#-'3%(WW^%<&(89) !,/GG4I5,Q$*!$!4$ $  1$//0@AA'>/$3%":,/#-,3%(WW^%<&(89) !,/GG4I5,Q$*!$!4($$  1$//0DEE .J5$3!-.!*.Y}[fpr	.*  1$//0JKK /L6":,/$3!-"/+RUWX/+  1$//0KLL & PF5<<#=#=a#@#F#F#HH!??<8%+,@s</@.@%AFv%G "(G3).J&(G$$1G9"&zl 3(7%1&3$,.4"@!A2. $q(O334NOO5P: -B4":,/$3!-"# (*t<=-)  1$//0IJJ .J5":,/$3!-"/+E^EcEcqst.*  1$"//0JKK -H4$3!-.!*.&*(!;!@!@ A$&	-)  1$!//0IJJ &<-$3%":,/#-*3%(WW^%<&(89 9 > >?) ,/GG4I5,Q$*!$!4&"$  1$//0BCCJ I  !AA#a&JK0 $3F
  1$//<<"5*$3%":,/#-'3%(WW^%<&(89!) ,1$*!$!4+!/$'F#,  1$//@@C!AH s>   AR$MN4 #R$4	R=CRR RR R!!R$)r  r  r  r  rQ   r  rj  rq  r  r  r  r   r   r  r  r  r   r   rK   r  r  r   	ones_like)r   r   r  r   r  r  r  rK   r  r,  r  r   r  r3  s   ``        @@@@rU   r\  zServeCommand.generate_responseG  s1    !% 7 7G E2dooE/889NOyc'lC(M[_bMbxC4GHIhjFMM6c'lCDGd+$w<?6*h6'/C<OP`SVW^S_`F \F+.~+>F1Ii(WGd+M[_bMbxC4GHIhjFMM#g,'DEE..vTbf.g5<<(WW3W=
 #u||11!4::<<"'2 3

 >c[`[r[rs$-? ..M #oof5+!2'+,
a	 a	F 2J??rT   c                 (  
 t               st        d      | j                  |d         }| j                  |      \  t	        j
                  dd      }t        |j                        }j                  j                  }t        j                  |d         }t        j                  ||d      \  }} ||d	      j                  j                        

d
   j                  j                         
d
<   ||dd
fd}	 |	       S )a  
        Generates an OpenAI Transcription using the audio file.

        Args:
            req (`dict`): The request containing the audio file and model information.

        Returns:
            `Generator[str, None, None]`: A generator that yields the transcription result.
        z]Missing librosa dependency for audio transcription. Please install with `pip install librosa`r   Tr  r  r\   )srmonor  )sampling_rater  input_features)r  rK   r  c               3      K    j                   di } j                  | j                  d      d   }t        |      }|j	                  d        y w)NT)r  r   )r`   r@  rS   )r  batch_decode	sequencesr-   rD  )generated_idstranscription_texttranscriptionaudio_inputsaudio_modelaudio_processorr  s      rU   _generate_transcriptionzDServeCommand.generate_transcription.<locals>._generate_transcription  sg     0K00U<UCTUM!0!=!=m>U>Uko!=!pqr!s)/ABM"222EFGs   AA)r   r  r  load_audio_model_and_processorr   r  r   rK   feature_extractorr1  ior	   librosaloadr  r   r   )r   r   r  r  rK   model_sampling_rateaudio_bytesaudio_array_r<  r9  r:  r;  r  s             @@@@rU   rf  z#ServeCommand.generate_transcriptionk  s"    $%o  !% 7 7G E'+'J'JK`'a$_2%%4T
 >)F)F

 .??MMjjV- k6IPTUQ&{BUfjknn
 *66F)G)J)J;K\K\)]%& ,!2'+
	H '((rT   c                 >   |j                  d      xs |j                  d      }d}| j                  d}n`t        | j                        t        |      k\  rd}n<t        t        | j                              D ]  }| j                  |   ||   k7  sd} n || _        |S )aD  
        Determines whether the current request is a continuation of the last request. In other words, if it is the
        same chat session.

        Args:
            req (`dict`): The request to check.

        Returns:
            `True` if the request is a continuation of the last request, `False` otherwise.
        r  r  TF)r   r  r  range)r   r   r  req_continues_last_messagesis        rU   r  zServeCommand.is_continuation  s     77:&:#'''*:&*# %*/'##$H5*/' 3t1123 %%a(HQK727/
 &**rT   r"   c                     | j                   r:t        d| j                  | j                  | j                  | j                        }|S | j
                  rt        d      }|S d}|S )a  
        Returns the quantization config for the given CLI arguments.

        Args:
            args (`ServeArguments`): The serve arguments. May contain quantization settings, device, etc.

        Returns:
            `Optional[BitsAndBytesConfig]`: The quantization config.
        T)r   bnb_4bit_compute_dtyper   bnb_4bit_use_double_quantbnb_4bit_quant_storage)r   N)r   r"   r   r   r   r   )r   quantization_configs     rU   get_quantization_configz$ServeCommand.get_quantization_config  sr     "4!'+zz$($<$<*.*C*C'+zz# #" "4!# #" #'""rT   model_idc                 p    | j                   j                  | j                   j                  }d|v r|S | dS )aR  
        Applies the `force_model` CLI argument and canonicalizes the model name to the format "model_id@revision".
        If the model_id DOESN'T contain an @, it defaults to "model_id@main".

        Args:
            model_id (`str`): The model ID.

        Returns:
            `str`: The canonicalized model name to be used
        @z@main)r   r   )r   rP  s     rU   r  zServeCommand.process_model_name  s<     99  ,yy,,H(?O5!!rT   r  c                    | j                   }t        j                  d|        d|v r|j                  dd      \  }}n|d}}t	        j
                  |||j                        }|j                  dv r|j                  nt        t        |j                        }| j                  |      }||j                  |d|j                  d}|||d
<   t        j
                  |fi |}	t        t        |	j                  d         }
 |
j
                  |fi |}t        |dd	      |j                  |j                         }|j"                  j$                  d	u xr |j"                  j&                  dk(  }|j"                  j$                  d	uxr |j"                  j$                  dk  }|s|rd|j"                  _        t        j                  d|        ||fS )a  
        Generic method to load a model and a data processor from a model ID and revision, making use of the serve CLI
        arguments.

        Args:
            model_id_and_revision (`str`):
                The model ID and revision to load.
            model_cls (`type[PreTrainedModel]`):
                The model class to load.

        Returns:
            `tuple[PreTrainedModel, Union[ProcessorMixin, PreTrainedTokenizerFast]]`: The loaded model and
            data processor (tokenizer, audio processor, etc.).
        zLoading rR  r   main)revisionr   )r   Nr   )rU  r   r   
device_mapr   NrN  r   hf_device_map   r`  zLoaded model )r   r   r   r~  r!   from_pretrainedr   r   getattrr   rO  r   r   r  r  r  r   rK   r   
max_length)r   r  r   rP  rU  data_processorr   rN  model_kwargsr  architecturer   has_default_max_lengthhas_short_max_new_tokenss                 rU   _load_model_and_data_processorz+ServeCommand._load_model_and_data_processor  s    yyh4567''!6!<!<S!!DHh!6hH&66"44
 #jjN:

tzz@Z"::4@ !#'#;#; !%!7!7
 *2EL./++HEE|V-A-A!-DE,,,XFF5/408HHT[[)E ##22d:gu?V?V?a?aeg?g 	 ##22$>p5CZCZCiCilpCp 	! "%=59E##2m$9#:;<n$$rT   c                    || j                   vs| j                   |   j                         rG| j                  |      \  }}t        || j                  j
                  |      | j                   |<   ||fS | j                   |   j                          | j                   |   j                  }| j                   |   j                  }||fS )a\  
        Loads the text model and processor from the given model ID and revision into the ServeCommand instance.

        Args:
            model_id_and_revision (`str`):
                The model ID and revision to load.

        Returns:
            `tuple[PreTrainedModel, PreTrainedTokenizerFast]`: The loaded text model and processor.
        r   r   	r	  r   ra  r   r   r   r   r   r   )r   r  r   r   s       rU   r  z%ServeCommand.load_model_and_processor$  s     !(:(::d>P>PQf>g>r>r>t#BBCXYE98B $		 7 7#9D45 i	 45AAC&&'<=CCE**+@AKKIirT   c                    || j                   vs| j                   |   j                         rG| j                  |      \  }}t        || j                  j
                  |      | j                   |<   ||fS | j                   |   j                          | j                   |   j                  }| j                   |   j                  }||fS )aU  
        Loads the audio model and processor from the given model ID and revision into the ServeCommand instance.

        Args:
            model_id_and_revision (`str`):
                The model ID and revision to load.

        Returns:
            `tuple[PreTrainedModel, ProcessorMixin]`: The loaded audio model and processor.
        rc  rd  )r   r  r:  r;  s       rU   r=  z+ServeCommand.load_audio_model_and_processor?  s     !(:(::d>P>PQf>g>r>r>t+/+N+NOd+e(K8B $		 7 7)9D45 O++	 45AAC,,-BCIIK"001FGQQOO++rT   )r   NNNNN)*rM   rN   rO   staticmethodr   r   r   r   rq  r  r$  r*  r.  r2  r   rQ   rj  rF  rI  ru  	functools	lru_cacheanyrl  r   rV  r   r  r  rW  r\  rf  r_   r  rO  r  ra  tupler   r  r   r=  rS   rT   rU   r   r     s   	>N 	> 	>^ @// !/ !	/
 /b
 

 

d 
 %'!%#"'+<@2GSM2G #2G }	2G
 sm2G  }2G T"7892G 
2GhJ[ JS J ?bB Y&+T#s(^ 4 + '+ZW1t W1	#tUY/@Z W1r 	"3 	 	 	 + x +  + ZFGD FGYsD$5O FGPb@T b@iT4.H b@H	.)$ .)9S$_3M .)`+4 +D +< #n #BV9W # #8"3 "3 "";%C ;%z %( 	 "99	: 6,C ,ERcesRsLt ,rT   r   __main__)r  r   r{  enumrg  r   r?  r   r  r  r   rC  argparser   r   collections.abcr   r   dataclassesr   r   r	   r
   typingr   r   huggingface_hubr   huggingface_hub.constantsr   r  &transformers.models.auto.modeling_autor   r   transformers.utils.import_utilsr   r   r   r   r   r   r   r   r   r   r   r   utilsr   r   r    r   r!   r"   r#   r$   generation.continuous_batchingr%   r&   r@  PILr'   r  rt  rr  r(   r)   fastapi.middleware.corsr*   fastapi.responsesr+   r,    openai.types.audio.transcriptionr-   .openai.types.audio.transcription_create_paramsr.   openai.types.chatr/   'openai.types.chat.chat_completion_chunkr0   r1   r2   r3   r4   *openai.types.chat.completion_create_paramsr5   openai.types.responsesr6   r7   r8   r9   r:   r;   r<   r=   r>   r?   r@   rA   rB   rC   rD   -openai.types.responses.response_create_paramsrE   pydanticrF   rG   rH   rJ   rY   r[   r'  r,  r0  r(  r-  r1  r  rM   r   r  rj  r  r  Enumr   r   rq  r   r   r   r   r   r   ru  rS   rT   rU   <module>r     s[        	 	  	    . / (   " & 4    0 (   Z k 4 6k;O;QkViVk   .6A>\<  [    " \@@4QY^ 6U]b '0MUZ ' %%NO&'RS)*OP %!.# 
		H	%
   !!2!7!7!9: tyy 	 8	8/8 	8v ,@ ,@^ d* d* d*NS,- S,l$ zNE	IIK rT   