
    h                         d dl mZmZ d dlZddlmZ ddlmZm	Z	 ddl
mZmZmZmZmZ ddlmZmZ  G d d	e      Z G d
 ded      Z G d de      ZdgZy)    )OptionalUnionN   )BatchFeature)
ImageInputmake_nested_list_of_images)AudioKwargsImagesKwargsProcessingKwargsProcessorMixinUnpack)PreTokenizedInput	TextInputc                       e Zd ZU ee   ed<   y)Gemma3nImagesKwargsdo_convert_rgbN)__name__
__module____qualname__r   bool__annotations__     l/var/www/html/eduruby.in/venv/lib/python3.12/site-packages/transformers/models/gemma3n/processing_gemma3n.pyr   r      s    TN"r   r   c                   .    e Zd ZU eed<   eed<   dddiiZy)Gemma3nProcessorKwargsaudio_kwargsimages_kwargstext_kwargspaddingFN)r   r   r   r	   r   r   	_defaultsr   r   r   r   r      s"    &&u
Ir   r   F)totalc                        e Zd ZdZg dZdZdZdZ	 	 	 ddedef fdZ		 	 	 	 dd	e
d
eeeee   ee   f   deeej"                  ee   eej"                     eee      f      dee   def
dZ xZS )Gemma3nProcessorat  
    A processor for Gemma 3n, wrapping the full capabilities of a feature extractor, image processor, and tokenizer
    into a single processor.

    Args:
        feature_extractor (`Gemma3nAudioFeatureExtractor`):
            Feature extractor that converts raw audio waveforms into MEL spectrograms for the audio encoder. This
            should return a `BatchFeature` with `input_features` and `input_features_mask` features.
        image_processor (`SiglipImageProcessorFast`):
            Image processor that prepares batches of images for the vision encoder. This should return a `BatchFeature`
            with a `pixel_values` feature.
        tokenizer (`GemmaTokenizerFast`):
            The text tokenizer for the model.
        chat_template (`string`, *optional*):
            A Jinja template for generating text prompts from a set of messages.
        audio_seq_length (int, *optional*, defaults to 188):
            The number of audio soft tokens that will be added to the text prompt
        image_seq_length (int, *optional*, defaults to 256):
            The number of image soft tokens that should be added to
    )feature_extractorimage_processor	tokenizerAutoFeatureExtractorAutoImageProcessorAutoTokenizeraudio_seq_lengthimage_seq_lengthc                    || _         |j                  | _        |j                  | _        |j                  | _        dj	                  |j                  g|z        }d|j                   | |j
                   d| _        || _        |j                  | _        |j                  | _	        |j                  | _
        dj	                  |j                  g|z        }	d|j                   |	 |j                   d| _        t        
| 8  d||||d| y )N z

)r%   r&   r'   chat_templater   )r+   audio_token_id	boa_tokenaudio_tokenjoin	eoa_tokenfull_audio_sequencer,   image_token_id	boi_tokenimage_token	eoi_tokenfull_image_sequencesuper__init__)selfr%   r&   r'   r/   r+   r,   kwargsaudio_tokens_expandedimage_tokens_expanded	__class__s             r   r<   zGemma3nProcessor.__init__C   s!    !1'66",,$00 ")>)>(?BR(R S%))*=*=)>?T>UV_ViViUjjn#o  0'66",,$00 ")>)>(?BR(R S%))*=*=)>?T>UV_ViViUjjn#o  	
/+'		

 	
r   imagestextaudior>   returnc           	         |||t        d       | j                  t        fd| j                  j                  i|}t        |t              r|g}n.t        |t              st        |d   t              st        d      |e | j                  |fi |d   }|s|D cg c]  }| j                   }}|D 	cg c](  }	|	j                  | j                  | j                        * }}	ni }|| j                  j                  |      }t        |      }
 | j                  |
fi |d   }|s5|
D cg c]*  }dj                  | j                   gt#        |      z        , }}t#        |
      t#        |      k7  r$t        dt#        |
       d	t#        |       d
      |D 	cg c](  }	|	j                  | j                   | j$                        * }}	ni }|d   j'                  dd       } | j                  dd|i|d   ddi}| j)                  ||dg       |d   }t+        j,                  |      }d||| j.                  k(  <   d||| j0                  k(  <   |j3                         D ci c]  \  }}||j5                          }}}|j5                         |d<   t7        i ||||      S c c}w c c}	w c c}w c c}	w c c}}w )Nz5Provide at least one of `text`, `images`, or `audio`.tokenizer_init_kwargsr   zAInvalid input text. Please provide a string, or a list of stringsr   r    z1Received inconsistently sized batches of images (z) and text (z).r   return_tensorsrC   npimage)
modalities	input_ids   r   token_type_ids)datatensor_typer   )
ValueError_merge_kwargsr   r'   init_kwargs
isinstancestrlistr%   r2   replacer5   r&   fetch_imagesr   r3   r8   lenr:   pop_check_special_mm_tokensrJ   
zeros_liker6   r0   itemstolistr   )r=   rB   rC   rD   videosr>   output_kwargsaudio_inputs_promptbatched_imagesimage_inputsrI   text_inputs	array_idsrO   kvs                     r   __call__zGemma3nProcessor.__call__c   s    <FNu}TUU***"
"&.."<"<
 
 dC 6DD$'
47C0H`aa1411%Y=;XYL278Q((88 ^bbSYFNN4#3#3T5M5MNbDbL))66v>F7?N/4//a-P_B`aL Q_`v$"2"2!3c&k!AB``>"c$i/ GNH[G\\hilmqirhssuv 
 ^bbSYFNN4#3#3T5M5MNbDbL&}599:JDQ$dnnd$d-2Nd_cd%%dKWI%N  ,	y1;<yD$7$778;<yD$7$7781<1B1B1DEAq!((*}EE(6(=(=(?$%!PK!P<!P<!P^lmmK 9 c a c Fs   J*8-J/./J4-J9&J>)N      )NNNN)r   r   r   __doc__
attributesfeature_extractor_classimage_processor_classtokenizer_classintr<   r   r   r   r   rW   r   rJ   ndarrayfloatr   r   r   rk   __classcell__)rA   s   @r   r$   r$   (   s    * GJ40%O  # #
 
 
D "^b_c?n?n I0$y/4HYCZZ[?n bjj$u+tBJJ7GdSXkIZZ[\	?n /0?n 
?nr   r$   )typingr   r   numpyrJ   feature_extraction_utilsr   image_utilsr   r   processing_utilsr	   r
   r   r   r   tokenization_utils_baser   r   r   r   r$   __all__r   r   r   <module>r~      sR     #  4 A c c C#, #-U zn~ znz 
r   