
    h+                     N   d dl Z d dlmZ d dlmZmZmZ d dlZd dlm	Z	 d dl
m	c mZ ddlmZ ddlmZmZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZmZ ddlmZmZ ddl m!Z!m"Z" ddl#m$Z$ ddl%m&Z&m'Z'm(Z( ddl)m*Z* ddl+m,Z, ddl-m.Z.m/Z/m0Z0 d Z1d_dZ2dejf                  de4dejf                  fdZ5	 d`de	jl                  dejf                  dejf                  dejf                  deejf                     de7d e7d!e$e&   fd"Z8 G d# d$e	jl                        Z9 ed%       G d& d'e	jl                               Z: G d( d)e	jl                        Z; G d* d+e      Z< G d, d-e	jl                        Z= G d. d/e	jl                        Z> G d0 d1e	jl                        Z? G d2 d3e	jl                        Z@ G d4 d5e	jl                        ZA G d6 d7e	jl                        ZB G d8 d9e	jl                        ZC G d: d;e	jl                        ZD G d< d=e	jl                        ZE G d> d?e	jl                        ZF G d@ dAe	j                        ZH G dB dCe	jl                        ZI G dD dEe	jl                        ZJ G dF dGe	jl                        ZK G dH dIe	jl                        ZL G dJ dKe	jl                        ZM e'dLM       G dN dOe"             ZN G dP dQ      ZOe' G dR dSe"             ZP G dT dUe	jl                        ZQe' G dV dWeP             ZRe' G dX dYePe             ZS G dZ d[eP      ZT G d\ d]ePe      ZUg d^ZVy)a    N)cached_property)CallableOptionalUnion   )ACT2FN)CacheDynamicCache)GenerationMixin)use_kernel_forward_from_hub)create_causal_mask)GradientCheckpointingLayer)BaseModelOutputWithPastCausalLMOutputWithPast)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tuple)deprecate_kwarg)check_model_inputs   )
Emu3ConfigEmu3TextConfigEmu3VQVAEConfigc                     | dd| j                   d   dz  f   }| d| j                   d   dz  df   }t        j                  | |fd      S )z*Rotates half the hidden dims of the input..N   dim)shapetorchcat)xx1x2s      d/var/www/html/eduruby.in/venv/lib/python3.12/site-packages/transformers/models/emu3/modeling_emu3.pyrotate_halfr+   /   sZ    	
3"!''"+"""	#B	
3q ""	#B99rc2YB''    c                     |j                  |      }|j                  |      }| |z  t        |       |z  z   }||z  t        |      |z  z   }||fS )a  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    )	unsqueezer+   )qkcossinposition_idsunsqueeze_dimq_embedk_embeds           r*   apply_rotary_pos_embr7   6   sY    ( --
&C
--
&C3w;q>C/0G3w;q>C/0GGr,   hidden_statesn_repreturnc                     | j                   \  }}}}|dk(  r| S | dddddddddf   j                  |||||      } | j                  |||z  ||      S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    r   N)r$   expandreshape)r8   r9   batchnum_key_value_headsslenhead_dims         r*   	repeat_kvrB   Q   so    
 2?1D1D.Ehz!!Qa"23::5BUW\^bdlmM  (;e(CT8TTr,   modulequerykeyvalueattention_maskscalingdropoutkwargsc                 T   t        || j                        }t        || j                        }	t        j                  ||j	                  dd            |z  }
|#|d d d d d d d |j
                  d   f   }|
|z   }
t        j                  j                  |
dt        j                        j                  |j                        }
t        j                  j                  |
|| j                        }
t        j                  |
|	      }|j	                  dd      j                         }||
fS )Nr!   r   r    )r#   dtype)ptrainingr   )rB   num_key_value_groupsr%   matmul	transposer$   nn
functionalsoftmaxfloat32torM   rI   rO   
contiguous)rC   rD   rE   rF   rG   rH   rI   rJ   
key_statesvalue_statesattn_weightscausal_maskattn_outputs                r*   eager_attention_forwardr^   ]   s    3 ; ;<JUF$?$?@L<<z';';Aq'ABWLL!$Q1.D
0@0@0D.D%DE#k1==((2U]](SVVW\WbWbcL==((6??([L,,|\:K''1-88:K$$r,   c                   *    e Zd ZdZdedef fdZ eddd      	 	 dd	ej                  d
e
ej                  ej                  f   deej                     dee   deej                     dee   de
ej                  ej                  f   fd       Z xZS )Emu3Attention=Multi-headed attention from 'Attention Is All You Need' paperconfig	layer_idxc                 d   t         |           || _        || _        t	        |d|j
                  |j                  z        | _        |j                  |j                  z  | _	        | j                  dz  | _
        |j                  | _        d| _        t        j                  |j
                  |j                  | j                  z  |j                        | _        t        j                  |j
                  |j                  | j                  z  |j                        | _        t        j                  |j
                  |j                  | j                  z  |j                        | _        t        j                  |j                  | j                  z  |j
                  |j                        | _        y )NrA         Tbias)super__init__rb   rc   getattrhidden_sizenum_attention_headsrA   r?   rP   rH   attention_dropout	is_causalrS   Linearattention_biasq_projk_projv_projo_projselfrb   rc   	__class__s      r*   ri   zEmu3Attention.__init__z   sM   "
F4F4F&JdJd4de$*$>$>&B\B\$\!}}d*!'!9!9ii : :T]] JQWQfQf
 ii : :T]] JQWQfQf
 ii : :T]] JQWQfQf
 ii&&68J8JQWQfQf
r,   past_key_valuepast_key_values4.58new_nameversionr8   position_embeddingsrG   cache_positionrJ   r:   c                 4   |j                   d d }g |d| j                  }| j                  |      j                  |      j	                  dd      }	| j                  |      j                  |      j	                  dd      }
| j                  |      j                  |      j	                  dd      }|\  }}t        |	|
||      \  }	}
|'|||d}|j                  |
|| j                  |      \  }
}t        }| j                  j                  dk7  rt        | j                  j                     } || |	|
||f| j                  sdn| j                  | j                   d|\  }} |j"                  g |d j%                         }| j'                  |      }||fS )Nr    r   r!   )r2   r1   r   eager        )rI   rH   )r$   rA   rq   viewrR   rr   rs   r7   updaterc   r^   rb   _attn_implementationr   rO   rm   rH   r=   rX   rt   )rv   r8   r~   rG   ry   r   rJ   input_shapehidden_shapequery_statesrY   rZ   r1   r2   cache_kwargsattention_interfacer]   r[   s                     r*   forwardzEmu3Attention.forward   s    $))#2.88b8$--8{{=166|DNNqRST[[/44\BLLQPQR
{{=166|DNNqRST&S#7jRUWZ#[ j&#&snUL'6'='=j,X\XfXfht'u$J(?;;++w6"9$++:Z:Z"[$7	%
  $}}C$2H2HLL	%
 	%
!\ *k));;;;FFHkk+.L((r,   NN)__name__
__module____qualname____doc__r   intri   r   r%   Tensortupler   r	   
LongTensorr   r   r   __classcell__rw   s   @r*   r`   r`   w   s    G
z 
c 
. %0A6R ,059))||)) #5<<#=>)) !.	))
 "%)) !!1!12)) +,)) 
u||U\\)	*)) S))r,   r`   RMSNormc                   ,     e Zd Zd fd	Zd Zd Z xZS )Emu3RMSNormc                     t         |           t        j                  t	        j
                  |            | _        || _        y)z:
        Emu3RMSNorm is equivalent to T5LayerNorm
        N)rh   ri   rS   	Parameterr%   onesweightvariance_epsilon)rv   rk   epsrw   s      r*   ri   zEmu3RMSNorm.__init__   s1     	ll5::k#:; #r,   c                 "   |j                   }|j                  t        j                        }|j	                  d      j                  dd      }|t        j                  || j                  z         z  }| j                  |j                  |      z  S )Nr!   r    T)keepdim)	rM   rW   r%   rV   powmeanrsqrtr   r   )rv   r8   input_dtypevariances       r*   r   zEmu3RMSNorm.forward   sy    #))%((7 $$Q',,R,>%Ht?T?T4T(UU{{]--k:::r,   c                 ^    t        | j                  j                         d| j                   S )Nz, eps=)r   r   r$   r   rv   s    r*   
extra_reprzEmu3RMSNorm.extra_repr   s*    ))*+6$2G2G1HIIr,   )ư>)r   r   r   ri   r   r   r   r   s   @r*   r   r      s    $;Jr,   r   c                   $     e Zd Z fdZd Z xZS )Emu3MLPc                    t         |           || _        |j                  | _        |j                  | _        t        j                  | j                  | j                  |j                        | _        t        j                  | j                  | j                  |j                        | _	        t        j                  | j                  | j                  |j                        | _
        t        |j                     | _        y )Nrf   )rh   ri   rb   rk   intermediate_sizerS   ro   mlp_bias	gate_projup_proj	down_projr   
hidden_actact_fnrv   rb   rw   s     r*   ri   zEmu3MLP.__init__   s    !--!'!9!94#3#3T5K5KRXRaRabyy!1!143I3IPVP_P_`4#9#94;K;KRXRaRabV../r,   c                     | j                  | j                  | j                  |            | j                  |      z        }|S N)r   r   r   r   )rv   r'   r   s      r*   r   zEmu3MLP.forward   s6    NN4;;t~~a/@#ADLLQRO#ST	r,   r   r   r   ri   r   r   r   s   @r*   r   r      s    0r,   r   c                   >    e Zd Zdedef fdZ eddd      	 	 	 	 	 	 ddej                  d	e	ej                     d
e	ej                     de	e   de	e   de	ej                     de	eej                  ej                  f      dee   dej                  fd       Z xZS )Emu3DecoderLayerrb   rc   c                 h   t         |           |j                  | _        t        ||      | _        t        |      | _        t        |j                  |j                        | _	        t        |j                  |j                        | _
        t        j                  |j                        | _        y )N)rb   rc   r   )rh   ri   rk   r`   	self_attnr   mlpr   rms_norm_epsinput_layernormpost_attention_layernormrS   Dropoutrm   rI   ru   s      r*   ri   zEmu3DecoderLayer.__init__   s    !--&f	J6?*6+=+=6CVCVW(3F4F4FFL_L_(`%zz&":":;r,   rx   ry   rz   r{   r8   rG   r3   	use_cacher   r~   rJ   r:   c                     |}	| j                  |      } | j                  d|||||||d|\  }}
|	| j                  |      z   }|}	| j                  |      }| j	                  |      }|	| j                  |      z   }|S )N)r8   rG   r3   ry   r   r   r~    )r   r   rI   r   r   )rv   r8   rG   r3   ry   r   r   r~   rJ   residual_s              r*   r   zEmu3DecoderLayer.forward   s     !,,];)4>> 	
')%+) 3	
 	
q !4<<#>> 55mD/ 4<<#>>r,   )NNNFNN)r   r   r   r   r   ri   r   r%   r   r   r   r	   boolr   r   r   r   r   r   s   @r*   r   r      s    	<z 	<c 	< %0A6R 2637+/$)59KO|| !. u//0	
 "% D> !!1!12 &eELL%,,,F&GH +, 
 Sr,   r   c                   H     e Zd ZdZdef fdZdej                  fdZ xZ	S )Emu3VQVAEVectorQuantizera  
    A module for vector quantization using learned embedding vectors.

    This module implements the quantization process similar to te one described in
    the VQ-VAE (Vector Quantized Variational AutoEncoder) paper. It quantizes continuous
    input vectors into discrete codebook vectors, which are learned during training.
    Current implementation improves over previous ones by avoiding costly matrix multiplications
    and allowing for post-hoc remapping of indices.
    rb   c                    t         |           t        j                  |j                  |j
                        | _        | j                  j                  j                  j                  d|j                  z  d|j                  z         y )Ng            ?)
rh   ri   rS   	Embeddingcodebook_size	embed_dim	embeddingr   datauniform_r   s     r*   ri   z!Emu3VQVAEVectorQuantizer.__init__  sb    f&:&:F<L<LM""++D63G3G,GvOcOcIcdr,   hidden_statec                    |j                   \  }}}}}|j                  ddddd      j                         }|j                  d|      }t	        j
                  |dz  dd      }t	        j
                  | j                  j                  dz  d	      }	dt	        j                  || j                  j                  j                  dd            z  }
||	z   |
z
  }
t	        j                  |
d	      }|j                  ||||      }|S )
Nr   r   r      r!   r    T)r#   r   r"   )r$   permuterX   r   r%   sumr   r   rQ   rR   argmin)rv   r   
batch_sizetemporalchannelsheightwidthhidden_state_flattenedhidden_state_sumembedding_sum	distancesmin_encoding_indicess               r*   r   z Emu3VQVAEVectorQuantizer.forward!  s    8D8J8J5
Hh#++Aq!Q:EEG!-!2!22x!@ !99%;Q%>AtT		$.."7"7":B %;T^^=R=R=\=\]^`a=bcc	$}4y@	$||I1=388XvW\]##r,   )
r   r   r   r   r   ri   r%   r   r   r   r   s   @r*   r   r     s&    e e
$ELL $r,   r   c                   $     e Zd Z fdZd Z xZS )Emu3VQVAEEncoderConvDownsamplec                 `    t         |           t        j                  ||ddd      | _        y )Nr   r!   r   kernel_sizestridepaddingrh   ri   rS   Conv2dconvrv   in_channelsrw   s     r*   ri   z'Emu3VQVAEEncoderConvDownsample.__init__4  '    IIk;AaYZ[	r,   c                 Z    t        j                  |ddd      }| j                  |      }|S )N)r   r   r   r   constantr   )padmoderF   )Fr   r   rv   r8   s     r*   r   z&Emu3VQVAEEncoderConvDownsample.forward8  s+    mJVWX		-0r,   r   r   s   @r*   r   r   3  s    \r,   r   c                   $     e Zd Z fdZd Z xZS )Emu3VQVAEEncoderConvUpsamplec                 `    t         |           t        j                  ||ddd      | _        y )Nr   r   r   r   r   s     r*   ri   z%Emu3VQVAEEncoderConvUpsample.__init__@  r   r,   c                 X    t        j                  |dd      }| j                  |      }|S )N       @nearestscale_factorr   )r   interpolater   r   s     r*   r   z$Emu3VQVAEEncoderConvUpsample.forwardD  s(    m#IV		-0r,   r   r   s   @r*   r   r   ?  s    \r,   r   c            	       \     e Zd Zdededee   dee   f fdZdej                  fdZ xZ	S )Emu3VQVAEConv3d
in_channelout_channelr   r   c                 P   t         	|           t        |dd  |dd        D cg c]
  \  }}||z
   }}}d| _        |d d d   D ]%  }| xj                  |dz  |dz  z   |dz  fz  c_        ' | xj                  dz  c_        t	        j
                  ||||      | _        y c c}}w )Nr   r   r    r!   )r!   r   )r   )rh   ri   zipr   rS   Conv3dr   )
rv   r  r  r   r   
one_kernel
one_stridepadding_sizespad_sizerw   s
            r*   ri   zEmu3VQVAEConv3d.__init__K  s     	ORS^_`_aSbdjklkmdnOop5KZj0pp%dd+ 	JHLLX]X\98q=IIL	JII	
	 qs   B"r8   c                 h    t        j                  || j                        }| j                  |      }|S r   )r   r   r   r   r   s     r*   r   zEmu3VQVAEConv3d.forwarda  s*    mT\\:		-0r,   )
r   r   r   r   r   ri   r%   r   r   r   r   s   @r*   r   r   J  sF    

 
 3Z	

 c

,U\\ r,   r   c                   `     e Zd Zdedef fdZdej                  dej                  fdZ xZS )Emu3VQVAESpatialNormr   out_channelsc                     t         |           t        j                  |ddd      | _        t        j
                  ||ddd      | _        t        j
                  ||ddd      | _        y )N    r   Tnum_channels
num_groupsr   affiner   r   r   )rh   ri   rS   	GroupNorm
norm_layerr   conv_yconv_brv   r   r  rw   s      r*   ri   zEmu3VQVAESpatialNorm.__init__h  sn    
 	,,%	
 ii
 ii
r,   r8   quant_statesc                     t        j                  ||j                  dd  d      }| j                  |      }|| j	                  |      z  | j                  |      z   }|S )NrL   r   )sizer   )r   r   r$   r  r  r  )rv   r8   r  s      r*   r   zEmu3VQVAESpatialNorm.forward  sX    }}\8K8KBC8PW`a6%L(AADKKP\D]]r,   	r   r   r   r   ri   r%   r   r   r   r   s   @r*   r  r  g  s5    

 
8U\\  r,   r  c                   H     e Zd Zdedef fdZdej                  fdZ xZS )Emu3VQVAETemporalUpsampler  r  c                 J    t         |           t        ||dd      | _        y )Nr   r   r   r   r   r   r   r   rh   ri   r   r   rv   r  r  rw   s      r*   ri   z"Emu3VQVAETemporalUpsample.__init__  (    
 	#!	
	r,   r8   c                 P   |j                   \  }}}}}|j                  ddddd      j                         j                  |d|      }t	        j
                  |dd	      }|j                  ||||d      j                  ddddd      j                         }| j                  |      }|S )
Nr   r   r   r   r!   r    r   r   r   )r$   r   rX   r   r   r   r   )rv   r8   r   r   r   r   r   s          r*   r   z!Emu3VQVAETemporalUpsample.forward  s    8E8K8K5
Hh%--aAq!<GGINNz[]_ghm#IV%**:xPRS[[\]_`bcefhijuuw		-0r,   r  r   s   @r*   r  r    s*    

 
U\\ r,   r  c                   H     e Zd Zdedef fdZdej                  fdZ xZS )Emu3VQVAETemporalDownsampler  r  c                 J    t         |           t        ||dd      | _        y )N)r   r   r   )r!   r   r   r"  r#  r$  s      r*   ri   z$Emu3VQVAETemporalDownsample.__init__  r%  r,   r8   c                 (    | j                  |      }|S r   )r   r   s     r*   r   z#Emu3VQVAETemporalDownsample.forward  s    		-0r,   r  r   s   @r*   r(  r(    s*    

 
U\\ r,   r(  c                   (     e Zd Z	 d fd	Zd Z xZS )Emu3VQVAETemporalResnetBlockc                 p   t         |           || _        ||n|| _        t	        j
                  |      | _        t        ||dd      | _        t	        j
                  |      | _	        t        ||dd      | _
        | j                  | j                  k7  r t	        j                  ||ddd      | _        y y )Nr   r!  r"  r   r   r   )rh   ri   r   r  rS   BatchNorm3dnorm1r   conv1norm2conv2r  nin_shortcutr  s      r*   ri   z%Emu3VQVAETemporalResnetBlock.__init__  s    
 	&+7+?K\^^K0
$!	

 ^^L1
$!	

 t000 "		!D 1r,   c                 L   |}| j                  |      }|t        j                  |      z  }| j                  |      }| j	                  |      }|t        j                  |      z  }| j                  |      }| j                  | j                  k7  r| j                  |      }||z   S r   )	r/  r%   sigmoidr0  r1  r2  r   r  r3  )rv   r8   r   s      r*   r   z$Emu3VQVAETemporalResnetBlock.forward  s     

=1}55

=1

=1}55

=1t000((2H-''r,   r   r   r   s   @r*   r,  r,    s     @(r,   r,  c                   ~     e Zd Z	 	 ddedee   dee   f fdZddej                  deej                     fdZ xZ	S )	Emu3VQVAEResnetBlockr   r  quant_channelsc                    t         |           || _        ||n|}|| _        || _        |=t        j                  |ddd      | _        t        j                  |ddd      | _        n"t        ||      | _        t        ||      | _        t        j                  ||ddd      | _        t        j                  ||ddd      | _        | j                  | j                  k7  r t        j                  ||ddd      | _        y y )	Nr  r   Tr  r   r   r   r   )rh   ri   r   r  r8  rS   r  r/  r1  r  r   r0  r2  r3  )rv   r   r  r8  rw   s       r*   ri   zEmu3VQVAEResnetBlock.__init__  s    	&&2&:{(,!;2SW`deDJ<BTXaefDJ-nkJDJ-nlKDJYY

 YY

 t000 "		!D 1r,   r8   c                 v   | j                   dn|f}|} | j                  |g| }|t        j                  |      z  }| j	                  |      } | j
                  |g| }|t        j                  |      z  }| j                  |      }| j                  | j                  k7  r| j                  |      }||z   S Nr   )
r8  r/  r%   r5  r0  r1  r2  r   r  r3  )rv   r8   r8  	norm_argsr   s        r*   r   zEmu3VQVAEResnetBlock.forward  s    --5BN;L	 "

==9=}55

=1"

==9=}55

=1t000((2H-''r,   r   r   )
r   r   r   r   r   ri   r%   r   r   r   r   s   @r*   r7  r7    sU     '+(,	** sm* !	*X(U\\ (8ELLCY (r,   r7  c            
            e Zd ZdZdef fdZ	 ddej                  deej                     de	ej                  eej                     f   fdZ
 xZS )	Emu3VQVAEAttentionBlockra   rb   c                 &   t         |           || _        |j                  | _        |j
                  | _        | j                  | j                  z  | _        | j                  | j                  z  | j                  k7  r&t        d| j                   d| j                   d      | j                  dz  | _	        |j                  | _        d| _        t        j                  | j                  | j                        | _        t        j                  | j                  | j                        | _        t        j                  | j                  | j                        | _        t        j                  | j                  | j                        | _        d| _        y )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).re   Fr   )rh   ri   rb   rk   r   rl   	num_headsrA   
ValueErrorscalerm   rI   rn   rS   ro   rr   rs   rq   out_projrP   r   s     r*   ri   z Emu3VQVAEAttentionBlock.__init__(  s$   ++33$..8==4>>)T^^;MdnnM] ^NN#2'  ]]D(
//ii?ii?ii?		$..$..A %&!r,   r8   rG   r:   c           
      :   |j                   \  }}}| j                  |      }| j                  |      }| j                  |      }	|j	                  ||| j
                  | j                        j                  dd      }|j	                  ||| j
                  | j                        j                  dd      }|	j	                  ||| j
                  | j                        j                  dd      }	t        }
| j                  j                  dk7  rt        | j                  j                     }
 |
| |||	|| j                  | j                  | j                  sdn| j                        \  }}|j!                  |||      j#                         }| j%                  |      }||fS )z#Input shape: Batch x Time x Channelr   r!   r   r   )rn   rH   rI   )r$   rq   rr   rs   r   r@  rA   rR   r^   rb   r   r   rn   rB  rO   rI   r=   rX   rC  )rv   r8   rG   rJ   r   
seq_lengthr   querieskeysvaluesr   r]   r[   s                r*   r   zEmu3VQVAEAttentionBlock.forward?  sa    -:,?,?)
J	++m,{{=)]+,,z:t~~t}}U__`acdeyyZOYYZ[]^_ZT^^T]]S]]^_abc(?;;++w6"9$++:Z:Z"[$7nnJJ#}}C$,,	%
!\ "))*j)LWWYmmK0L((r,   r   )r   r   r   r   r   ri   r%   r   r   r   r   r   r   s   @r*   r>  r>  %  s\    G& &4 26$)||$) !.$)
 
u||Xell33	4$)r,   r>  c                   *     e Zd ZdZ fdZddZ xZS )Emu3VQVAEGroupNormz
    Same as the torch GroupNorm with the only difference that this ones accepts
    an optional kwarg `quant_states` which is not used. This class makes it easier to
    use SpatialNorm or GroupNorm without conditionals
    c                 $    t        |   di | y r;  )rh   ri   )rv   rJ   rw   s     r*   ri   zEmu3VQVAEGroupNorm.__init__m  s    "6"r,   c                     t        j                  || j                  | j                  | j                  | j
                        S r   )r   
group_normr  r   rg   r   )rv   inputr  s      r*   r   zEmu3VQVAEGroupNorm.forwardp  s)    ||E4??DKKDHHUUr,   r   )r   r   r   r   ri   r   r   r   s   @r*   rJ  rJ  f  s    #Vr,   rJ  c                   `     e Zd Zd fd	Zddej
                  deej
                     fdZ xZS )Emu3VQVAEMiddleBlockc                     t         |           t        |||      | _        t	        |      | _        |t        |ddd      | _        nt        ||      | _        t        |||      | _	        y )Nr   r  r8  r  r   Tr  )
rh   ri   r7  block_1r>  attn_1rJ  	attn_normr  block_2)rv   rb   r   r8  rw   s       r*   ri   zEmu3VQVAEMiddleBlock.__init__u  so    +#$)

 .f5!/[UW]ajnoDN1.+NDN+#$)
r,   r8   r  c                 b   | j                  ||      }|}| j                  ||      }|j                  \  }}}}|j                  ||||z        j	                  dd      }| j                  |      d   }|j                  ||||      j                  dddd      }||z   }| j                  ||      }|S )Nr   r!   r   r   )	rS  rU  r$   r   rR   rT  r=   r   rV  )rv   r8   r  r   r   r   r   r   s           r*   r   zEmu3VQVAEMiddleBlock.forward  s    ]LA }lC.;.A.A+
Hfe%**:x%PZZ[\^_`M215%--j&%RZZ[\^_abdef =0]LAr,   r   )	r   r   r   ri   r%   FloatTensorr   r   r   r   s   @r*   rP  rP  t  s,    
(
U%6%6 
huO`O`Fa 
r,   rP  c                   >     e Zd Z fdZdej
                  fdZ xZS )Emu3VQVAEDownBlockc           
         t         |           t        |j                        | _        |j
                  | _        |j                  }|j                  }dt        |      z   }|| _        t        j                         | _        t        | j                        D ]K  }t        j                         }t        j                         }t        j                         }|||   z  }	|||   z  }
t        | j
                        D ]~  }|j                  t        |	|
             |
}	|j                  .||j                  v s=|j                  t!        |             |j                  t        j"                  |	ddd              t        j$                         }||_        ||_        ||_        || j                  dz
  k7  rt-        |	      |_        | j                  j                  |       N y )N)r   r   r  r  r   Tr  r   )rh   ri   lenchannel_multipliernum_resolutionsnum_res_blocksbase_channelsr   in_channel_multiplierrS   
ModuleListdownrangeappendr7  attn_resolutionsr>  r  Moduleblockattn
attn_normsr   
downsample)rv   rb   ra  r^  rb  i_levelri  rj  rk  block_in	block_outi_blockrd  rw   s                r*   ri   zEmu3VQVAEDownBlock.__init__  s   "6#<#<=$33,,#66 $u-?'@ @%:"MMO	T112 	#GMMOE==?DJ$'<W'EEH%(:7(CCI !4!45 
q($,%. %**67fF]F];]KK 7 ?@%%bllUW]ajn&op
q 99;DDJDI(DO$..22"@"JIIT"1	#r,   r8   c                 >   t        | j                        D ]  \  }}t        | j                        D ]  } |j                  |   |      }t        |j                        dkD  s1|} |j                  |   |      }|j                  \  }}}}	|j                  ||||	z        j                  dd      } |j                  |   |      d   }|j                  |||	|      j                  dddd      }||z   } || j                  dz
  k7  s|j                  |      } |S )Nr   r   r!   r   )	enumeraterd  re  r`  ri  r]  rj  rk  r$   r   rR   r=   r   r_  rl  )
rv   r8   rm  blocksrp  r   r   r   r   r   s
             r*   r   zEmu3VQVAEDownBlock.forward  s5   (3 	AOGV !4!45 = 5W 5m Dv{{#a',H$>F$5$5g$>}$MM:G:M:M7J&%$1$6$6z8VV[^$\$f$fghjk$lM$8FKK$8$G$JM$1$9$9*feU]$^$f$fghjkmnpq$rM$,}$<M= $..22 & 1 1- @	A" r,   r   r   r   ri   r%   rX  r   r   r   s   @r*   rZ  rZ    s    ##JU%6%6 r,   rZ  c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )Emu3VQVAEUpBlockc           	         t         |           t        |j                        | _        |j
                  | _        |j                  }|j                  |j                  d   z  }t        j                         | _
        t        t        | j                              D ]5  }t        j                         }t        j                         }t        j                         }|j                  |j                  |   z  }t        | j
                  dz         D ]e  }	|j                  t        |||             |}||j                  v s1|j                  t!        |             |j                  t#        ||             g t        j$                         }
||
_        ||
_        ||
_        |dk7  rt-        |      |
_        | j                  j1                  d|
       8 y )Nr    r   rR  r   )rh   ri   r]  r^  r_  r`  r   ra  rS   rc  upreversedre  rf  r7  rg  r>  r  rh  ri  rj  rk  r   upsampleinsert)rv   rb   r8  rn  rm  ri  rj  rk  ro  rp  rx  rw   s              r*   ri   zEmu3VQVAEUpBlock.__init__  s   "6#<#<=$33))''&*C*CB*GG--/d&:&: ;< 	"GMMOE==?DJ,,v/H/H/QQI !4!4q!89 V($,%.'5 %f555KK 7 ?@%%&:>8&TUV BBHBG&BM!|:8DGGNN1b!3	"r,   r8   r  c                 h   t        | j                  d d d         D ]  \  }}t        | j                  dz         D ]  } |j                  |   ||      }t        |j                        dkD  s2|} |j                  |   ||      }|j                  \  }}}	}
|j                  |||	|
z        j                  dd      } |j                  |   |      d   }|j                  ||	|
|      j                  dddd      }||z   } |t        | j                        dz
  k7  s|j                  |      } |S )Nr    r   r   r!   r   )rr  rx  re  r`  ri  r]  rj  rk  r$   r   rR   r=   r   rz  )rv   r8   r  rm  rs  rp  r   r   r   r   r   s              r*   r   zEmu3VQVAEUpBlock.forward  sD   (27 	?OGV !4!4q!89 = 5W 5m\ Rv{{#a',H$>F$5$5g$>}l$[M:G:M:M7J&%$1$6$6z8VV[^$\$f$fghjk$lM$8FKK$8$G$JM$1$9$9*feU]$^$f$fghjkmnpq$rM$,}$<M= #dgg,** & >	?  r,   rt  r   s   @r*   rv  rv    s(    #"JU%6%6 eFWFW r,   rv  c                   >     e Zd Z fdZdej
                  fdZ xZS )Emu3VQVAEEncoderc                    t         |           |j                  }|j                  }|j                  }|j
                  }|j                  }|rd|z  n|}||d   z  }t        j                  j                  ||ddd      | _
        t        |      | _        t        ||      | _        t        j                  j                  d|dd	      | _        t        j                  j                  ||ddd      | _        t%        t'        j(                  |j*                              }	t        j,                         | _        t        j,                         | _        t3        |	      D ])  }
t5        ||      }| j.                  j7                  |       + t3        |j8                        D ]*  }t;        ||
      }| j0                  j7                  |       , y )Nr!   r    r   r   r   r  r   T)r  r  r   r  r\  )rh   ri   ra  r   double_latentlatent_channelsr^  r%   rS   r   conv_inrZ  
down_blockrP  middle_blockr  norm_outconv_outr   mathlog2temporal_downsample_factorrc  	time_convtime_res_stackre  r(  rf  r`  r,  )rv   rb   ra  r   r  r  r^  r  rn  temporal_down_blocksir   r   time_res_convrw   s                 r*   ri   zEmu3VQVAEEncoder.__init__  s   ,,((,, 00#66.;q?* #5b#99xx{MqYZdef,V40B**bxUYbf*g ( 
  #499V-N-N#OP mmo+, 	(A.|\JDNN!!$'	( v,,- 	6A8()M &&}5	6r,   pixel_valuesc                 h   |j                   d   } |j                  dg|j                   dd   }| j                  |      }| j                  |      }| j	                  |      }| j                  |      }|t        j                  |      z  }| j                  |      } |j                  d|g|j                   dd   }|j                  ddddd      }| j                  D ]"  } ||      }|t        j                  |      z  }$ | j                  D ]
  } ||      } |j                  ddddd      }|S )Nr   r    r!   r   r   r   )r$   r=   r  r  r  r  r%   r5  r  r   r  r  )rv   r  temporal_dimr8   r   layers         r*   r   zEmu3VQVAEEncoder.forward3  sH   #))!,+|++BH1C1CAB1GH \26))-8 m4}55m4---b,YATATUVUWAXY%--aAq!< NN 	:D /MU]]=99M	: (( 	1E!-0M	1 &--aAq!<r,   )r   r   r   ri   r%   r   r   r   r   s   @r*   r~  r~    s    %6NE$4$4 r,   r~  c                   \     e Zd Zdef fdZdej                  dej                  fdZ xZS )Emu3VQVAEDecoderrb   c                    t         	|           |j                  }|j                  |j                  d   z  }t        j                         | _        t        |j                        D ]>  }t        |j                  |j                        }| j                  j                  |       @ t        t        j                  |j                               }t        j                         | _        t        |      D ]=  }t%        |j                  |j                        }| j"                  j                  |       ? t        j&                  |j                  |ddd      | _        t+        |||      | _        t/        |      | _        |j                  |j                  d   z  }t3        ||      | _        t        j&                  ||j6                  ddd      | _        y )Nr    r\  r   r   r   )r8  r   )rh   ri   r   ra  r^  rS   rc  r  re  r`  r,  r  rf  r   r  r  r  r  r  r   r  rP  r  rv  up_blockr  r  r  r  )
rv   rb   r8  rn  r   r  temp_upsample_block_numr  r   rw   s
            r*   ri   zEmu3VQVAEDecoder.__init__R  s   ))''&*C*CB*GG mmov,,- 	6A8"22AWAWM &&}5		6 #&dii0Q0Q&R"S./ 	(A,V-C-CVE[E[\DNN!!$'	( yy""
 1R`a(0''&*C*CA*FF,^XF		
r,   r8   r  c                    t        j                  ||fd      }|j                  ddddd      }| j                  D ]
  } ||      } | j                  D ]"  } ||      }|t        j
                  |      z  }$ |j                  ddddd      }t        j                  |dd      \  }} |j                  dg|j                  dd   } |j                  dg|j                  dd   }| j                  |      }| j                  ||      }| j                  ||      }| j                  ||      }|t        j
                  |      z  }| j                  |      }|S )Nr   r"   r!   r   r   r   r    )r%   r&   r   r  r  r5  chunkr=   r$   r  r  r  r  r  )rv   r8   r  hidden_quant_statesr  s        r*   r   zEmu3VQVAEDecoder.forwardy  sp   #ii(E1M199!Q1aH (( 	=E"'(;"<	= ^^ 	FE"'(;"<5==1D#EE	F 299!Q1aH&+kk2Eqa&P#|---bK=3F3Fqr3JK+|++BH1C1CAB1GH]3 ))-Fm\Bm\B}55m4r,   )	r   r   r   r   ri   r%   r   r   r   r   s   @r*   r  r  Q  s+    %
 %
NU\\  r,   r  aR  
    The VQ-VAE model used in Emu3 for encoding/decoding images into discrete tokens.
    This model follows the "Make-a-scene: Scene-based text-to-image generation with human priors" paper from
    [ Oran Gafni, Adam Polyak, Oron Ashual, Shelly Sheynin, Devi Parikh, and Yaniv
    Taigman](https://huggingface.co/papers/2203.13131).
    )custom_introc                        e Zd ZU eed<   dZdZdZdZdZ	dZ
g dZd Zdef fdZdej                  dej                  fd	Zd
ej                  fdZ xZS )	Emu3VQVAErb   
emuvideovqr  T)r,  r>  r7  r   c                 |   t        |t        j                  t        j                  f      rt        j                  j                  |j                  dd       |j                  qt        j                  j                  |j                        \  }}dt        j                  |      z  }t        j                  j                  |j                  | |       y y t        |t        j                        rt        j                  j                  |j                  t        j                  d             |j                  xt        j                  j                  |j                        \  }}|dkD  rdt        j                  |      z  nd}t        j                  j                  |j                  | |       y y t        |t        j                  t        j                  t        j                   f      rUt        j                  j#                  |j                  d       t        j                  j#                  |j                  d	       y t        |t        j$                        rc|j                  j&                  j)                          |j*                  2|j                  j&                  |j*                     j-                          y y y )
Nfan_outrelu)r   nonlinearityr      )ar   r   r   )
isinstancerS   r   r  initkaiming_normal_r   rg   _calculate_fan_in_and_fan_outr  sqrtr   ro   kaiming_uniform_BatchNorm2dr.  r  	constant_r   r   normal_padding_idxzero_)rv   rC   fan_inr   bounds        r*   _init_weightszEmu3VQVAE._init_weights  s   fryy"))45GG##FMM	PV#W{{&GGAA&--P	DIIf--  ufe< ' 		*GG$$V]]diil$C{{&GGAA&--P	17!DIIf--  ufe< '  NOGGfmmS1GGfkk3/-MM&&(!!-""6#5#56<<> . .r,   c                    t         |   |       || _        t        |      | _        t        |      | _        t        |      | _        dt        |j                        dz
  z  | _        t        |j                  |j                  dd      | _        t        |j                  |j                  dd      | _        dt        |j                        dz
  z  | _        | j%                          | j'                          y )Nr!   r   )r   r   r   r!  r"  )rh   ri   rb   r~  encoderr  decoderr   quantizer]  r^  vision_spatial_factorr   r  r   
quant_convpost_quant_convspatial_scale_factoreval	post_initr   s     r*   ri   zEmu3VQVAE.__init__  s     '/'/08%&3v/H/H+IA+M%N")""F$4$4)T]
  /f44)T] 
 %&#f.G.G*H1*L$M!		r,   image_sizesc                    |j                   dk(  }|rL| j                  j                  }|j                  \  }}}}|j	                  d      j                  d|ddd      }n|j                  \  }}}}}| j                  |      }	|	j                  ddddd      }	| j                  |	      }	|	j                  ddddd      }	| j                  |	      }
|r|
j                  d      n|
}t        ||      D cg c]B  \  }}|d t        |d   | j                  z        d t        |d   | j                  z        f   D }}}|S c c}}w )Nr   r   r   r!   r   )ndimrb   r  r$   r.   repeatr  r   r  r  squeezer  r   r  )rv   r  r  is_imager   r   r   r   r   r8   codesimage_tokenssingle_imager  s                 r*   encodezEmu3VQVAE.encode  sX   $$){{==H2>2D2D/J&%'11!4;;AxAqQL<H<N<N9J(FE\2 &--aAq!<6 &--aAq!<m,+3u}}Q' '*,&D
"d D3tAw)C)CCDDFqDQRGVZVpVpLpHqFqqr
 

 
s   1AD=r8   c                    |j                   dk(  }|r|j                  d      }|j                  \  }}}}| j                  j	                  |j                               }|j                  d   }|j                  |||||      j                  ddddd      j                         }| j                  |      }	|j                  ddddd      }|	j                  ddddd      }	| j                  |	|      }
|
j                  ||| j                  j                  z  | j                  j                  || j                  z  || j                  z        }
|r	|
d d df   S |
S )Nr   r   r    r   r   r!   )r  r.   r$   r  r   flattenr   r   rX   r  r  r=   rb   r  r  r  )rv   r8   r  r   r   r   r   quantr   
post_quantvideos              r*   decodezEmu3VQVAE.decode  sK    %%*)33A6M.;.A.A+
Hfe''(=(=(?@;;r?

:xIQQRSUVXY[\^_`kkm))%0
aAq!,''1aA6
Z/t{{===KK$$T...D---
 'uQT{1E1r,   )r   r   r   r   __annotations__base_model_prefixmain_input_name_supports_sdpa_supports_flash_attn_supports_flex_attn_supports_attention_backend_no_split_modulesr  ri   r%   r   r  r  r   r   s   @r*   r  r    sq     $$ON"&?* *5<< ell 82ELL 2r,   r  c                       e Zd ZdZd Zed        Zed        Zed        Zed        Z	ed        Z
ed        Zd	eej                     d
ej                  fdZd	ej                  d
ej                  fdZy)Emu3ImageVocabularyMappingzM
    A class for mapping discrete image tokens from VQGAN to BPE tokens.
    c                 j    || _         |j                  d      | _        |j                  d      | _        y )Nz<|extra_200|>z<image>)	vocab_mapgeteol_token_idimage_token_id)rv   r  s     r*   ri   z#Emu3ImageVocabularyMapping.__init__  s+    "%MM/:'mmI6r,   c           	          t        | j                  j                         D cg c]  \  }}|j                  d      s| c}}      S c c}}w Nz<|visual tokensortedr  items
startswithrv   namevals      r*   r  z'Emu3ImageVocabularyMapping.image_tokens  s8    DNN,@,@,BhytSdooVfFgshiih
   A	
A	
c           	          t        | j                  j                         D cg c]  \  }}|j                  d      s| c}}      S c c}}w r  r  r  s      r*   image_tokens_strz+Emu3ImageVocabularyMapping.image_tokens_str  s8    T^^-A-A-Ci	ctWgGhtijjir  c                 t    | j                   D ci c]  }t        |dd       | j                  |     c}S c c}w )NirL   )r  r   r  )rv   tokens     r*   img2bpez"Emu3ImageVocabularyMapping.img2bpe   s5    FJF[F[\UE"RL!4>>%#88\\\s   #5c                 j    | j                   j                         D ci c]  \  }}||
 c}}S c c}}w r   )r  r  )rv   r0   vs      r*   bpe2imgz"Emu3ImageVocabularyMapping.bpe2img$  s+    !%!3!3!56A1666s   /c                     t        j                  t        | j                  j	                               dz   t         j
                        }| j                  j                         D ]
  \  }}|||<    |S Nr   rM   )r%   zerosmaxr  rG  r   r  rv   mappingr0   r  s       r*   bpe2img_mapping_tensorz1Emu3ImageVocabularyMapping.bpe2img_mapping_tensor(  [    ++c$,,"3"3"56:%))LLL&&( 	DAqGAJ	r,   c                     t        j                  t        | j                  j	                               dz   t         j
                        }| j                  j                         D ]
  \  }}|||<    |S r  )r%   r  r  r  rG  r   r  r  s       r*   img2bpe_mapping_tensorz1Emu3ImageVocabularyMapping.img2bpe_mapping_tensor/  r  r,   	img_batchr:   c                 ,   |j                   }t        j                  |j                  d   dft        j                        | j
                  z  }| j                  |j                  d         }t        j                  ||gd      }|j                  |      S )Nr   r   r  cpur    r"   )	devicer%   r   r$   r   r  r  rW   r&   )rv   r  r  eol_row
img_tokenss        r*   convert_img2bpez*Emu3ImageVocabularyMapping.convert_img2bpe6  sw    !!**iooa0!4EIIFIZIZZ00e1DE
YY
G4"=
}}V$$r,   c                     |j                   }|dd df   }| j                  |j                  d         }|j                  |      S )N.r    r  )r  r  rW   )rv   r  r  r  s       r*   convert_bpe2imgz*Emu3ImageVocabularyMapping.convert_bpe2img=  sG    !!c3B3h'	00e1DE
}}V$$r,   N)r   r   r   r   ri   r   r  r  r  r  r  r  listr%   r   r  r  r   r,   r*   r  r    s    7
 j j k k ] ] 7 7    %ell); % %% %%,, %r,   r  c                   F    e Zd ZU eed<   dZdZdgZddgZdZ	dZ
dZdZdZdZy)	Emu3PreTrainedModelrb   modelTr   ry   r\   FN)r   r   r   r   r  r  supports_gradient_checkpointingr  _skip_keys_device_placementr  r  _can_compile_fullgraph!_supports_param_buffer_assignmentr  r  r   r,   r*   r  r  D  sO    &*# $5m"DN!(-%"&r,   r  c                   ~     e Zd ZU ej                  ed<   ddef fdZ ej                         e	d               Z
 xZS )Emu3RotaryEmbeddinginv_freqrb   c                    t         |           t        |d      rUt        |j                  t
              r;|j                  j                  d|j                  j                  d            | _        nd| _        |j                  | _	        |j                  | _
        || _        t        | j                     | _        | j                  | j                  |      \  }| _        | j                  d|d       | j                   | _        y )Nrope_scaling	rope_typetypedefaultr  F)
persistent)rh   ri   hasattrr  r  dictr  r  max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenrb   r   rope_init_fnattention_scalingregister_bufferr  original_inv_freq)rv   rb   r  r  rw   s       r*   ri   zEmu3RotaryEmbedding.__init__Y  s    6>*z&:M:Mt/T#0044[&BUBUBYBYZ`BabDN&DN"("@"@$*$B$B!/?+/+<+<T[[&+Q($(ZeD!%r,   c                 b   | j                   d d d d f   j                         j                  |j                  d   dd      j	                  |j
                        }|d d d d d f   j                         }t        |j
                  j                  t              r/|j
                  j                  dk7  r|j
                  j                  nd}t        j                  |d      5  |j                         |j                         z  j                  dd      }t        j                  ||fd	      }|j                         | j                  z  }|j                         | j                  z  }	d d d        j	                  |j                   
      	j	                  |j                   
      fS # 1 sw Y   AxY w)Nr   r    r   mpsr  F)device_typeenabledr!   r"   r  )r  floatr<   r$   rW   r  r  r  strr%   autocastrR   r&   r1   r  r2   rM   )
rv   r'   r3   inv_freq_expandedposition_ids_expandedr  freqsembr1   r2   s
             r*   r   zEmu3RotaryEmbedding.forwardj  sV    !MM$4-8>>@GGHZHZ[\H]_acdehhijiqiqr ,QaZ 8 > > @'1!((--'E!((--[`J`ahhmmfk^^UC 	5&,,.1F1L1L1NNYYZ[]^_E))UEN3C'')d444C'')d444C		5 vvAGGv$cff177f&;;;	5 	5s    BF%%F.r   )r   r   r   r%   r   r  r   ri   no_gradr   r   r   r   s   @r*   r  r  V  s=    ll/z /" U]]_<  <r,   r  c                       e Zd ZeedZdef fdZee		 	 	 	 	 	 	 dde
ej                     de
ej                     de
ej                     de
e   de
ej                     d	e
ej                     d
e
e   dee   defd              Z xZS )Emu3TextModel)r8   
attentionsrb   c           	         t         |   |       |j                  | _        |j                  | _        t        j                  |j                  |j                  | j                        | _        t        j                  t        |j                        D cg c]  }t        ||       c}      | _        t        |j                  |j                        | _        t#        |      | _        d| _        | j)                          y c c}w )Nr   rb   F)rh   ri   pad_token_idr  
vocab_sizerS   r   rk   embed_tokensrc  re  num_hidden_layersr   layersr   r   normr  
rotary_embgradient_checkpointingr  ru   s      r*   ri   zEmu3TextModel.__init__  s     !.. ++LL):):F<N<NPTP`P`ammBGH`H`BabYfi0b
   2 28K8KL	-V<&+# 	 cs   D	input_idsrG   r3   ry   inputs_embedsr   r   rJ   r:   c           
      B   |d u |d uz  rt        d      || j                  |      }|r|t        | j                        }|F||j	                         nd}	t        j                  |	|	|j                  d   z   |j                        }||j                  d      }t        | j                  |||||      }
|}| j                  ||      }| j                  d | j                  j                   D ]  } ||f|
||||d|} | j                  |      }t        ||      S )	Nz:You must specify exactly one of input_ids or inputs_embedsr,  r   r   )r  )rb   input_embedsrG   r   ry   r3   )rG   r3   ry   r   r~   )last_hidden_statery   )rA  r/  r
   rb   get_seq_lengthr%   aranger$   r  r.   r   r3  r1  r0  r2  r   )rv   r5  rG   r3   ry   r6  r   r   rJ   past_seen_tokensr\   r8   r~   decoder_layers                 r*   r   zEmu3TextModel.forward  s[    -t";<YZZ *.*;*;I*FM0*$++>O!CRC^==?de+0<< "2]5H5H5K"KTaThTh,N )33A6L(;;&))+%
 &"oom\J![[)H4;;+H+HI 		M)*) /-$7 M		 		-0&++
 	
r,   )NNNNNNN)r   r   r   r   r`   _can_record_outputsr   ri   r   r   r   r%   r   r   r	   rX  r   r   r   r   r   r   r   s   @r*   r)  r)  z  s     *#
z    151537+/5959$(8
E,,-8
 !.8
 u//0	8

 "%8
   1 128
 !!1!128
 D>8
 +,8
 
!8
  8
r,   r)  c                   p    e Zd ZU dgZddiZddgdgfiZeed<    fdZe	e
	 	 	 	 	 	 	 	 	 ddeej                     d	eej                     d
eej                     dee   deej                      deej                     dee   deej                     deeej                  f   dee   defd              Z xZS )Emu3ForCausalLMlm_head.weightlm_headcolwise_repr8   logitsrb   c                     t         |   |       t        |      | _        |j                  | _        t        j                  |j                  |j                  d      | _        | j                          y NFrf   )
rh   ri   r)  r  r.  rS   ro   rk   rB  r  r   s     r*   ri   zEmu3ForCausalLM.__init__  sU     "6*
 ++yy!3!3V5F5FUS 	r,   r5  rG   r3   ry   r6  labelsr   r   logits_to_keeprJ   r:   c
                 z    | j                   d|||||||d|
}|j                  }t        |	t              rt	        |	 d      n|	}| j                  |dd|ddf         }d}|* | j                  d||| j                  j                  d|
}t        |||j                  |j                  |j                        S )a  
        Example:

        ```python
        >>> from transformers import Emu3Processor, Emu3ForConditionalGeneration
        >>> import torch
        >>> import requests
        >>> from PIL import Image

        >>> model = Emu3ForCausalLM.from_pretrained("BAAI/Emu3-Chat-hf", dtype=torch.bfloat16)
        >>> processor = Emu3Processor.from_pretrained("BAAI/Emu3-Chat-hf")

        >>> inputs = processor(text=["Can you write me a poem about winter."], return_tensors="pt").to(model.device)

        >>> generated_ids = model.generate(**inputs, max_new_tokens=100, do_sample=False)
        >>> processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        ```r5  rG   r3   ry   r6  r   r   NrD  rG  r.  lossrD  ry   r8   r*  r   )r  r9  r  r   slicerB  loss_functionrb   r.  r   ry   r8   r*  )rv   r5  rG   r3   ry   r6  rG  r   r   rH  rJ   outputsr8   slice_indicesrD  rM  s                   r*   r   zEmu3ForCausalLM.forward  s    @ ,64:: 	,
)%+')	,
 	,
  118B>SV8W~ot4]kmA}a,?@A%4%%pVFt{{OeOepiopD%#33!//))
 	
r,   )	NNNNNNNNr   )r   r   r   _tied_weights_keys_tp_plan_pp_planr   r  ri   r   r   r   r%   r   r   r	   rX  r   r   r   r   r   r   r   r   r   s   @r*   r@  r@    s6   *+=)H_-z:;H  151537+/59-1$(59348
E,,-8
 !.8
 u//0	8

 "%8
   1 128
 ))*8
 D>8
 !!1!128
 c5<</08
 +,8
 
 8
  8
r,   r@  c                   `    e Zd ZddiZ fdZd Zd Zd Zd Zde	j                  d	e	j                  fd
Zde	j                  d	e	j                  fdZe	j                  de	j                  dedefd       Zde	j                  de	j                  de	j                  fdZee	 	 	 	 	 	 	 	 	 dde	j                  de	j                  d	e	j(                  dee	j(                     dee	j                     dee   dee	j                     dee   dee	j                     dee   deeef   fd              Z xZS )	Emu3Modelztext_model.model
text_modelc                     t         |   |       t        j                  |j                        | _        t        |j                        | _        t        |j                        | _        | j                          y r   )rh   ri   r)  _from_configtext_configrW  r  	vq_configvqmodelr  vocabulary_mapvocabulary_mappingr  r   s     r*   ri   zEmu3Model.__init__  sY     '44V5G5GH !1!12"<V=R=R"S 	r,   c                 6    | j                   j                         S r   )rW  get_input_embeddingsr   s    r*   r`  zEmu3Model.get_input_embeddings'  s    3355r,   c                 :    | j                   j                  |       y r   )rW  set_input_embeddingsrv   rF   s     r*   rb  zEmu3Model.set_input_embeddings*  s    ,,U3r,   c                     || _         y r   rW  rv   r  s     r*   set_decoderzEmu3Model.set_decoder-  s	    !r,   c                     | j                   S r   re  r   s    r*   get_decoderzEmu3Model.get_decoder0  s    r,   r  r  c                     | j                   j                  ||      }|D cg c]+  }| j                  j                  |      j	                         - }}t        j                  |      }|S c c}w )a  
        Tokenizes images into discrete tokens with VQGAN module. Converts
        obtained image tokens into BPE tokens and wraps with "boi" and "eoi"
        special tokens.

        Args:
            pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
                The tensors corresponding to the input images.
            image_sizes (`torch.LongTensor` of shape `(batch_size, 2)`):
                The sizes of the images in the batch, being (height, width) for each image.
        )r\  r  r^  r  r  r%   r&   )rv   r  r  image_tokens_listtokensbpe_tokens_list
bpe_tokenss          r*   get_image_tokenszEmu3Model.get_image_tokens3  sc     !LL//kJctuY_422BB6JRRTuuYY/
 vs   0A*c                    | j                  ||      }|D cg c];  \  }}|| j                  j                  z  || j                  j                  z  dz   z  = }}} | j                         |      }t	        j
                  ||      }|S c c}}w )a7  
        Tokenizes images into discrete tokens with VQGAN module and embeds
        them with text embeddings layer

        Args:
            pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)):
                The tensors corresponding to the input images.
        r   )ro  r\  r  r`  r%   split)rv   r  r  r  r   r   split_sizesimage_featuress           r*   get_image_featureszEmu3Model.get_image_featuresD  s     ,,\;G "-
 t||999et||GiGi>ilm>mn
 
 5224\B^[A
s   A B	r  r   r   c                     |ddddf   j                  d||dz         }| j                  j                  |      }| j                  j	                  |      }|S )a  
        Decodes generated image tokens from language model to continuous pixel values
        with VQGAN module via upsampling.

        Args:
            image_tokens (`torch.LongTensor` of shape `(batch_size, num_of_tokens)`):
                The tensors corresponding to the input images.
            height (`int`):
                Height of the generated image before upsampling.
            width (`int`):
                Width of the generated image before upsampling.
        Nr    r   )r   r^  r  r\  r  )rv   r  r   r   	sequencesimages         r*   decode_image_tokenszEmu3Model.decode_image_tokensV  sX     !CRC(--b&%!)D	..>>yI##L1r,   r5  r6  rs  c                 P   |m| | j                         t        j                  | j                  j                  t        j
                  |j                              k(  }|j                  d      }n|| j                  j                  k(  }|j                         }|j                  d      j                  |      j                  |j                        }|j                  d   |j                  d   z  }||   j                         |j                         k7  rt        d| d|       |S )z
        Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
        equal to the length of multimodal features. If the lengths are different, an error is raised.
        )rM   r  r    r   r   z6Image features and image tokens do not match: tokens: z, features )r`  r%   tensorr^  r  longr  allr   r.   	expand_asrW   r$   numelrA  )rv   r5  r6  rs  special_image_maskn_image_tokensn_image_featuress          r*   get_placeholder_maskzEmu3Model.get_placeholder_maski  s    !.2M$2K2K2MT44CC5::^k^r^rs3 " "4!7!7!;!*d.E.E.T.T!T+//1/99"=GGVYYZgZnZno)//2^5I5I!5LL+,2248L8L8NNHHXXcdtcuv  "!r,   rG   r3   ry   r   r   rJ   r:   c
           
      2   |du |duz  rt        d      | | j                         |      }|O| j                  ||      }t        j                  |d      }| j                  |||      }|j                  ||      } | j                  d||||||	d|
}|S )ap  
        image_sizes (`torch.LongTensor` of shape `(batch_size, 2)`):
            The sizes of the images in the batch, being (height, width) for each image. Image sizes can be obtained using
            [`AutoImageProcessor`]. See [`Emu3ImageProcessor.__call__`] for details ([]`Emu3Processor`] uses
            [`Emu3ImageProcessor`] for processing images).
        NzaYou cannot specify both input_ids and inputs_embeds at the same time, and must specify either oner   r"   )r6  rs  )rG   r3   ry   r6  r   r   r   )rA  r`  rt  r%   r&   r  masked_scatterrW  )rv   r5  r  r  rG   r3   ry   r6  r   r   rJ   image_embedsr  rP  s                 r*   r   zEmu3Model.forward  s    * -t";<s   7D557	BM#22<ML 99\q9L!%!:!:| "; " *889K\ZM "$// 
)%+')
 
 r,   )	NNNNNNNNN)r   r   r   _checkpoint_conversion_mappingri   r`  rb  rg  ri  r%   rX  r   ro  rt  r'  r   ry  r  r   r   r   r   r	   r   r   r   r   r   r   r   r   r   s   @r*   rV  rV    s   &8,%G"64"U->-> UM]M] "u/@/@ uO_O_ $ ]]0@0@ # VY  $"))":?:K:K"]b]n]n"0  '+*.$(1537+/59$(59.##. ''. \\	.
 !.. u//0. "%.   1 12. D>. !!1!12. +,. 
u,,	-.  .r,   rV  c                       e Zd ZdZdgZddddZ fdZd Zd	 Zd
e	j                  fdZd Zd Zed        Zed        Zed        Zd Zee	 	 	 	 	 	 	 	 	 	 	 d dej,                  dej.                  dej0                  deej0                     deej,                     dee   deej.                     dee   deej,                     deej,                     deeej0                  f   dee   d
ee e!f   fd              Z"	 	 	 	 	 	 	 d! fd	Z# xZ$S )"Emu3ForConditionalGeneration rA  zmodel.text_modelzmodel.vqmodelrB  )z^text_model.modelz^vqmodelz^text_model.lm_headc                     t         |   |       t        |      | _        t	        j
                  |j                  j                  |j                  j                  d      | _	        | j                          y rF  )rh   ri   rV  r  rS   ro   rZ  rk   r.  rB  r  r   s     r*   ri   z%Emu3ForConditionalGeneration.__init__  sS     v&
yy!3!3!?!?ASASA^A^ejkr,   c                 6    | j                   j                         S r   )r  r`  r   s    r*   r`  z1Emu3ForConditionalGeneration.get_input_embeddings  s    zz..00r,   c                 :    | j                   j                  |       y r   )r  rb  rc  s     r*   rb  z1Emu3ForConditionalGeneration.set_input_embeddings  s    

''.r,   r:   c                     | j                   S r   )rB  r   s    r*   get_output_embeddingsz2Emu3ForConditionalGeneration.get_output_embeddings  s    ||r,   c                 :    | j                   j                  |       y r   )r  rg  rf  s     r*   rg  z(Emu3ForConditionalGeneration.set_decoder  s    

w'r,   c                 6    | j                   j                         S r   )r  ri  r   s    r*   ri  z(Emu3ForConditionalGeneration.get_decoder  s    zz%%''r,   c                 .    | j                   j                  S r   )r  rW  r   s    r*   rW  z'Emu3ForConditionalGeneration.text_model  s    zz$$$r,   c                 .    | j                   j                  S r   )r  r\  r   s    r*   r\  z$Emu3ForConditionalGeneration.vqmodel  s    zz!!!r,   c                 .    | j                   j                  S r   )r  r^  r   s    r*   r^  z/Emu3ForConditionalGeneration.vocabulary_mapping  s    zz,,,r,   c                 :     | j                   j                  di |S r;  )r  ry  )rv   rJ   s     r*   ry  z0Emu3ForConditionalGeneration.decode_image_tokens  s    -tzz--777r,   r5  r  r  rG   r3   ry   r6  r   r   rG  rH  rJ   c                     | j                   d|||||||	d|}|d   }t        |t              rt        | d      n|}| j	                  |dd|ddf         }d}|
4 | j
                  d||
| j                  j                  j                  d|}t        |||j                  |j                  |j                        S )an  
        image_sizes (`torch.LongTensor` of shape `(batch_size, 2)`):
            The sizes of the images in the batch, being (height, width) for each image. Image sizes can be obtained using
            [`AutoImageProcessor`]. See [`Emu3ImageProcessor.__call__`] for details ([]`Emu3Processor`] uses
            [`Emu3ImageProcessor`] for processing images).
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import Emu3Processor, Emu3ForConditionalGeneration
        >>> import torch
        >>> import requests
        >>> from PIL import Image

        >>> model = Emu3ForConditionalGeneration.from_pretrained("BAAI/Emu3-Chat-hf", dtype=torch.bfloat16)
        >>> processor = Emu3Processor.from_pretrained("BAAI/Emu3-Chat-hf")

        >>> conversation = [
        ...     {
        ...     "role": "system",
        ...     "content": [
        ...         {"type": "text", "text": "You are a helpful assistant."},
        ...         ],
        ...     },
        ...     {
        ...     "role": "user",
        ...     "content": [
        ...         {"type": "image"},
        ...         {"type": "text", "text": "Please describe the image."},
        ...         ],
        ...     },
        ... ]

        >>> prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
        >>> image = Image.open(requests.get("https://www.ilankelman.org/stopsigns/australia.jpg", stream=True).raw)

        >>> inputs = processor(images=[image], text=[prompt], return_tensors="pt").to(model.device, torch.bfloat16)

        >>> generated_ids = model.generate(**inputs, max_new_tokens=100, do_sample=False)
        >>> processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        ```rJ  r   NrK  rL  r   )r  r  r   rN  rB  rO  rb   rZ  r.  r   ry   r8   r*  )rv   r5  r  r  rG   r3   ry   r6  r   r   rG  rH  rJ   rP  r8   rQ  rD  rM  s                     r*   r   z$Emu3ForConditionalGeneration.forward  s    | $** 	
)%+')	
 	
  
8B>SV8W~ot4]kmA}a,?@A%4%% f9P9P9[9[_eD &#33!//))
 	
r,   c	                 R    t        |   |f|||||||d|	}
|d   dk7  rd |
d<   |
S )N)ry   rG   r6  r   r3   r  r   r   r  )rh   prepare_inputs_for_generation)rv   r5  ry   rG   r6  r   r3   r   r  rJ   model_inputsrw   s              r*   r  z:Emu3ForConditionalGeneration.prepare_inputs_for_generation?  sZ     w<

+)')%%

 

 !!+/L(r,   )NNNNNNNNNNr   )NNNNNTN)%r   r   r   r  rR  r  ri   r`  rb  rS   rh  r  rg  ri  propertyrW  r\  r^  ry  r   r   r%   r   rX  r   r   r	   r   r   r   r   r   r   r   r   r  r   r   s   @r*   r  r    s   *+/#(&"1/ryy (( % % " " - -8  '+*.$(1537+/59$(59-134X
##X
 ''X
 \\	X

 !.X
 u//0X
 "%X
   1 12X
 D>X
 !!1!12X
 ))*X
 c5<</0X
 +,X
 
u,,	-X
  X
z  r,   r  )r  r@  r)  r  r  rV  )Nr   )r   )Wr  	functoolsr   typingr   r   r   r%   torch.nnrS   torch.nn.functionalrT   r   activationsr   cache_utilsr	   r
   
generationr   integrationsr   masking_utilsr   modeling_layersr   modeling_outputsr   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   utils.deprecationr   utils.genericr   configuration_emu3r   r   r   r+   r7   r   r   rB   rh  r   r^   r`   r   r   r   r   r   r   r   r  r  r(  r,  r7  r>  r  rJ  rP  rZ  rv  r~  r  r  r  r  r  r)  r@  rV  r  __all__r   r,   r*   <module>r     s'  .  % , ,     ! . ) 7 / 9 O K F & I I 0 / K K(6	UU\\ 	U# 	U%,, 	U& %II%<<% 
% <<	%
 U\\*% % % '(%4D)BII D)N Y'J")) J (J(bii  +1 +\$ryy $D	RYY 	299 bii :!299 !H		 .")) &.(299 .(b<(299 <(~>)bii >)BV V299 D8 8v7ryy 7tCryy CLCryy CL l2 l2l2^3% 3%l '/ ' '"!<")) !<H P
' P
 P
f I
)? I
 I
XV# Vrh#6 hVr,   