
    hK                     0   d dl mZmZmZ d dlZd dlmZ ddlmZ ddlm	Z	m
Z
mZ ddlmZ ddlmZ dd	lmZmZ dd
lmZ ddlmZ ddlmZmZmZmZ ddlmZmZ ddlm Z m!Z! ddl"m#Z# ddl$m%Z%m&Z&m'Z'm(Z(m)Z)m*Z* ddl+m,Z, ddl-m.Z.m/Z/m0Z0 ddl1m2Z2  e(       rddl3m4Z4  e*jj                  e6      Z7e& G d de!             Z8 G d dejr                        Z: G d dejr                        Z; ed       G d dejr                               Z< G d d ejr                        Z=d! Z>dDd"Z?d#ej                  d$eAd%ej                  fd&ZB	 dEd'ejr                  d(ej                  d)ej                  d*ej                  d+eej                     d,eCd-eCd.e#e%   fd/ZD G d0 d1ejr                        ZE G d2 d3ejr                        ZF G d4 d5e      ZG G d6 d7e8      ZH G d8 d9e      ZI G d: d;e8      ZJ e&d<=       G d> d?e8             ZK e&d@=       G dA dBe8e2             ZLg dCZMy)F    )CallableOptionalUnionN)nn   )ACT2FN)CacheDynamicCacheEncoderDecoderCache)use_kernel_forward_from_hub)create_causal_mask)_prepare_4d_attention_mask#_prepare_4d_attention_mask_for_sdpa)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutput)BaseModelOutputWithPastAndCrossAttentionsSeq2SeqLMOutputSeq2SeqModelOutput)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tupleis_torch_flex_attn_availableis_torchdynamo_compilinglogging)deprecate_kwarg   )	DiaConfigDiaDecoderConfigDiaEncoderConfig)DiaGenerationMixin)make_flex_block_causal_maskc                   <    e Zd ZU eed<   dZdZdZdZdZ	dZ
dZddgZy)DiaPreTrainedModelconfigmodelT	input_idsDiaEncoderLayerDiaDecoderLayerN)__name__
__module____qualname__r#   __annotations__base_model_prefixsupports_gradient_checkpointing_supports_flash_attn_supports_sdpa_supports_flex_attn_can_compile_fullgraphmain_input_name_no_split_modules     b/var/www/html/eduruby.in/venv/lib/python3.12/site-packages/transformers/models/dia/modeling_dia.pyr)   r)   ?   s<    &*#N!!O*,=>r<   r)   c                   `     e Zd ZdZdef fdZdej                  dej                  fdZ xZ	S )DiaMultiChannelEmbeddinga  In order to efficiently compute the audio embedding from the 9 different channels,
    we vectorize the embedding process by using a single embedding layer and an offset.
    Example:
    - num_embeds = 4
    - vocab_size = 8
    - num_channels = 3
    We would have offsets = [0, 8, 16]
    If audio_codes = [0, 1, 2, 3], [1, 3, 4, 7], [5, 6, 7, 8],
    then tokens = audio_codes + offsets
                = [0, 1, 2, 3, 9, 11, 12, 15, 21, 22, 23, 24]
    This allows us to use a single embedding layer for all channels.
    r*   c                 ~   t         |           t        j                  |j                  |j
                  z  |j                        | _        |j                  | _        |j
                  | _        t        j                  |j
                  t        j                        |j                  z  }| j                  d|d       y )NdtypeoffsetsF
persistent)super__init__r   	Embedding
vocab_sizenum_channelshidden_sizeembedtorcharangelongregister_buffer)selfr*   rC   	__class__s      r=   rG   z!DiaMultiChannelEmbedding.__init__Z   s    \\&"3"3f6I6I"I6K]K]^
!--"//,,v22%**EHYHYYYEBr<   audio_codesreturnc                 "   || j                   j                  |j                        z   j                  d      }| j	                  |      j                  |j                  d   |j                  d   d| j                        }|j                  d      S )Nr"   r      dim)	rC   todevicesqueezerL   viewshaperK   sum)rQ   rS   tokensembedss       r=   forwardz DiaMultiChannelEmbedding.forwardb   su    0B0B CCLLQOF#((a+:K:KA:NPRTXTdTdezzaz  r<   )
r/   r0   r1   __doc__r$   rG   rM   Tensorrb   __classcell__rR   s   @r=   r?   r?   L   s2    C/ C!5<< !ELL !r<   r?   c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )DiaMLPc                 *   t         |           || _        t        j                  |j
                  d|j                  z  d      | _        t        j                  |j                  |j
                  d      | _        t        |j                     | _        y )NrW   Fbias)rF   rG   r*   r   LinearrK   intermediate_sizegate_up_proj	down_projr   
hidden_actactivation_fnrQ   r*   rR   s     r=   rG   zDiaMLP.__init__i   sp    IIf&8&8!f>V>V:V]bc6#;#;V=O=OV[\#F$5$56r<   hidden_statesrT   c                     | j                  |      }|j                  dd      \  }}|| j                  |      z  }| j                  |      S )NrW   rV   rX   )rn   chunkrq   ro   )rQ   rs   	up_statesgates       r=   rb   zDiaMLP.forwardq   sL    %%m4	#//!/4i 2 24 88	~~i((r<   )r/   r0   r1   rG   rM   FloatTensorrb   re   rf   s   @r=   rh   rh   h   s'    7)U%6%6 )5;L;L )r<   rh   RMSNormc                   ,     e Zd Zd fd	Zd Zd Z xZS )
DiaRMSNormc                     t         |           t        j                  t	        j
                  |            | _        || _        y)z9
        DiaRMSNorm is equivalent to T5LayerNorm
        N)rF   rG   r   	ParameterrM   onesweightvariance_epsilon)rQ   rK   epsrR   s      r=   rG   zDiaRMSNorm.__init__|   s1     	ll5::k#:; #r<   c                 "   |j                   }|j                  t        j                        }|j	                  d      j                  dd      }|t        j                  || j                  z         z  }| j                  |j                  |      z  S )NrW   rV   T)keepdim)	rB   rZ   rM   float32powmeanrsqrtr   r   )rQ   rs   input_dtypevariances       r=   rb   zDiaRMSNorm.forward   sy    #))%((7 $$Q',,R,>%Ht?T?T4T(UU{{]--k:::r<   c                 ^    t        | j                  j                         d| j                   S )Nz, eps=)tupler   r^   r   rQ   s    r=   
extra_reprzDiaRMSNorm.extra_repr   s*    ))*+6$2G2G1HIIr<   )gư>)r/   r0   r1   rG   rb   r   re   rf   s   @r=   r{   r{   z   s    $;Jr<   r{   c                   ~     e Zd ZU ej                  ed<   ddef fdZ ej                         e	d               Z
 xZS )DiaRotaryEmbeddinginv_freqr*   c                    t         |           t        |d      rUt        |j                  t
              r;|j                  j                  d|j                  j                  d            | _        nd| _        |j                  | _	        |j                  | _
        || _        t        | j                     | _        | j                  | j                  |      \  }| _        | j                  d|d       | j                   | _        y )Nrope_scaling	rope_typetypedefaultr   FrD   )rF   rG   hasattr
isinstancer   dictgetr   max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenr*   r   rope_init_fnattention_scalingrP   r   original_inv_freq)rQ   r*   r[   r   rR   s       r=   rG   zDiaRotaryEmbedding.__init__   s    6>*z&:M:Mt/T#0044[&BUBUBYBYZ`BabDN&DN"("@"@$*$B$B!/?+/+<+<T[[&+Q($(ZeD!%r<   c                 b   | j                   d d d d f   j                         j                  |j                  d   dd      j	                  |j
                        }|d d d d d f   j                         }t        |j
                  j                  t              r/|j
                  j                  dk7  r|j
                  j                  nd}t        j                  |d      5  |j                         |j                         z  j                  dd      }t        j                  ||fd	      }|j                         | j                  z  }|j                         | j                  z  }	d d d        j	                  |j                   
      	j	                  |j                   
      fS # 1 sw Y   AxY w)Nr   rV   r"   mpscpuF)device_typeenabledrW   rX   rA   )r   floatexpandr^   rZ   r[   r   r   strrM   autocast	transposecatcosr   sinrB   )
rQ   xposition_idsinv_freq_expandedposition_ids_expandedr   freqsembr   r   s
             r=   rb   zDiaRotaryEmbedding.forward   sV    !MM$4-8>>@GGHZHZ[\H]_acdehhijiqiqr ,QaZ 8 > > @'1!((--'E!((--[`J`ahhmmfk^^UC 	5&,,.1F1L1L1NNYYZ[]^_E))UEN3C'')d444C'')d444C		5 vvAGGv$cff177f&;;;	5 	5s    BF%%F.N)r/   r0   r1   rM   rd   r2   r#   rG   no_gradr   rb   re   rf   s   @r=   r   r      s=    ll/y /" U]]_<  <r<   r   c                     | dd| j                   d   dz  f   }| d| j                   d   dz  df   }t        j                  | |fd      S )z*Rotates half the hidden dims of the input..NrV   rW   rX   )r^   rM   r   )r   x1x2s      r=   rotate_halfr      sZ    	
3"!''"+"""	#B	
3q ""	#B99rc2YB''r<   c                     |j                  |      }|j                  |      }| |z  t        |       |z  z   }||z  t        |      |z  z   }||fS )a  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    )	unsqueezer   )qkr   r   r   unsqueeze_dimq_embedk_embeds           r=   apply_rotary_pos_embr      sY    ( --
&C
--
&C3w;q>C/0G3w;q>C/0GGr<   rs   n_reprT   c                     | j                   \  }}}}|dk(  r| S | dddddddddf   j                  |||||      } | j                  |||z  ||      S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    r"   N)r^   r   reshape)rs   r   batchnum_key_value_headsslenhead_dims         r=   	repeat_kvr      so    
 2?1D1D.Ehz!!Qa"23::5BUW\^bdlmM  (;e(CT8TTr<   modulequerykeyvalueattention_maskscalingdropoutkwargsc                 T   t        || j                        }t        || j                        }	t        j                  ||j	                  dd            |z  }
|#|d d d d d d d |j
                  d   f   }|
|z   }
t        j                  j                  |
dt        j                        j                  |j                        }
t        j                  j                  |
|| j                        }
t        j                  |
|	      }|j	                  dd      j                         }||
fS )NrW   r   rV   )rY   rB   )ptrainingr"   )r   num_key_value_groupsrM   matmulr   r^   r   
functionalsoftmaxr   rZ   rB   r   r   
contiguous)r   r   r   r   r   r   r   r   
key_statesvalue_statesattn_weightscausal_maskattn_outputs                r=   eager_attention_forwardr      s    3 ; ;<JUF$?$?@L<<z';';Aq'ABWLL!$Q1.D
0@0@0D.D%DE#k1==((2U]](SVVW\WbWbcL==((6??([L,,|\:K''1-88:K$$r<   c                   :    e Zd ZdZddeeef   dedef fdZ	 e
ddd	      	 	 dd
ej                  deej                  ej                  f   deej                     dee   deej                      dee   deej                  ej                  f   fd       Z xZS )DiaSelfAttention=Multi-headed attention from 'Attention Is All You Need' paperr*   	layer_idx	is_causalc                    t         |           || _        || _        |j                  | _        | j                  j
                  | _        | j                  j                  xs | j                  | _        | j                  | j                  z  | _        t        |d|j                  | j                  z        | _
        d| _        d| _        || _        t        j                  | j                  | j                  | j                  z  d      | _        t        j                  | j                  | j                  | j                  z  d      | _        t        j                  | j                  | j                  | j                  z  d      | _        t        j                  | j                  | j                  z  | j                  d      | _        y )Nr   r"           Frj   )rF   rG   r*   r   rK   num_attention_heads	num_headsr   r   getattrr   r   attention_dropoutr   r   rl   q_projk_projv_projo_proj)rQ   r*   r   r   rR   s       r=   rG   zDiaSelfAttention.__init__   sF   "!--88#';;#B#B#Tdnn $(NNd6N6N$N!
F4F4F$..4XY!$"ii 0 0$..4==2PW\]ii 0 0$2J2JT]]2Zafgii 0 0$2J2JT]]2Zafgii >@P@PW\]r<   past_key_valuepast_key_valuesz4.58)new_nameversionrs   position_embeddingsr   cache_positionr   rT   c                 4   |j                   d d }g |d| j                  }| j                  |      j                  |      j	                  dd      }	| j                  |      j                  |      j	                  dd      }
| j                  |      j                  |      j	                  dd      }|\  }}t        |	|
||      \  }	}
|'|||d}|j                  |
|| j                  |      \  }
}t        }| j                  j                  dk7  rt        | j                  j                     } || |	|
||f| j                  sdn| j                  | j                   d|\  }} |j"                  g |d j%                         }| j'                  |      }||fS )NrV   r"   rW   )r   r   r   eagerr   )r   r   )r^   r   r   r]   r   r   r   r   updater   r   r*   _attn_implementationr   r   r   r   r   r   r   )rQ   rs   r   r   r   r   r   input_shapehidden_shapequery_statesr   r   r   r   cache_kwargsattention_interfacer   r   s                     r=   rb   zDiaSelfAttention.forward  s    $))#2.88b8$--8{{=166|DNNqRST[[/44\BLLQPQR
{{=166|DNNqRST&S#7jRUWZ#[ j&#&snUL'6'='=j,X\XfXfht'u$J(?;;++w6"9$++:Z:Z"[$7	%
  $}}C$2H2HLL	%
 	%
!\ *k));;;;FFHkk+.L((r<   )FNN)r/   r0   r1   rc   r   r%   r$   intboolrG   r!   rM   rd   r   r   r	   
LongTensorr   r   rb   re   rf   s   @r=   r   r      s    G^u%57G%GH ^UX ^ei ^$ %0A6R ,059))||)) #5<<#=>)) !.	))
 "%)) !!1!12)) +,)) 
u||U\\)	*)) S))r<   r   c                        e Zd ZdZdedef fdZ	 	 ddej                  dej                  de	ej                     de	e
   d	ee   d
eej                  e	ej                     f   fdZ xZS )DiaCrossAttentionr   r*   r   c                 f   t         |           || _        || _        |j                  | _        |j
                  | _        | j                  j                  | _        | j                  j                  | _	        | j                  | j                  z  | _
        |j                  | _        d| _        d| _        d| _        t!        j"                  | j                  | j                  | j                  z  d      | _        t!        j"                  | j
                  | j                  | j                  z  d      | _        t!        j"                  | j
                  | j                  | j                  z  d      | _        t!        j"                  | j                  | j                  z  | j                  d      | _        y )Nr"   r   Frj   )rF   rG   r*   r   rK   cross_hidden_sizecross_num_attention_headsr   cross_num_key_value_headsr   r   cross_head_dimr   r   r   r   r   rl   r   r   r   r   rQ   r*   r   rR   s      r=   rG   zDiaCrossAttention.__init__@  s?   "!--!'!9!9>>#';;#H#H $(NNd6N6N$N!--!$ii 0 0$..4==2PW\]ii 6 68P8PSWS`S`8`glmii 6 68P8PSWS`S`8`glmii >@P@PW\]r<   rs   cross_attention_statesr   r   r   rT   c                 b   |j                   d d }g |d| j                  }g |j                   d d d| j                  }| j                  |      j                  |      j	                  dd      }	|%|j
                  j                  | j                        nd}
|]|
r[|j                  j                  | j                     j                  }|j                  j                  | j                     j                  }n| j                  |      j                  |      j	                  dd      }| j                  |      j                  |      j	                  dd      }|C|j                  j                  ||| j                        \  }}d|j
                  | j                  <   t        }| j                   j"                  dk7  rt$        | j                   j"                     } || |	|||fd| j&                  i|\  }}|j)                  g |d      j+                         }| j-                  |      }||fS )NrV   r"   rW   FTr   r   )r^   r   r   r]   r   
is_updatedr   r   cross_attention_cachelayerskeysvaluesr   r   r   r   r*   r   r   r   r   r   r   )rQ   rs   r
  r   r   r   r   r   cross_shaper   r  r   r   r   r   r   s                   r=   rb   zDiaCrossAttention.forwardS  s    $))#2.88b8$--8M.44Sb9M2Mt}}M{{=166|DNNqRSTGVGb_//33DNNChm
&:(>>EEdnnUZZJ*@@GGW^^L%;<AA+NXXYZ\]^J;;'=>CCKPZZ[\^_`L*+:+P+P+W+W NN,(
L >B**4>>:(?;;++w6"9$++:Z:Z"[$7%
 LL%
 %
!\ "))*<K*<*<=HHJkk+.L((r<   r   )r/   r0   r1   rc   r$   r   rG   rM   rd   r   r   r   r   r   rb   re   rf   s   @r=   r  r  =  s    G^/ ^C ^. 269=1)||1) !&1) !.	1)
 ""561) -.1) 
u||Xell33	41)r<   r  c                        e Zd Zdedef fdZ	 	 d
dej                  dee	ej                  ej                  f      deej                     de
e   de	ej                  eej                     f   f
d	Z xZS )r-   r*   r   c                     t         |           t        |j                  |j                        | _        t        ||d      | _        t        |j                  |j                        | _        t        |      | _
        y )Nr   Fr   )rF   rG   r{   rK   norm_epspre_sa_normr   self_attentionpost_sa_normrh   mlpr	  s      r=   rG   zDiaEncoderLayer.__init__  s\    %f&8&8fooN.vyER&v'9'9vO&>r<   rs   r   r   r   rT   c                     |}| j                  |      } | j                  |f||d|\  }}||z   }|}| j                  |      }| j                  |      }	||	z   }||fS )Nr   r   )r  r  r  r  )
rQ   rs   r   r   r   residualnormed_statesself_attn_outputself_attn_weightsmlp_outs
             r=   rb   zDiaEncoderLayer.forward  s     !((7.Ad.A.A/
 3)/
 	/
++ !#33 ))-8((=) 7*///r<   r   )r/   r0   r1   r%   r   rG   rM   rd   r   r   r   r   rb   re   rf   s   @r=   r-   r-     s    "/ "C " LP15	0||0 &eELL%,,,F&GH0 !.	0
 -.0 
u||Xell33	40r<   r-   c                        e Zd Zdef fdZee	 	 	 ddej                  de	ej                     de	e
   de	e
   dee   d	eeef   fd
              Zdeej                  df   dej                  fdZ xZS )
DiaEncoderr*   c           	         t         |   |       || _        t        j                  |j
                  |j                        | _        t        j                  t        |j                        D cg c]  }t        ||       c}      | _        t        |j                  |j                        | _        t!        |      | _        y c c}w Nr  )rF   rG   r*   r   rH   rI   rK   	embedding
ModuleListrangenum_hidden_layersr-   r  r{   r  normr   rotary_embeddingsr	  s      r=   rG   zDiaEncoder.__init__  s     f&7&79K9KLmmAFvG_G_A`aI_VY/a
 v11vG	!3F!; bs   -CNr,   r   output_attentionsoutput_hidden_statesr   rT   c                    | j                  |      }t        j                  |j                  d   |j                        d d d f   }| j                  ||      }| j                  ||      }|rdnd }	|rdnd }
| j                  D ]'  }|r|	|fz   }	 ||f||d|}|d   }|s|
|d   fz   }
) | j                  |      }|r|	|fz  }	t        ||	|
      S )NrV   r[   r;   r  r   r"   last_hidden_staters   
attentions)
r&  rM   rN   r^   r[   r+  _update_full_maskr  r*  r   )rQ   r,   r   r,  r-  r   rs   r   r   encoder_statesall_attentionsencoder_layerlayer_outputss                r=   rb   zDiaEncoder.forward  s    y1
 ||IOOB$7	@P@PQRVXYRYZ"44]LQ//

  40d![[ 	FM#!/=2B!B)$7- 	M *!,M !/=3C2E!E	F 		-0}..N+>Vd
 	
r<   inputs_embedsc                 f   || j                   j                  dk(  rd|v r|}|S d }|S | j                   j                  dk(  rt        ||j                        }|S | j                   j                  dk(  r)t	        |t
        j                        rt        |d      }|S t        ||j                        }|S )Nflash_attention_2r   sdpaflex_attentionFr  	r*   r   r   rB   r   rM   rd   r'   r   )rQ   r   r8  s      r=   r3  zDiaEncoder._update_full_mask  s    
 %{{//3FF343F  MQ  11V; "E^UbUhUh!i  115EEnell;%@[`%aN
  "<NML_L_!`r<   )NFF)r/   r0   r1   r%   rG   r   r   rM   rd   r   r   r   r   r   r   r   rb   r3  re   rf   s   @r=   r#  r#    s    	</ 	<  26,1/4.
<<.
 !..
 $D>	.

 'tn.
 -..
 
%	&.
  .
bellD01 ||r<   r#  c                   l    e Zd Zdedef fdZ	 	 	 	 	 	 ddej                  dee	ej                  ej                  f      deej                     deej                     deej                     d	ee
   d
eej                     de	ej                  eej                     eej                     f   fdZ xZS )r.   r*   r   c                    t         |           |j                  | _        t	        ||d      | _        t        ||      | _        t        |j                  |j                        | _
        t        |j                  |j                        | _        t        |j                  |j                        | _        t        |      | _        y )NTr  r  )rF   rG   rK   	embed_dimr   r  r  cross_attentionr{   r  r  pre_ca_normpre_mlp_normrh   r  r	  s      r=   rG   zDiaDecoderLayer.__init__  s    ++.vyDQ0C%f&8&8fooN%f&8&8fooN&v'9'9vO&>r<   rs   r   r   encoder_hidden_statesencoder_attention_maskr   r   rT   c                 d   |}	t        |	t              r|	j                  }	|}
| j                  |      } | j                  ||||	fd|i|\  }}|
|z   }|}
| j                  |      } | j                  ||f||d|\  }}|
|z   }|}
| j                  |      }| j                  |      }|
|z   }|||fS )Nr   )r   r   )	r   r   self_attention_cacher  r  rB  rA  rC  r  )rQ   rs   r   r   rD  rE  r   r   r   self_attn_cacher  r  r  r   cross_statescross_attn_weightsr!  s                    r=   rb   zDiaDecoderLayer.forward	  s    *o':;-BBO ((7.Ad.A.A 	/
 *	/
 	/
++ !#33 ((7+?4+?+?!,
 2+	,

 ,
(( !</ ))-8((=) 7*/1CCCr<   )NNNNNN)r/   r0   r1   r$   r   rG   rM   rd   r   r   r   r  rb   re   rf   s   @r=   r.   r.     s    "/ "C " LP158<9=9=59-D||-D &eELL%,,,F&GH-D !.	-D
  (5-D !) 6-D ""56-D !!1!12-D 
u||Xell3Xell5KK	L-Dr<   r.   c                       e Zd ZdZdef fdZee	 	 	 	 	 	 	 	 ddej                  de
ej                     de
ej                     de
ej                     d	e
ej                     d
e
e   de
e   de
e   de
ej                     deeef   fd              Zdeej                  df   d	eej                  df   dej&                  dej                  fdZ xZS )
DiaDecoderz-Transformer Decoder Stack using DenseGeneral.r*   c           	         t         |   |       |j                  | _        |j                  | _        t	        |      | _        t        |      | _        t        j                  t        |j                        D cg c]  }t        ||       c}      | _        t        |j                  |j                         | _        y c c}w r%  )rF   rG   rJ   rI   r?   
embeddingsr   r+  r   r'  r(  r)  r.   r  r{   rK   r  r*  r	  s      r=   rG   zDiaDecoder.__init__<  s     "// ++26:!3F!;mmAFvG_G_A`aI_VY/a
 v11vG	 bs   9B?Nr,   r   r   rD  rE  r   r,  r-  r   rT   c
                    |j                         dd \  }}||j                         nd}|	%t        j                  |||z   |j                        }	|	|	dddf   }| j                  |      }| j                  ||      }|1t               s'||z   }t        j                  |||j                        }t        | j                  |||	||      }| j                  |||j                  dd |      }|rdnd}|rdnd}|r|dnd}| j                  D ]7  }|r||fz  } |||||f|||	d|
}|d   }|s#||d	   fz   }|/||d   fz   }9 | j                  |      }|r||fz  }t        |||||
      S )a  
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length, num_codebooks)`):
            The original `decoder_input_ids` in 3D shape to facilitate more efficient computations.

            [What are input IDs?](../glossary#input-ids)
        NrV   r   r/  )r*   input_embedsr   r   r   r   rW   r;   )rE  r   r   r"   )r1  r   rs   r2  cross_attentions)sizeget_seq_lengthrM   rN   r[   rN  r+  r   r~   r   r*   _update_cross_attn_maskr^   r  r*  r   )rQ   r,   r   r   rD  rE  r   r,  r-  r   r   
batch_size
seq_lengthpast_key_values_lengthrs   r   mask_seq_lengthall_hidden_statesall_self_attnsall_cross_attentionslayerr7  s                         r=   rb   zDiaDecoder.forwardG  s   , "+!1#2!6
JETE`!?!?!Afg!"\\&(>(KT]TdTdN )$'2L 	2"44]LQ!*B*D4zAO"ZZ
OIL\L\]N+;;&))+%
 "&!=!=!"#	"
 #7BD0d&7<Q<]rdh[[ 	VE#!m%55!!#%		
 (> /-	 	M *!,M !/=3C2E!E(4+?=QRCSBU+U()	V, 		-0-!118+++%1
 	
r<   r   r8  c                    ||| j                   j                  dk(  rd|v r|}|S d }|S | j                   j                  dk(  rt        ||j                  |d         }|S | j                   j                  dk(  r-t	        |t
        j                        rt        ||d   d      }|S t        ||j                  |d         }|S )	Nr:  r   r;  rV   )tgt_lenr<  F)query_lengthr   r=  )rQ   rD  rE  r   r8  s        r=   rT  z"DiaDecoder._update_cross_attn_mask  s     !,1G1S{{//3FFCDH^C^)?&. &%/ ei&. &%- 11V; *M*!'''O*&$ &% 115EE4ellC-H.%0_"'.* &%	 *D*M,?,?UW*& &%r<   )NNNNNFFN)r/   r0   r1   rc   r$   rG   r   r   rM   rd   r   r  rx   r   r   r   r   r   rb   SizerT  re   rf   s   @r=   rL  rL  9  s[   7	H/ 	H  4815=A=A9=,1/459Z
<<Z
 u//0Z
 !.	Z

  ((9(9:Z
 !))9)9 :Z
 ""56Z
 $D>Z
 'tnZ
 !!1!12Z
 
8%?	@Z
  Z
z!&$U\\4%78!& !&ellD&8 9!& ZZ	!&
 ||!&r<   rL  z[
    The bare Dia model outputting raw hidden-states without any specific head on top.
    )custom_introc                   \    e Zd Zdef fdZd Zee	 	 	 	 	 	 	 	 	 	 	 ddee	j                     dee	j                     dee	j                     dee	j                     dee	j                     d	eeeef      d
ee   dee   dee   dee   dee	j                     deeef   fd              Z xZS )DiaModelr*   c                     t         |   |       || _        t        |j                        | _        t        |j                        | _        | j                          y r   )
rF   rG   r*   r#  encoder_configencoderrL  decoder_configdecoder	post_initrr   s     r=   rG   zDiaModel.__init__  sE     !&"7"78!&"7"78r<   c                     | j                   S r   )rf  r   s    r=   get_encoderzDiaModel.get_encoder  s    ||r<   r,   r   decoder_input_idsdecoder_position_idsdecoder_attention_maskencoder_outputsr   	use_cacher,  r-  r   rT   c                    ||t        d      |	|	n| j                  j                  }	|
|
n| j                  j                  }
||n| j                  j                  }| j
                  r%| j                  r|rt        j                  d       d}|r6|4t        t        | j                        t        | j                              }| | j                  d|||	|
d|}nGt        |t              s7t        |d   t        |      dkD  r|d   ndt        |      d	kD  r|d	   nd
      }|d   j                  d   d| j                  j                   j"                  }}}|9t%        j&                  |d|f| j                  j(                  | j*                        }|j,                  d	k(  r#|j/                  |||      j1                  dd	      } | j2                  d||||d   |||	|
||d
|}t5        |j6                  |j8                  |j:                  |j<                  |j>                  |d   |j:                  |j<                        S )a\  
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size * num_codebooks, target_sequence_length)
        or (batch_size, target_sequence_length, num_codebooks)`, *optional*):
            1. (batch_size * num_codebooks, target_sequence_length): corresponds to the general use case where
            the audio input codebooks are flattened into the batch dimension. This also aligns with the flat-
            tened audio logits which are used to calculate the loss.

            2. (batch_size, sequence_length, num_codebooks): corresponds to the internally used shape of
            Dia to calculate embeddings and subsequent steps more efficiently.

            If no `decoder_input_ids` are provided, it will create a tensor of `bos_token_id` with shape
            `(batch_size, 1, num_codebooks)`. Indices can be obtained using the [`DiaProcessor`]. See
            [`DiaProcessor.__call__`] for more details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)
        decoder_position_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`):
            Indices of positions of each input sequence tokens in the position embeddings.
            Used to calculate the position embeddings up to `config.decoder_config.max_position_embeddings`.

            [What are position IDs?](../glossary#position-ids)
        NzXYou should either provide text ids or the cached text encodings. Neither has been found.zZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...F)r*   )r,   r   r,  r-  r   r"   rW   r0  rV   )rR  
fill_valuer[   )
r,   r   r   rD  rE  r   r,  r-  rp  r   )r1  r   decoder_hidden_statesdecoder_attentionsrQ  encoder_last_hidden_staterD  encoder_attentionsr;   ) 
ValueErrorr*   r,  r-  rp  is_gradient_checkpointingr   loggerwarning_oncer   r
   rf  r   r   lenr^   rg  rJ   rM   fullbos_token_idr[   ndimr   r   rh  r   r1  r   rs   r2  rQ  )rQ   r,   r   rl  rm  rn  ro  r   rp  r,  r-  r   r   bszseq_lenchannelsdecoder_outputss                    r=   rb   zDiaModel.forward  so   N !8j  2C1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	))dmm##p "	01,dkk2RT`hlhshsTtuO"*dll #-"3%9	
 O O_=-"1!"4474H14Loa0RV14_1E1I?1-tO #2!"4":":1"=r4;;C]C]CjCjhW$ %

1h'DKK4L4LUYU`U`! !!Q& 1 9 9#x Q [ [\]_` a&$,, 
'-1"1!"4#1+/!5)
 
 "-??+;;"1"?"?.99,==&5a&8"1"?"?.99	
 		
r<   )NNNNNNNNNNN)r/   r0   r1   r#   rG   rk  r   r   r   rM   r  r   r   r   r   r   r   rb   re   rf   s   @r=   rc  rc    sE   y   15598<;?=ACG9=$(,0/359k
E,,-k
 !!1!12k
 $E$4$45	k

 'u'7'78k
 !))9)9 :k
 "%(>"?@k
 ""56k
 D>k
 $D>k
 'tnk
 !!1!12k
 
u((	)k
  k
r<   rc  zl
    The Dia model consisting of a (byte) text encoder and audio decoder with a prediction head on top.
    c                       e Zd ZdZdef fdZd Zd Zee		 	 	 	 	 	 	 	 	 	 	 	 dde
ej                     de
ej                     de
ej                     d	e
ej                     d
e
ej                     de
eeef      de
e   de
e   de
e   de
e   de
ej                     de
ej                     deeef   fd              Z xZS )DiaForConditionalGenerationr+   r*   c                 |   t         |   |       || _        t        |      | _        |j
                  j                  | _        |j
                  j                  | _        t        j                  |j
                  j                  | j                  | j                  z  d      | _        d| _        | j                          y )NFrj   ForMaskedLM)rF   rG   r*   rc  r+   rg  rJ   rI   r   rl   rK   logits_dense	loss_typeri  rr   s     r=   rG   z$DiaForConditionalGeneration.__init__R  s     f%
"11>> //::II!!--0A0ADOO0S[`
 ' 	r<   c                 6    | j                   j                         S r   )r+   rk  r   s    r=   rk  z'DiaForConditionalGeneration.get_encodera      zz%%''r<   c                 6    | j                   j                         S r   )r+   get_decoderr   s    r=   r  z'DiaForConditionalGeneration.get_decoderd  r  r<   r,   r   rl  rm  rn  ro  r   rp  r,  r-  labelsr   rT   c                 ^    | j                   d	|||||||||	|
|d|}|d   }|j                  d   }| j                  |      j                  |d| j                  | j
                  f      j                  dd      j                         j                  || j                  z  d| j
                        }d}|  | j                  d	||| j
                  d|}t        |||j                  |j                  |j                  |j                  |j                  |j                  |j                   	      S )
a  
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size * num_codebooks, target_sequence_length)
        or (batch_size, target_sequence_length, num_codebooks)`, *optional*):
            1. (batch_size * num_codebooks, target_sequence_length): corresponds to the general use case where
            the audio input codebooks are flattened into the batch dimension. This also aligns with the flat-
            tened audio logits which are used to calculate the loss.

            2. (batch_size, sequence_length, num_codebooks): corresponds to the internally used shape of
            Dia to calculate embeddings and subsequent steps more efficiently.

            If no `decoder_input_ids` are provided, it will create a tensor of `bos_token_id` with shape
            `(batch_size, 1, num_codebooks)`. Indices can be obtained using the [`DiaProcessor`]. See
            [`DiaProcessor.__call__`] for more details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)
        decoder_position_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`):
            Indices of positions of each input sequence tokens in the position embeddings.
            Used to calculate the position embeddings up to `config.decoder_config.max_position_embeddings`.

            [What are position IDs?](../glossary#position-ids)
        labels (`torch.LongTensor` of shape `(batch_size * num_codebooks,)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in
            `[0, ..., config.decoder_config.vocab_size - 1]` or -100. Tokens with indices set to `-100`
            are ignored (masked).
        )r,   r   rl  rm  rn  ro  r   rp  r,  r-  r   r   rV   r"   rW   N)logitsr  rI   )	lossr  r   rs  rt  rQ  ru  rD  rv  r;   )r+   r^   r  r]   rJ   rI   r   r   loss_functionr   r   rs  rt  rQ  ru  rD  rv  )rQ   r,   r   rl  rm  rn  ro  r   rp  r,  r-  r  r   r   outputsr1  rU  audio_logitsr  s                      r=   rb   z#DiaForConditionalGeneration.forwardg  sH   X $** 
)/!5#9++/!5)
 
 $AJ&,,Q/
 /0T:r4#4#4dooFGYq!_Z\T*t000"dooF 	 %4%%o\&UYUdUdohnoD#33")"?"?&99$55&-&G&G")"?"?&99

 
	
r<   )NNNNNNNNNNNN)r/   r0   r1   r3   r#   rG   rk  r  r   r   r   rM   r  r   r   r   r   r   r   rb   re   rf   s   @r=   r  r  J  sj     y ((  15598<;?=ACG9=$(,0/3-159R
E,,-R
 !!1!12R
 $E$4$45	R

 'u'7'78R
 !))9)9 :R
 "%(>"?@R
 ""56R
 D>R
 $D>R
 'tnR
 ))*R
 !!1!12R
 
uo%	&R
  R
r<   r  )rc  r)   r  )Nr"   )r   )Ntypingr   r   r   rM   r   activationsr   cache_utilsr	   r
   r   integrationsr   masking_utilsr   modeling_attn_mask_utilsr   r   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   r   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   r   r    utils.deprecationr!   configuration_diar#   r$   r%   generation_diar&   integrations.flex_attentionr'   
get_loggerr/   ry  r)   Moduler?   rh   r{   r   r   r   rd   r   r   r   r   r   r  r-   r#  r.   rL  rc  r  __all__r;   r<   r=   <module>r     sQ  , - ,   ! C C 7 / g B 9  L F &  1 L L .  !J 
		H	% 	? 	? 	?!ryy !8)RYY )$ Y'J J (J(!< !<H(6	UU\\ 	U# 	U%,, 	U& %II%<<% 
% <<	%
 U\\*% % % '(%4?)ryy ?)DG)		 G)T00 0BS# Sl8D0 8DvN&# N&b 
x
! x

x
v 
l
"46H l

l
^ Lr<   