
    hcB                    b   d Z ddlZddlmZmZ ddlZddlZddlZddlm	Z	 ddl
mZmZmZ ddlmZ ddlmZmZmZ dd	lmZ dd
lmZ ddlmZ ddlmZmZ ddlmZ ddlm Z m!Z!m"Z"m#Z#m$Z$ ddl%m&Z&m'Z' ddl(m)Z)m*Z* ddl+m,Z, ddl-m.Z.m/Z/  e*j`                  e1      Z2dZ3dejh                  de5de5fdZ6	 d~dejh                  de5deejh                     fdZ7	 	 dde8e5e5f   de9de5deejt                     de5d ejv                  fd!Z< G d" d#e      Z= G d$ d%e      Z> G d& d'e      Z? G d( d)e	j                        ZA G d* d+e	j                        ZB G d, d-e	j                        ZC G d. d/ej                  j                        ZD G d0 d1e	j                        ZE G d2 d3e	j                        ZF G d4 d5e	j                        ZG G d6 d7e	j                        ZH G d8 d9e	j                        ZI G d: d;e	j                        ZJ G d< d=e	j                        ZK G d> d?e	j                  e&      ZL G d@ dAe	j                  e&      ZM G dB dCe	j                  e&      ZN G dD dEe	j                        ZO G dF dGe	j                        ZP G dH dIe      ZQ G dJ dKe      ZRe) G dL dMe'             ZS G dN dOeS      ZT G dP dQeS      ZU G dR dSeS      ZV G dT dUeS      ZW G dV dWeS      ZX G dX dYeS      ZY G dZ d[eS      ZZ G d\ d]eS      Z[ G d^ d_e	j                        Z\ G d` dae	j                        Z] e)dbc       G dd deeS             Z^ e)dfc       G dg dheSe             Z_	 	 	 	 	 	 	 	 ddieSdej                  djeej                     deejt                     dke9dle9dme9dnee	j                     doeadpead eej                  e8ej                  ej                  f   f   fdqZb e)drc       G ds dteS             Zc e)duc       G dv dweS             Zd G dx dye	j                        Ze e)dzc       G d{ d|e'             Zfg d}Zgy)zPyTorch SpeechT5 model.    N)OptionalUnion)nn)BCEWithLogitsLossCrossEntropyLossL1Loss   )ACT2FN)CacheDynamicCacheEncoderDecoderCache)GenerationMixin)is_deepspeed_zero3_enabled)is_fsdp_managed_module)_prepare_4d_attention_mask!_prepare_4d_causal_attention_mask)GradientCheckpointingLayer)BaseModelOutput)BaseModelOutputWithPastAndCrossAttentionsSeq2SeqLMOutputSeq2SeqModelOutputSeq2SeqSpectrogramOutput)EmbeddingAccessMixinPreTrainedModel)auto_docstringlogging)deprecate_kwarg   )SpeechT5ConfigSpeechT5HifiGanConfig	input_idspad_token_iddecoder_start_token_idc                     | j                  | j                        }| ddddf   j                         |ddddf<   ||dddf<   |t        d      |j	                  |dk(  |       |S )z1
    Shift input ids one token to the right.
    Nr   r   z1self.model.config.pad_token_id has to be defined.i)	new_zerosshapeclone
ValueErrormasked_fill_)r!   r"   r#   shifted_input_idss       l/var/www/html/eduruby.in/venv/lib/python3.12/site-packages/transformers/models/speecht5/modeling_speecht5.pyshift_tokens_rightr-   7   s}     "++IOO<(CRC0668ae4adLMM""#4#<lK    input_valuesreduction_factorattention_maskc                     |dkD  r | dd|dz
  d|f   } ||dd|dz
  d|f   }| j                  | j                        }| ddddf   j                         |ddddf<   |j                  |dk(  d       ||fS )zw
    Shift input spectrograms one timestep to the right. Also applies the reduction factor to the sequence length.
    r   Nr%         Y        )r&   r'   r(   r*   )r/   r0   r1   shifted_input_valuess       r,   shift_spectrograms_rightr6   G   s     !#A'7!';'O?O'O$OP%+A/?!/C/WGW/W,WXN'11,2D2DE".q#2#v"6"<"<">AB %%&:f&DcJ//r.   r'   	mask_probmask_length	min_masksreturnc                    | \  }dk  rt        d      kD  rt        d d d      t        j                  j                  d      j	                         fd}|-|j                         j                  d      j                         nt        |      D cg c]  } c}}t        j                  |ft        	      }	g }
 |      }|d
k(  r|	S |D ]  } ||      }t        j                  j                  t        j                  |dz
  z
        |d      }t        |      d
k(  rdz
  }n|d
   }t        j                  |t        j                  ||z
  t        j                   	      |z  g      }|
j#                  |        t        j$                  |
      }
t        j&                  |
dddddf   ||f      }
|
j)                  ||z        }
t        j                        ddddf   }t        j&                  |||f      j)                  ||z        }|
|z   }
|
j+                         dz
  kD  rdz
  |
|
dz
  kD  <   t        j,                  |	|
dd       |	S c c}w )an  
    Computes random mask spans for a given shape. Used to implement [SpecAugment: A Simple Data Augmentation Method for
    ASR](https://huggingface.co/papers/1904.08779). Note that this method is not optimized to run on TPU and should be run on
    CPU as part of the preprocessing during training.

    Args:
        shape: The shape for which to compute masks. This should be of a tuple of size 2 where
               the first element is the batch size and the second element is the length of the axis to span.
        mask_prob:  The percentage of the whole axis (between 0 and 1) which will be masked. The number of
                    independently generated mask spans of length `mask_length` is computed by
                    `mask_prob*shape[1]/mask_length`. Note that due to overlaps, `mask_prob` is an upper bound and the
                    actual percentage will be smaller.
        mask_length: size of the mask
        min_masks: minimum number of masked spans
        attention_mask: A (right-padded) attention mask which independently shortens the feature axis of
                        each batch dimension.
    r   z&`mask_length` has to be bigger than 0.zO`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: z and `sequence_length`: `c                     t        | z  z  z         }t        |      }|z  kD  rz  }| dz
  z
  |k  rt        | dz
  z
  d      }|S )z;Given input length, compute how many spans should be maskedr   r   )intmax)input_lengthnum_masked_spanepsilonr8   r7   r9   sequence_lengths     r,   compute_num_masked_spanz6_compute_mask_indices.<locals>.compute_num_masked_span   so    i,6DwNOoy9 [(?:-<O ;?+o=!,+/"BAFOr.   Nr%   dtyper   F)replace)r)   nprandomranditemdetachsumtolistrangezerosboolchoicearangelenconcatenateonesint32appendarraybroadcast_toreshaper?   put_along_axis)r'   r7   r8   r1   r9   
batch_sizerD   _input_lengthsspec_aug_maskspec_aug_mask_idxsmax_num_masked_spanr@   rA   spec_aug_mask_idxdummy_mask_idxoffsetsrB   rC   s    `` `            @@r,   _compute_mask_indicesrf   ]   s   0 #(JQABB_$]^i]j&&7q:
 	
 iinnQ$$&G $ % 	##B'..0',Z'89!o9  HHj/:$GM1/Ba% 51,? II,,IIlkAo67RW - 
  !Q& -q0N.q1NNN(;o(MUWU]U] ^ao op
 	!!"34/52 "45 1a:&5H+(V ,33J@SVa@ab ii$T4]3Goog
4G'UV^^'+5G ,g5 /A"55GVYZGZ-!0CCD m%7B?w :s   $	I+c                   &     e Zd Zd fd	Zd Z xZS )SpeechT5NoLayerNormConvLayerc                 d   t         |           |dkD  r|j                  |dz
     nd| _        |j                  |   | _        t        j                  | j                  | j                  |j                  |   |j                  |   |j                        | _
        t        |j                     | _        y )Nr   r   kernel_sizestridebias)super__init__conv_dimin_conv_dimout_conv_dimr   Conv1dconv_kernelconv_stride	conv_biasconvr
   feat_extract_activation
activationselfconfiglayer_id	__class__s      r,   ro   z%SpeechT5NoLayerNormConvLayer.__init__   s    <DqL6??8a<8a"OOH5II**84%%h/!!
	 !!?!?@r.   c                 J    | j                  |      }| j                  |      }|S N)rw   ry   r{   hidden_statess     r,   forwardz$SpeechT5NoLayerNormConvLayer.forward   s$    		-06r.   r   __name__
__module____qualname__ro   r   __classcell__r~   s   @r,   rh   rh      s    Ar.   rh   c                   &     e Zd Zd fd	Zd Z xZS )SpeechT5LayerNormConvLayerc                    t         |           |dkD  r|j                  |dz
     nd| _        |j                  |   | _        t        j                  | j                  | j                  |j                  |   |j                  |   |j                        | _
        t        j                  | j                  d      | _        t        |j                     | _        y )Nr   r   rj   T)elementwise_affine)rn   ro   rp   rq   rr   r   rs   rt   ru   rv   rw   	LayerNorm
layer_normr
   rx   ry   rz   s      r,   ro   z#SpeechT5LayerNormConvLayer.__init__   s    <DqL6??8a<8a"OOH5II**84%%h/!!
	 ,,t'8'8TR !?!?@r.   c                     | j                  |      }|j                  dd      }| j                  |      }|j                  dd      }| j                  |      }|S )Nr%   )rw   	transposer   ry   r   s     r,   r   z"SpeechT5LayerNormConvLayer.forward   sV    		-0%//B76%//B76r.   r   r   r   s   @r,   r   r      s    Ar.   r   c                   &     e Zd Zd fd	Zd Z xZS )SpeechT5GroupNormConvLayerc                    t         |           |dkD  r|j                  |dz
     nd| _        |j                  |   | _        t        j                  | j                  | j                  |j                  |   |j                  |   |j                        | _
        t        |j                     | _        t        j                  | j                  | j                  d      | _        y )Nr   r   rj   T)
num_groupsnum_channelsaffine)rn   ro   rp   rq   rr   r   rs   rt   ru   rv   rw   r
   rx   ry   	GroupNormr   rz   s      r,   ro   z#SpeechT5GroupNormConvLayer.__init__  s    <DqL6??8a<8a"OOH5II**84%%h/!!
	 !!?!?@,,$2C2CRVRcRclpqr.   c                 l    | j                  |      }| j                  |      }| j                  |      }|S r   )rw   r   ry   r   s     r,   r   z"SpeechT5GroupNormConvLayer.forward  s2    		-066r.   r   r   r   s   @r,   r   r     s    r r.   r   c            	            e Zd ZdZddededee   f fdZddededee   fdZeddededee   fd       Z	 e
j                         dd	e
j                  d
efd       Z	 dd	e
j                  ded
ee   fdZ xZS )%SpeechT5SinusoidalPositionalEmbeddingzDThis module produces sinusoidal positional embeddings of any length.num_positionsembedding_dimpadding_idxc                     t         |           d| _        || _        || _        | j                  || j                  z   ||       y N   )rn   ro   offsetr   r   make_weights)r{   r   r   r   r~   s       r,   ro   z.SpeechT5SinusoidalPositionalEmbedding.__init__#  s@    *&-$++5}kRr.   num_embeddingsc                     | j                  |||      }t        | d      r;|j                  | j                  j                  | j                  j
                        }| j                  d|d       y )NweightsrF   deviceF
persistent)get_embeddinghasattrtor   rF   r   register_buffer)r{   r   r   r   emb_weightss        r,   r   z2SpeechT5SinusoidalPositionalEmbedding.make_weights*  s[    ((T4#%..t||/A/A$,,J]J].^KYFr.   c                    |dz  }t        j                  d      |dz
  z  }t        j                  t        j                  |t        j
                        j                         | z        }t        j                  | t        j
                        j                         j                  d      |j                  d      z  }t        j                  t        j                  |      t        j                  |      gd      j                  | d      }|dz  dk(  r-t        j                  |t        j                  | d      gd      }|	d||ddf<   |j                  t        j                               S )	z
        Build sinusoidal embeddings. This matches the implementation in tensor2tensor, but differs slightly from the
        description in Section 3.5 of "Attention Is All You Need".
        r   i'  r   rE   r   dimr%   N)mathlogtorchexprS   int64float	unsqueezecatsincosviewrP   r   get_default_dtype)r   r   r   half_dimembs        r,   r   z3SpeechT5SinusoidalPositionalEmbedding.get_embedding2  s    !A%hhuoA.iiXU[[AGGISDPQll>=CCEOOPQRUXUbUbcdUeeii338a@EEnVXY1!))S%++na"@AqIC""#CQvve--/00r.   r!   past_key_values_lengthc                    |j                         \  }}| j                  || j                  |      j                  |j                        }| j                  dz   |z   }|| j
                  j                  d      kD  r4| j                  || j                  z   | j                  | j                         | j
                  j                  d|j                  d            j                  ||d      j                         S )Nr   r   r%   )size"create_position_ids_from_input_idsr   r   r   r   r   r   r   index_selectr   rL   )r{   r!   r   bszseq_lenposition_idsmax_poss          r,   r   z-SpeechT5SinusoidalPositionalEmbedding.forwardD  s     ~~'W>>y$JZJZ\rsvv

 ""Q&0T\\&&q))g3T5G5GIYIYZ||((L,=,=b,ABGGWVXY``bbr.   c                     |j                  |      j                         }t        j                  |d      j	                  |      |z   |z  }|j                         |z   S )a  
        Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding
        symbols are ignored. This is modified from fairseq's `utils.make_positions`.

        Args:
            x: torch.Tensor x:
        Returns: torch.Tensor
        r   r   )ner>   r   cumsumtype_aslong)r{   r!   r   r   maskincremental_indicess         r,   r   zHSpeechT5SinusoidalPositionalEmbedding.create_position_ids_from_input_idsS  sW     ||K(,,.$||Da8@@FI__cgg"'')K77r.   r   r   )r   r   r   __doc__r>   r   ro   r   staticmethodr   r   no_gradTensorr   r   r   r   s   @r,   r   r      s    NSc S# SHUXM SG3 Gs GQYZ]Q^ G 1c 1# 1HUXM 1 1" U]]_c cs c c bc88478QYZ]Q^8r.   r   c                   $     e Zd Z fdZd Z xZS )SpeechT5PositionalConvEmbeddingc                    t         |           t        j                  |j                  |j                  |j
                  |j
                  dz  |j                        | _        t        j                  j                  }t        t        j                  j                  d      r$t        j                  j                  j                  }t               r(dd l}|j                  j                  | j                  j                   d      5   || j                  dd      | _        d d d        t        | j                  d      rU| j                  j                  j                   j"                  }| j                  j                  j                   j$                  }n,| j                  j&                  }| j                  j(                  }|j                  j+                  | |       |j                  j+                  | |       n || j                  dd      | _        t-        |j
                        | _        t0        |j2                     | _        y # 1 sw Y   'xY w)	Nr   )rk   paddinggroupsweight_normr   )modifier_rankweight)namer   parametrizations)rn   ro   r   rs   hidden_sizenum_conv_pos_embeddingsnum_conv_pos_embedding_groupsrw   utilsr   r   r   r   	deepspeedzeroGatheredParametersr   	original0	original1weight_gweight_vregister_external_parameterSpeechT5SamePadLayerr   r
   rx   ry   )r{   r|   r   r   r   r   r~   s         r,   ro   z(SpeechT5PositionalConvEmbedding.__init__f  s   II6622a777
	 hh**288,,m<((33??K%'224993C3CST2U I'		aH	Ityy"459955<<FF9955<<FF99--99--NN66tXFNN66tXF#DIIH!DDI+F,J,JK !?!?@I Is   IIc                     |j                  dd      }| j                  |      }| j                  |      }| j                  |      }|j                  dd      }|S Nr   r   )r   rw   r   ry   r   s     r,   r   z'SpeechT5PositionalConvEmbedding.forward  sV    %//15		-0]36%//15r.   r   r   s   @r,   r   r   e  s    ABr.   r   c                   *     e Zd ZdZd fd	Zd Z xZS ) SpeechT5ScaledPositionalEncodingu[   
    Scaled positional encoding, see §3.2 in https://huggingface.co/papers/1809.08895
    c                    t        j                  ||      }t        j                  d|      j                  d      }t        j                  t        j                  d|dt         j
                        j                         t        j                  d      |z   z        }t        j                  |j                         |z        |d d dd df<   t        j                  |j                         |z        |d d dd df<   |j                  d      }t        | 1          | j                  d|d       t        j                  |	      | _        || _        t        j$                  t        j&                  d
            | _        y )Nr   r   r   rE   g     @peFr   p      ?)r   rP   rS   r   r   r   r   r   r   r   r   rn   ro   r   r   Dropoutdropoutr   	Parametertensoralpha)r{   r   r   max_lenr   positiondiv_termr~   s          r,   ro   z)SpeechT5ScaledPositionalEncoding.__init__  s    [[#&<<7+55a899U\\!S!5;;GMMOTXT\T\]dTehkTkRllmii 08 ;<1add7ii 08 ;<1add7\\!_T2%8zzG,\\%,,s"34
r.   c                     || j                   | j                  d d d |j                  d      f   z  z   }| j                  |      }|S )Nr   )r   r   r   r   )r{   r   s     r,   r   z(SpeechT5ScaledPositionalEncoding.forward  sB    DJJMchhqkM)9!:::ll3
r.   )i  )r   r   r   r   ro   r   r   r   s   @r,   r   r     s    5r.   r   c                   &     e Zd Zd fd	Zd Z xZS )"SpeechT5RelativePositionalEncodingc                     t         |           || _        || _        t        j
                  j                  d|z  |      | _        y r   )rn   ro   r   
max_lengthr   r   	Embeddingpe_k)r{   r   r  r~   s      r,   ro   z+SpeechT5RelativePositionalEncoding.__init__  s8    $HH&&q:~s;	r.   c                 ~   |j                   d   }t        j                  d|      j                  |j                  t        j
                        }|d d d f   |d d d f   z
  }| j                   ||| j                   k  <   | j                  dz
  ||| j                  k\  <   || j                  z   }| j                  |      S )Nr   r   r   rF   )r'   r   rS   r   r   r   r  r  )r{   r   r   pos_seqs       r,   r   z*SpeechT5RelativePositionalEncoding.forward  s    %%a(,,q'*--]5I5IQVQ[Q[-\!T'"WT1W%55/3.>4??**+.2oo.A4??*+DOO+yy!!r.   )i  r   r   s   @r,   r  r    s    <	"r.   r  c                   $     e Zd Z fdZd Z xZS )r   c                 P    t         |           |dz  dk(  rd| _        y d| _        y )Nr   r   r   )rn   ro   num_pad_remove)r{   r   r~   s     r,   ro   zSpeechT5SamePadLayer.__init__  s)    #:Q#>!#Car.   c                 V    | j                   dkD  r|d d d d d | j                    f   }|S Nr   )r  r   s     r,   r   zSpeechT5SamePadLayer.forward  s6    ")!Q0F43F3F2F0F*FGMr.   r   r   s   @r,   r   r     s    Kr.   r   c                   .     e Zd ZdZ fdZd Zd Z xZS )SpeechT5FeatureEncoderz.Construct the features from raw audio waveformc           	         t         |           |j                  dk(  rDt        |d      gt	        |j
                  dz
        D cg c]  }t        ||dz          c}z   }nV|j                  dk(  r.t	        |j
                        D cg c]  }t        ||       }}nt        d|j                   d      t        j                  |      | _        d| _        d	| _        y c c}w c c}w )
Ngroupr   )r}   r   layerz`config.feat_extract_norm` is z), but has to be one of ['group', 'layer']FT)rn   ro   feat_extract_normr   rO   num_feat_extract_layersrh   r   r)   r   
ModuleListconv_layersgradient_checkpointing_requires_grad)r{   r|   ir  r~   s       r,   ro   zSpeechT5FeatureEncoder.__init__  s    ##w.5fqIJNSTZTrTruvTvNwNIJ,Va!eDN K %%0HMfNlNlHmCD*6A>K  01I1I0JJst  ==5&+#"Ns   C"	C'c                 J    | j                         D ]	  }d|_         d| _        y )NF)
parametersrequires_gradr  )r{   params     r,   _freeze_parametersz)SpeechT5FeatureEncoder._freeze_parameters  s(    __& 	(E"'E	(#r.   c                     |d d d f   }| j                   r| j                  rd|_        | j                  D ]
  } ||      } |S NT)r  trainingr  r  )r{   r/   r   
conv_layers       r,   r   zSpeechT5FeatureEncoder.forward  sP    $QW- 4==*.M'** 	6J&}5M	6 r.   )r   r   r   r   ro   r  r   r   r   s   @r,   r  r    s    8#&$

r.   r  c                   $     e Zd Z fdZd Z xZS )SpeechT5FeatureProjectionc                 4   t         |           t        j                  |j                  d   |j
                        | _        t        j                  |j                  d   |j                        | _	        t        j                  |j                        | _        y )Nr%   eps)rn   ro   r   r   rp   layer_norm_epsr   Linearr   
projectionr   feat_proj_dropoutr   r{   r|   r~   s     r,   ro   z"SpeechT5FeatureProjection.__init__  sf    ,,vr':@U@UV))FOOB$79K9KLzz&":":;r.   c                 p    | j                  |      }| j                  |      }| j                  |      }||fS r   )r   r*  r   )r{   r   norm_hidden_statess      r,   r   z!SpeechT5FeatureProjection.forward  s:    !__];(:;]3000r.   r   r   s   @r,   r$  r$    s    <1r.   r$  c                   6    e Zd Z fdZd Z	 	 ddej                  deej                     deej                     fdZ
dedej                  fdZd	eej                  ef   fd
Z	 	 ddej                  deej                     deej                     fdZ xZS )SpeechT5SpeechEncoderPrenetc                    t         |           || _        t        |      | _        t        |      | _        |j                  dkD  s|j                  dkD  rEt        j                  t        j                  |j                        j                               | _        t!        |      | _        t%        |j&                  |j(                  z   dz   |j                  |j(                        | _        y )Nr4   r   )rn   ro   r|   r  feature_encoderr$  feature_projectionmask_time_probmask_feature_probr   r   r   r   r   uniform_masked_spec_embedr   pos_conv_embedr   max_speech_positionsr"   pos_sinusoidal_embedr,  s     r,   ro   z$SpeechT5SpeechEncoderPrenet.__init__  s    5f=";F"C   3&&*B*BS*H%'\\%,,v?Q?Q2R2[2[2]%^D"=fE$I''&*=*==A%
!r.   c                 8    | j                   j                          y r   )r2  r  r{   s    r,   freeze_feature_encoderz2SpeechT5SpeechEncoderPrenet.freeze_feature_encoder  s    //1r.   r/   r1   mask_time_indicesc                    | j                  |      }|j                  dd      }|| j                  |j                  d   |      }| j	                  |      \  }}| j                  |||      }| j                  |      }||z   }| |j                  d      j                         }n=t        j                  |j                  d d t        j                  |j                        }| j                  |      }||z   }||fS )Nr   r   )r>  r1   r   )r2  r   "_get_feature_vector_attention_maskr'   r3  _mask_hidden_statesr8  r   r   r   rP   r   r:  )	r{   r/   r1   r>  extract_featuresr   positional_conv_embeddingpadding_mask positional_sinusoidal_embeddingss	            r,   r   z#SpeechT5SpeechEncoderPrenet.forward  s     //=+55a;%!DD &&q)N
 +/*A*ABR*S''00->~ 1 
 %)$7$7$F!%(AA%),,Q/446L ;;}':':2A'>ejjYfYmYmnL+/+D+D\+R(%(HHn,,r.   feature_vector_lengthc                    |j                  d      d d df   }| j                  |      j                  t        j                        }|j
                  d   }t        j                  ||f|j                  |j                        }d|t        j                  |j
                  d   |j                        |dz
  f<   |j                  dg      j                  d      j                  dg      j                         }|S )Nr%   r   r   r   r   r   )r    _get_feat_extract_output_lengthsr   r   r   r'   rP   rF   r   rS   fliprQ   )r{   rF  r1   non_padded_lengthsoutput_lengthsr]   s         r,   r@  z>SpeechT5SpeechEncoderPrenet._get_feature_vector_attention_mask:  s     ,22r2:1b5A>>?QRUUV[V`V`a#))!,
./~7K7KTbTiTi
 uv^%9%9!%<^EZEZ[]kno]opq',,bT299"=BBB4HMMOr.   r_   c                     d }t        | j                  j                  | j                  j                        D ]  \  }} ||||      } |S )zH
        Computes the output length of the convolutional layers
        c                 >    t        j                  | |z
  |d      dz   S )Nfloor)rounding_moder   )r   div)r@   rk   rl   s      r,   _conv_out_lengthzVSpeechT5SpeechEncoderPrenet._get_feat_extract_output_lengths.<locals>._conv_out_lengthO  s"     99\K7wWZ[[[r.   )zipr|   rt   ru   )r{   r_   rR  rk   rl   s        r,   rI  z<SpeechT5SpeechEncoderPrenet._get_feat_extract_output_lengthsJ  sQ    
	\
 $'t{{'>'>@W@W#X 	QK,]KPM	Q r.   r   c                    t        | j                  dd      s|S |j                         \  }}}|)| j                  j	                  |j
                        ||<   n| j                  j                  dkD  r| j                  rt        ||f| j                  j                  | j                  j                  || j                  j                        }t        j                  ||j                  t        j                        }| j                  j	                  |j
                        ||<   | j                  j                  dkD  r| j                  rt        ||f| j                  j                  | j                  j                   | j                  j"                        }t        j                  ||j                  t        j                        }|dddf   j%                  d|d      }d||<   |S )	z
        Masks extracted features along time axis and/or along feature axis according to
        [SpecAugment](https://huggingface.co/papers/1904.08779).
        apply_spec_augmentTNr   )r7   r8   r1   r9   r  )r7   r8   r9   r%   )getattrr|   r   r7  r   rF   r4  r!  rf   mask_time_lengthmask_time_min_masksr   r   r   rQ   r5  mask_feature_lengthmask_feature_min_masksexpand)r{   r   r>  r1   r]   rC   r   mask_feature_indicess           r,   rA  z/SpeechT5SpeechEncoderPrenet._mask_hidden_statesZ  s    t{{$8$?   4A3E3E3G0
O[(/3/E/E/H/HI\I\/]M+,[[''!+ 5_-++44 KK88-++99! !&->}G[G[chcmcm n/3/E/E/H/HI\I\/]M+,;;((1,#8[)++77 KK;;++<<	$  $)<<0D]MaMainisis#t #74#@#G#GO]_#` 23M./r.   NN)r   r   r   ro   r=  r   r   r   
LongTensorFloatTensorr   r>   r@  r   rI  rA  r   r   s   @r,   r0  r0    s    
"2 6:9=	 -ll - !!1!12 - $E$5$56	 -F ]b]m]m  eEDTDTVYDY>Z & :>59	,((, $E$5$56, !!1!12	,r.   r0  c                   f     e Zd Z fdZd Z	 ddej                  deej                     fdZ xZ	S )SpeechT5SpeechDecoderPrenetc           	      X   t         |           || _        t        j                  t        |j                        D cg c]=  }t        j                  |dk(  r|j                  n|j                  |j                        ? c}      | _
        t        j                  |j                  |j                        | _        t        |j                  |j                  |j                        | _        t        j                  |j"                  |j                  z   |j                        | _        y c c}w r  )rn   ro   r|   r   r  rO   speech_decoder_prenet_layersr)  num_mel_binsspeech_decoder_prenet_unitslayersr   final_layerr   positional_dropoutr9  encode_positionsspeaker_embedding_dimspeaker_embeds_layerr{   r|   r  r~   s      r,   ro   z$SpeechT5SpeechDecoderPrenet.__init__  s    mm vBBC
 	 		+,6F''v7Y7Y66
 99V%G%GI[I[\ @%%''!

 %'IIf.J.JVM_M_._agasas$t!s   AD'c                     t        j                  |d   |      }|j                  d      j                  |j	                  d      dd      }t        j
                  |dk(  |d      dz  d|z
  z  S )Nr   r   r   )r   	bernoullir   repeatr   where)r{   inputs_embedsr   r   	all_maskss        r,   _consistent_dropoutz/SpeechT5SpeechDecoderPrenet._consistent_dropout  sd    }Q/15NN1%,,]-?-?-BAqI	{{9>=!<q@AEJJr.   r/   speaker_embeddingsc                 8   |}| j                   D ]M  }t        j                  j                   ||            }| j	                  || j
                  j                        }O | j                  |      }| j                  |      }|t        j                  j                  |      }|j                  d      j                  d|j                  d      d      }t        j                  ||gd      }t        j                  j                  | j                  |            }|S )Nr   r%   r   )rf  r   
functionalrelurs  r|   speech_decoder_prenet_dropoutrg  ri  	normalizer   r[  r   r   r   rk  )r{   r/   rt  rq  r  s        r,   r   z#SpeechT5SpeechDecoderPrenet.forward  s     %[[ 	oEMM..u]/CDM 44]DKKDmDmnM	o ((7--m<)!#!8!89K!L!3!=!=a!@!G!GML^L^_`Lace!f!II}6H&IrRMMM..t/H/H/WXMr.   r   )
r   r   r   ro   rs  r   r   r   r   r   r   s   @r,   ra  ra    s8    u,K 6:ll %U\\2r.   ra  c                   &     e Zd Zd fd	Zd Z xZS )SpeechT5BatchNormConvLayerc                 
   t         |           |dk(  r|j                  }n|j                  }||j                  dz
  k(  r|j                  }n|j                  }t        j                  |||j                  d|j                  dz
  dz  d      | _        t        j                  |      | _
        ||j                  dz
  k  rt        j                         | _        nd | _        t        j                  |j                        | _        y )Nr   r   r   F)rk   rl   r   rm   )rn   ro   rd  speech_decoder_postnet_unitsspeech_decoder_postnet_layersr   rs   speech_decoder_postnet_kernelrw   BatchNorm1d
batch_normTanhry   r   speech_decoder_postnet_dropoutr   )r{   r|   r}   rq   rr   r~   s        r,   ro   z#SpeechT5BatchNormConvLayer.__init__  s    q= --K ==Kv;;a??!..L!>>LII<<99A=!C
	 ..6f::Q>> ggiDO"DOzz&"G"GHr.   c                     | j                  |      }| j                  |      }| j                  | j                  |      }| j                  |      }|S r   )rw   r  ry   r   r   s     r,   r   z"SpeechT5BatchNormConvLayer.forward  sJ    		-06??& OOM:M]3r.   r   r   r   s   @r,   r{  r{    s    I<r.   r{  c                   ^     e Zd Z fdZdej
                  fdZdej
                  fdZ xZS )SpeechT5SpeechDecoderPostnetc           	         t         |           || _        t        j                  |j
                  |j                  |j                  z        | _        t        j                  |j
                  |j                        | _	        t        j                  t        |j                        D cg c]  }t        ||       c}      | _        y c c}w r   )rn   ro   r|   r   r)  r   rd  r0   feat_outprob_outr  rO   r~  r{  rf  rl  s      r,   ro   z%SpeechT5SpeechDecoderPostnet.__init__  s    		&"4"4f6I6IFLcLc6cd		&"4"4f6M6MNmm<A&BfBf<ghq'2h
hs   (Cr   c                    | j                  |      j                  |j                  d      d| j                  j                        }| j                  |      }| j                  |      j                  |j                  d      d      }|||fS )Nr   r%   )r  r   r   r|   rd  postnetr  )r{   r   outputs_before_postnetoutputs_after_postnetlogitss        r,   r   z$SpeechT5SpeechDecoderPostnet.forward  s~    !%}!=!B!B=CUCUVWCXZ\^b^i^i^v^v!w $-C D}-22=3E3Ea3H"M%'<fDDr.   c                     |j                  dd      }| j                  D ]
  } ||      } ||j                  dd      z   S r   )r   rf  )r{   r   layer_outputr  s       r,   r  z$SpeechT5SpeechDecoderPostnet.postnet  sI    $..q!4[[ 	/E .L	/|55a;;;r.   )	r   r   r   ro   r   r   r   r  r   r   s   @r,   r  r    s*    	
EU\\ E<U\\ <r.   r  c                   >     e Zd Z fdZdej
                  fdZ xZS )SpeechT5TextEncoderPrenetc                    t         |           || _        t        j                  |j
                  |j                  |j                        | _        t        |j                  |j                  |j                        | _        y r   )rn   ro   r|   r   r  
vocab_sizer   r"   embed_tokensr   rh  max_text_positionsri  r,  s     r,   ro   z"SpeechT5TextEncoderPrenet.__init__  se    LL):):F<N<NPVPcPcd @%%%%!
r.   r!   c                 J    | j                  |      }| j                  |      }|S r   )r  ri  )r{   r!   rq  s      r,   r   z!SpeechT5TextEncoderPrenet.forward	  s(    )))4--m<r.   )r   r   r   ro   r   r   r   r   r   s   @r,   r  r    s    
 r.   r  c                   l     e Zd Z fdZ	 	 ddej
                  deej                     dee   fdZ	 xZ
S )SpeechT5TextDecoderPrenetc                    t         |           || _        t        j                  |j
                        | _        |j                  rt        j                  |j                        nd| _        t        j                  |j                  |j                  |j                        | _        t!        |j"                  |j                  z   dz   |j                  |j                        | _        y )Nr   r   )rn   ro   r|   r   r   rh  r   scale_embeddingr   sqrtr   embed_scaler  r  r"   r  r   r  embed_positionsr,  s     r,   ro   z"SpeechT5TextDecoderPrenet.__init__  s    zz&";";<<B<R<R499V%7%78X[LL):):F<N<NPVPcPcdD%%(;(;;a? 
r.   r!   r1   past_key_valuesc                 n   |&|j                         }|j                  d|d         }nt        d      d}|5t        |t              s|d   d   j
                  d   n|j                         }| j                  ||      }| j                  |      | j                  z  }||z  }| j                  |      }||fS )Nr%   z'You have to specify `decoder_input_ids`r   r   )r   r   r)   
isinstancer   r'   get_seq_lengthr  r  r  r   )r{   r!   r1   r  input_shaper   	positionsrq  s           r,   r   z!SpeechT5TextDecoderPrenet.forward  s      #..*K!r;r?;IFGG!"& "/59  "1%++B/$335 # ((4JK	)))4t7G7GG"]3n,,r.   r]  )r   r   r   ro   r   r   r   r^  r   r   r   r   s   @r,   r  r    sD    
" 6:+/	-<<- !!1!12- "%	-r.   r  c                   J     e Zd Z fdZdej
                  fdZd Zd Z xZ	S )SpeechT5TextDecoderPostnetc                     t         |           || _        t        j                  |j
                  |j                  d      | _        y )NFrm   )rn   ro   r|   r   r)  r   r  lm_headr,  s     r,   ro   z#SpeechT5TextDecoderPostnet.__init__<  s5    yy!3!3V5F5FUSr.   r   c                 $    | j                  |      S r   r  r   s     r,   r   z"SpeechT5TextDecoderPostnet.forwardA  s    ||M**r.   c                     | j                   S r   r  r<  s    r,   get_output_embeddingsz0SpeechT5TextDecoderPostnet.get_output_embeddingsD  s     ||r.   c                     || _         y r   r  r{   new_embeddingss     r,   set_output_embeddingsz0SpeechT5TextDecoderPostnet.set_output_embeddingsI  s	    %r.   )
r   r   r   ro   r   r   r   r  r  r   r   s   @r,   r  r  ;  s#    T
+U\\ +
&r.   r  c                       e Zd ZdZ	 	 	 	 ddededee   dee   dee   dee   f fdZ e	d	d
d      	 	 	 	 	 	 	 dde
j                  dee
j                     d
ee   dee
j                     dee
j                     dee
j                     dedee
j                     dee
j                  ee
j                     ee   f   fd       Z xZS )SpeechT5Attentionz
    Multi-headed attention from 'Attention Is All You Need' paper with relative position bias (see
    https://aclanthology.org/N18-2074.pdf)
    	embed_dim	num_headsr   
is_decoderrm   	layer_idxc                    t         |           || _        || _        || _        ||z  | _        | j
                  |z  | j                  k7  rt        d| j                   d| d      | j
                  dz  | _        || _        || _	        t        j                  |||      | _        t        j                  |||      | _        t        j                  |||      | _        t        j                  |||      | _        y )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).g      r  )rn   ro   r  r  r   head_dimr)   scalingr  r  r   r)  k_projv_projq_projout_proj)r{   r  r  r   r  rm   r  r~   s          r,   ro   zSpeechT5Attention.__init__S  s     	""!Y.MMI%$..8MdnnM]$YKr3  }}d*$"ii	94@ii	94@ii	94@		)YTBr.   past_key_valuer  4.58new_nameversionr   key_value_statesr1   layer_head_maskposition_biasoutput_attentionscache_positionr:   c	                 <   |du}	|j                         \  }
}}| j                  |      | j                  z  }|St        |t              rA|j
                  j                  | j                        }|	r|j                  }n|j                  }n|}|	r|n|}|	rK|IrGj                  | j                     j                  }|j                  | j                     j                  }n| j                  |      }| j                  |      }|j                  |
d| j                   | j"                        j%                  dd      }|j                  |
d| j                   | j"                        j%                  dd      }|D|	s|nd}j'                  ||| j                  d|i      \  }}|	rd|j
                  | j                  <   |
| j                   z  d| j"                  f}|j                  |
|| j                   | j"                        j%                  dd      } |j(                  | } |j(                  | } |j(                  | }|j                  d      }t+        j,                  ||j%                  dd            }|j                         |
| j                   z  ||fk7  r/t/        d|
| j                   z  ||f d|j                                ||j1                         j                  |
| j                   z  d| j"                        j%                  d	d      }t+        j2                  ||j%                  d
d            }|j%                  d	d      j                  |
| j                   z  |j                  d	      |j                  d            }||z  }|{|j                         |
d||fk7  r#t/        d|
d||f d|j                                |j                  |
| j                   ||      |z   }|j                  |
| j                   z  ||      }t4        j6                  j9                  |d      }||j                         | j                   fk7  r*t/        d| j                   f d|j                                |j                  dddd      |j                  |
| j                   ||      z  }|j                  |
| j                   z  ||      }|r?|j                  |
| j                   ||      }|j                  |
| j                   z  ||      }nd}t4        j6                  j;                  || j:                  | j<                        }t+        j,                  ||      }|j                         |
| j                   z  || j"                  fk7  r7t/        d|
| j                   || j"                  f d|j                                |j                  |
| j                   || j"                        }|j%                  dd      }|j)                  |
|| j>                        }| jA                  |      }||fS )z#Input shape: Batch x Time x ChannelNr%   r   r   r  Tz$Attention weights should be of size z	, but is r   r   z!Attention mask should be of size r   z/Head mask for a single layer should be of size )r   r!  z `attn_output` should be of size )!r   r  r  r  r   
is_updatedgetr  cross_attention_cacheself_attention_cacherf  keysvaluesr  r  r   r  r  r   updater[   r   bmmr)   
contiguousmatmulr   rv  softmaxr   r!  r  r  )r{   r   r  r  r1   r  r  r  r  is_cross_attentionr   tgt_lenr^   query_statesr  curr_past_key_valuecurrent_states
key_statesvalue_states
proj_shapesrc_lenattn_weights	reshape_qrel_pos_biasattn_weights_reshaped
attn_probsattn_outputs                              r,   r   zSpeechT5Attention.forwardp  s     .T9',,.Wa {{=1DLL@&/+>?,77;;DNNK
%*9*O*O'*9*N*N'&5#-?)]/"=*,33DNNCHHJ.55dnnELLL^4J;;~6L#b$..$--PZZ[\^_`J',,S"dnndmmT^^_`bcdL*7It+>+E+Ednn?OQ_>`,(
L &AEO..t~~>DNN*B>
#((gt~~t}}U__`acde+|++Z8'Z''4
+|++Z8//!$yyz/C/CAq/IJ3#7'"JJ6dnn8LgW^7_6` a %%'(*  $$//166sT^^7KRQUQ^Q^_iijkmnoI <<	=3J3J2r3RSL'11!Q7<<dnn$m&8&8&;]=O=OPQ=RL L(L%""$a'(BB 7a'8R7SS\]k]p]p]r\st  (,,S$..'7SVddL',,S4>>-A7GTL}},,\r,B&##%$..):: Et~~FWEX Y',,./1  +//2q!<|?P?PQTVZVdVdfmov?wwL',,S4>>-A7GTL
 %1$5$5c4>>7T[$\!055cDNN6JGU\]L$(!]]**<4<<RVR_R_*`
ii
L9#"6!OO2CRVR_R_3`2a b$$&') 
 "&&sDNNGT]]S!++Aq1 "))#wGmmK0111r.   )r4   FTN)NNNNNFN)r   r   r   r   r>   r   r   rQ   ro   r   r   r   r   tupler   r   r   s   @r,   r  r  M  sX    $'%*#$(CC C %	C
 TNC tnC D>C: %0A6R 48+/152604"'15~2||~2 #5<<0~2 "%	~2
 !.~2 "%,,/~2  -~2  ~2 !.~2 
u||Xell3Xe_D	E~2 S~2r.   r  c                   $     e Zd Z fdZd Z xZS )SpeechT5FeedForwardc                    t         |           t        j                  |j                        | _        t        j                  |j                  |      | _        t        |j                  t              rt        |j                     | _        n|j                  | _        t        j                  ||j                        | _        t        j                  |j                        | _        y r   )rn   ro   r   r   activation_dropoutintermediate_dropoutr)  r   intermediate_denser  
hidden_actstrr
   intermediate_act_fnoutput_densehidden_dropoutoutput_dropout)r{   r|   intermediate_sizer~   s      r,   ro   zSpeechT5FeedForward.__init__  s    $&JJv/H/H$I!"$))F,>,>@Q"Rf''-'-f.?.?'@D$'-'8'8D$II&79K9KL jj)>)>?r.   c                     | j                  |      }| j                  |      }| j                  |      }| j                  |      }| j	                  |      }|S r   )r  r  r  r  r  r   s     r,   r   zSpeechT5FeedForward.forward   sX    //>00?11-@))-8++M:r.   r   r   s   @r,   r  r    s    @r.   r  c                        e Zd Zdef fdZ	 	 	 	 d	dej                  deej                     deej                     deej                     def
dZ	 xZ
S )
SpeechT5EncoderLayerr|   c                    t         |           t        |j                  |j                  |j
                  d      | _        t        j                  |j                        | _
        t        j                  |j                  |j                        | _        t        ||j                        | _        t        j                  |j                  |j                        | _        y )NF)r  r  r   r  r&  )rn   ro   r  r   encoder_attention_headsattention_dropout	attentionr   r   r  r   r   r(  r   r  encoder_ffn_dimfeed_forwardfinal_layer_normr,  s     r,   ro   zSpeechT5EncoderLayer.__init__  s    *((44,,	
 zz&"7"78,,v'9'9v?T?TU/8N8NO "V-?-?VEZEZ [r.   r   r1   r  r  r  c                     |}| j                  |||||      \  }}| j                  |      }||z   }| j                  |      }|| j                  |      z   }| j	                  |      }|f}|r||fz  }|S )as  
        Args:
            hidden_states (`torch.FloatTensor`):
                input to the layer of shape `(batch, seq_len, hidden_size)`
            attention_mask (`torch.FloatTensor`):
                attention mask of size `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very
                large negative values.
            layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
                `(config.encoder_attention_heads,)`.
            position_bias (`torch.FloatTensor`):
                relative position embeddings of size `(seq_len, seq_len, hidden_size // encoder_attention_heads)`
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        )r   r1   r  r  r  )r  r   r   r  r  )	r{   r   r1   r  r  r  residualr  outputss	            r,   r   zSpeechT5EncoderLayer.forward  s    . !&*nn')+'/ '5 '
#| ]3 =06%(9(9-(HH--m< "&Gr.   )NNNF)r   r   r   r   ro   r   r   r   rQ   r   r   r   s   @r,   r  r  
  ss    \~ \  262604"',||, !., "%,,/	,
  -,  ,r.   r  c                   H    e Zd Zddef fdZ eddd      	 	 	 	 	 	 	 	 	 ddej                  deej                     d	eej                     d
eej                     deej                     deej                     dee	   dee
   dee
   deej                     fd       Z xZS )SpeechT5DecoderLayerr|   c                    t         |           t        |j                  |j                  |j
                  d|      | _        t        j                  |j                        | _
        t        j                  |j                  |j                        | _        t        |j                  |j                  |j
                  d|      | _        t        j                  |j                  |j                        | _        t!        ||j"                        | _        t        j                  |j                  |j                        | _        y )NT)r  r  r   r  r  r&  )r   r  r  )rn   ro   r  r   decoder_attention_headsr  	self_attnr   r   r  r   r   r(  self_attn_layer_normencoder_attnencoder_attn_layer_normr  decoder_ffn_dimr  r  )r{   r|   r  r~   s      r,   ro   zSpeechT5DecoderLayer.__init__H  s    *((44,,
 zz&"7"78$&LL1C1CI^I^$_!-**,,
 (*||F4F4FFLaLa'b$/8N8NO "V-?-?VEZEZ [r.   r  r  r  r  r   r1   encoder_hidden_statesencoder_attention_maskr  cross_attn_layer_head_maskr  	use_cacher  c           	      ~   |}| j                  ||||||
      \  }}| j                  |      }||z   }| j                  |      }d}|D|}| j                  |||||||
      \  }}| j                  |      }||z   }| j	                  |      }|| j                  |      z   }| j                  |      }|f}|r|||fz  }|S )a  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, hidden_size)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            encoder_hidden_states (`torch.FloatTensor`):
                cross attention input to the layer of shape `(batch, seq_len, hidden_size)`
            encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
                `(encoder_attention_heads,)`.
            cross_attn_layer_head_mask (`torch.FloatTensor`): mask for cross-attention heads in a given layer of
                size `(decoder_attention_heads,)`.
            past_key_values (`Tuple(torch.FloatTensor)`): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        )r   r  r1   r  r  r  N)r   r  r1   r  r  r  r  )r  r   r  r  r  r  r  )r{   r   r1   r  r	  r  r
  r  r  r  r  r  self_attn_weightscross_attn_weightsr  s                  r,   r   zSpeechT5DecoderLayer.forward`  s   @ ! ,0>>'+)+/) ,: ,
(( ]3 =011-@ " ,$H040A0A+!65 : /"3- 1B 1-M- !LL7M$}4M 88GM &(9(9-(HH--m< ")+=>>Gr.   r   )	NNNNNNFTN)r   r   r   r   ro   r   r   r   r   r   rQ   r   r   r   s   @r,   r   r   G  s   \~ \0 %0A6R 268<9=26=A+/,1$(15I||I !.I  (5	I
 !) 6I "%,,/I %-U\\$:I "%I $D>I D>I !.I SIr.   r   c                   D    e Zd ZU eed<   dZdZdZdej                  fdZ
y)SpeechT5PreTrainedModelr|   speecht5r/   Tmodulec           
      F   | j                   j                  }t        |t              rt        j
                  j                  |j                  j                  ddt        j                  d|j                  j                  d   |j                  j                  z  z        z         t        j
                  j                  |j                  j                  d       nt        |t              r'|j                   j"                  j%                  d       nt        |t&              rt        j                  d|j(                  j*                  z        }t        j
                  j-                  |j(                  j                  | |       t        j
                  j-                  |j(                  j                  | |       n/t        |t        j.                        rZ|j                  j"                  j                  d|       |j                  |j                  j"                  j1                          nt        |t        j2                  t        j4                  t        j6                  f      rK|j                  j"                  j1                          |j                  j"                  j%                  d       n7t        |t        j8                        rt        j
                  j;                  |j                         |j                  t        j                  |j<                  |j                  |j                  d   z  z        }t        j
                  j-                  |j                  | |       n~t        |t        j>                        rd|j                  j"                  j                  d|       |j@                  1|j                  j"                  |j@                     j1                          tC        |d	      r*t        j
                  j-                  |jD                         yy)
zInitialize the weightsr   r   r   meanstdr   )abr4   Nr7  )#r|   initializer_ranger  r   r   initnormal_rw   r   r   r  rk   in_channels	constant_rm   r   r   datafill_r$  r*  in_featuresr6  r)  zero_r   r   r  rs   kaiming_normal_r   r  r   r   r7  )r{   r  r  ks       r,   _init_weightsz%SpeechT5PreTrainedModel._init_weights  s   kk++f=>GGOO""		!v{{'>'>q'AFKKD[D['["\]]  
 GGfkk..2 @ALL##C( 9:		!f//;;;<AGGV..55!qAGGV..33rQ?		*MM&&CS&9{{&  &&(r||R^^ LMKK""$MM$$S)		*GG##FMM2{{&IIfmmv/A/AFDVDVWXDY/YZ[  a 8-MM&&CS&9!!-""6#5#56<<>6./GGV556 0r.   N)r   r   r   r   __annotations__base_model_prefixmain_input_namesupports_gradient_checkpointingr   Moduler$   r.   r,   r  r    s)    "$O&*#"7BII "7r.   r  c                        e Zd ZdZdef fdZ	 	 	 	 	 ddej                  deej                     deej                     dee
   dee
   d	ee
   d
eeef   fdZ xZS )SpeechT5Encoderzu
    Transformer encoder consisting of *config.encoder_layers* layers. Each layer is a [`SpeechT5EncoderLayer`].
    r|   c                    t         |   |       t        j                  |j                  |j
                        | _        t        j                  |j                        | _	        |j                  | _        t        j                  t        |j                        D cg c]  }t        |       c}      | _        t#        |j                  |j$                  z  |j&                        | _        d| _        | j-                          y c c}w )Nr&  F)rn   ro   r   r   r   r(  r   r   r  r   encoder_layerdrop	layerdropr  rO   encoder_layersr  rf  r  r  encoder_max_relative_positionr  r  	post_init)r{   r|   r^   r~   s      r,   ro   zSpeechT5Encoder.__init__  s     ,,v'9'9v?T?TUzz&"7"7811mm5QWQfQfKg$ha%9&%A$hiA&"@"@@&BfBf 
 ',# 	 %is   Dr   r1   	head_maskr  output_hidden_statesreturn_dictr:   c                    ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }|t	        ||j
                        }| j                  |      }| j                  |      }| j                  |      }t               xs t        |       }|rdnd}	|rdnd}
|_|j                         d   t        | j                        k7  r6t        dt        | j                         d|j                         d    d      t        | j                        D ]l  \  }}|r|	|fz   }	d}| j                   r$t#        j$                  g       }|| j&                  k  }|r|r |||||||   nd|      }|d   }|rd	}|sd|
d
   fz   }
n |r|	|fz   }	|st)        d ||	|
fD              S t+        ||	|
      S )a  
        Args:
            hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, feature_size)`):
                Features extracted from the speech or text input by the encoder prenet.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing convolution and attention on padding token indices. Mask values selected in
                `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
                Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:

                - 1 indicates the head is **not masked**,
                - 0 indicates the head is **masked**.

            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        Nr*  r   z&The head_mask should be specified for  layers, but it is for .F)r1   r  r  r  r]  r   c              3   &   K   | ]	  }||  y wr   r*  .0vs     r,   	<genexpr>z*SpeechT5Encoder.forward.<locals>.<genexpr>O  s     mq_`_lms   last_hidden_stater   
attentions)r|   r  r4  use_return_dictr   rF   r   r   r  r   r   r   rT   rf  r)   	enumerater!  r   rJ   r/  r  r   )r{   r   r1   r3  r  r4  r5  r  synced_gpusall_hidden_statesall_self_attentionsidxencoder_layerskip_the_layerdropout_probabilitylayer_outputss                   r,   r   zSpeechT5Encoder.forward  s   H 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B] %7H[H[\N6]3,,];02R6LT6R"6BD$5b4  ~~"c$++&66 <S=M<N O!(+,A/ 
 #,DKK"8 	PC#$58H$H! #N}}&+jjn#!4t~~!E![ -!#1"/7@7LYs^RV&7! !.a 0 , &9]1=M<O&O#3	P6   1]4D Dm]4EGZ$[mmm++*
 	
r.   NNNNNr   r   r   r   r   ro   r   r_  r   r   rQ   r   r  r   r   r   r   s   @r,   r,  r,    s    ~ ( 26,0,0/3&*f
((f
 !.f
 ELL)	f

 $D>f
 'tnf
 d^f
 
uo%	&f
r.   r,  c                        e Zd ZdZdef fdZ	 	 	 	 	 ddej                  deej                     deej                     dee
   dee
   d	ee
   d
eeef   fdZ xZS )SpeechT5EncoderWithSpeechPrenetz
    Wrapper around SpeechT5Encoder that applies SpeechT5SpeechEncoderPrenet to convert the audio waveform data to
    hidden features.
    r|   c                     t         |   |       t        |      | _        t	        |      | _        | j                          y r   )rn   ro   r0  prenetr,  wrapped_encoderr2  r,  s     r,   ro   z(SpeechT5EncoderWithSpeechPrenet.__init__^  5     1&9.v6 	r.   r/   r1   r3  r  r4  r5  r:   c                 ^    | j                  ||      \  }}| j                  ||||||      }|S N)r   r1   r3  r  r4  r5  rP  rQ  	r{   r/   r1   r3  r  r4  r5  r   r  s	            r,   r   z'SpeechT5EncoderWithSpeechPrenet.forwardf  sG     )-L.(Q%~&&')/!5# ' 
 r.   rK  rL  r   s   @r,   rN  rN  X  s    
~  26,0,0/3&*'' !. ELL)	
 $D> 'tn d^ 
uo%	&r.   rN  c                        e Zd ZdZdef fdZd Zd Z	 	 	 	 	 ddej                  de
ej                     de
ej                     d	e
e   d
e
e   de
e   deeef   fdZ xZS )SpeechT5EncoderWithTextPrenetz|
    Wrapper around SpeechT5Encoder that applies SpeechT5TextEncoderPrenet to convert the input_ids to hidden features.
    r|   c                     t         |   |       t        |      | _        t	        |      | _        | j                          y r   )rn   ro   r  rP  r,  rQ  r2  r,  s     r,   ro   z&SpeechT5EncoderWithTextPrenet.__init__  5     /7.v6 	r.   c                 6    | j                   j                         S r   rP  get_input_embeddingsr<  s    r,   r]  z2SpeechT5EncoderWithTextPrenet.get_input_embeddings      {{//11r.   c                 :    | j                   j                  |       y r   rP  set_input_embeddingsr{   values     r,   ra  z2SpeechT5EncoderWithTextPrenet.set_input_embeddings      ((/r.   r/   r1   r3  r  r4  r5  r:   c                 V    | j                  |      }| j                  ||||||      }|S rT  rU  rV  s	            r,   r   z%SpeechT5EncoderWithTextPrenet.forward  s@     L1&&')/!5# ' 
 r.   rK  )r   r   r   r   r   ro   r]  ra  r   r_  r   r   rQ   r   r  r   r   r   r   s   @r,   rX  rX  }  s    ~ 20 26,0,0/3&*'' !. ELL)	
 $D> 'tn d^ 
uo%	&r.   rX  c                        e Zd ZdZdef fdZ	 	 	 	 	 ddej                  deej                     deej                     dee
   dee
   d	ee
   d
eeef   fdZ xZS )SpeechT5EncoderWithoutPrenet
    This wrapper class is a helper class to correctly load pretrained checkpoints when used in combination with
    [`SpeechT5Model`].
    r|   c                 d    t         |   |       t        |      | _        | j	                          y r   )rn   ro   r,  rQ  r2  r,  s     r,   ro   z%SpeechT5EncoderWithoutPrenet.__init__  )     .v6 	r.   r/   r1   r3  r  r4  r5  r:   c                 0    | j                  ||||||      S rT  )rQ  )r{   r/   r1   r3  r  r4  r5  s          r,   r   z$SpeechT5EncoderWithoutPrenet.forward  s.     ##&)/!5# $ 
 	
r.   rK  rL  r   s   @r,   rg  rg    s    
~  26,0,0/3&*
''
 !.
 ELL)	

 $D>
 'tn
 d^
 
uo%	&
r.   rg  c                   v    e Zd ZdZdef fdZ	 	 	 	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     deej                     deej                     d	eej                     d
eeej                        dee   dee   dee   dee   deej                     deeef   fdZ xZS )SpeechT5Decoderzt
    Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`SpeechT5DecoderLayer`]
    r|   c           	      
   t         |   |       |j                  | _        t	        j
                  t        |j                        D cg c]  }t        ||       c}      | _	        d| _
        | j                          y c c}w )N)r  F)rn   ro   decoder_layerdropr/  r   r  rO   decoder_layersr   rf  r  r2  rl  s      r,   ro   zSpeechT5Decoder.__init__  sh     11mmX]^d^s^sXt$uST%9&A%N$uv&+# 	 %vs   B r   r1   r  r	  r3  cross_attn_head_maskr  r  r  r4  r5  r  r:   c                    |	|	n| j                   j                  }	|
|
n| j                   j                  }
||n| j                   j                  }||n| j                   j                  }|j                         dd }| j                  r%| j                  r|rt        j                  d       d}|r6|4t        t        | j                         t        | j                               }|r:t        |t              r*t        j                  d       t        j                  |      }||j                         nd}t!        ||||      }||t#        ||j$                  |d         }t'               xs t)        |       }|
rd	nd}|	rd	nd}|	r|d	nd}t+        ||gd
dg      D ]j  \  }}|	|j                         d   t-        | j.                        k7  s3t1        d| dt-        | j.                         d|j                         d    d       t3        | j.                        D ]  \  }}|
r||fz   }d}| j                  r$t5        j6                  g       }|| j8                  k  }|r|sE ||||||||   nd|||   nd||	||
      }|d   }|	sm||d   fz   }|y||d   fz   } |
r||fz   }|st        d |||||fD              S t;        |||||      S )a  
        Args:
            hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, feature_size)`):
                Features extracted from the speech or text input by the decoder prenet.
            attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
                Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
                of the decoder.
            encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*):
                Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values
                selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
                Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:

                - 1 indicates the head is **not masked**,
                - 0 indicates the head is **masked**.

            cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
                Mask to nullify selected heads of the cross-attention modules in the decoder to avoid performing
                cross-attention on hidden heads. Mask values selected in `[0, 1]`:

                - 1 indicates the head is **not masked**,
                - 0 indicates the head is **masked**.

            past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
                Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
                shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of
                shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.

                Contains pre-computed hidden-states (key and values in the self-attention blocks and in the
                cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.

                If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those
                that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of
                all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        Nr%   zZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...F)r|   zPassing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.58.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.r   )r  r*  r3  rq  zThe `z` should be specified for r7  r8  )r	  r  r
  r  r  r  r  r   r   c              3   $   K   | ]  }|| 
 y wr   r*  r:  s     r,   r=  z*SpeechT5Decoder.forward.<locals>.<genexpr>{  s      = s   )r?  r  r   r@  cross_attentions)r|   r  r4  r  rA  r   r  r!  loggerwarning_oncer   r   r  r  from_legacy_cacher  r   r   rF   r   r   rS  rT   rf  r)   rB  r   rJ   r/  r   )r{   r   r1   r  r	  r3  rq  r  r  r  r4  r5  r  r  r   rC  rD  rE  all_cross_attentions	attn_mask	mask_namerF  decoder_layerrH  rI  rJ  s                             r,   r   zSpeechT5Decoder.forward  sc   T 2C1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	%0%<k$++B]B]#((*3B/&&4==##p "	01,dkk2RT`hlhshsTtuOOU;\
 2CCOTOETE`!?!?!Afg:K8N

 !,1G1S%?&(;(;[QS_&" 12R6LT6R #7BD$5b4&7<Q<]rdh %(4H(IKYoKp$q 	 Iy$>>#A&3t{{+;<$	{*DSEUDV W%NN,Q/03 	 #,DKK"8 	VC#$58H$H! #N}}&+jjn#!4t~~!Ek)%'=3<3H3dI]Ii,@,Eos /"3#-M *!,M &9]1=M<O&O#(4+?=QRCSBU+U(;	V>   1]4D D ':KM`bvw   9+++*1
 	
r.   NNNNNNNNNNNNr   r   r   r   r   ro   r   r   r_  r^  r   listrQ   r   r  r   r   r   r   s   @r,   rm  rm    s?   	~ 	 6:59=A=A,07;=A$(,0/3&*15p
 1 12p
 !!1!12p
  ((9(9:	p

 !))9)9 :p
 ELL)p
 'u||4p
 "$u'8'8"9:p
 D>p
 $D>p
 'tnp
 d^p
 !.p
 
u??	@p
r.   rm  c                       e Zd ZdZdef fdZ	 	 	 	 	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     deej                     deej                     d	eej                     d
eej                     deeej                        dee   dee   dee   dee   deej                     deeef   fdZ xZS )SpeechT5DecoderWithSpeechPrenetz
    Wrapper around SpeechT5Decoder that applies SpeechT5SpeechDecoderPrenet to convert log-mel filterbanks to hidden
    features.
    r|   c                     t         |   |       t        |      | _        t	        |      | _        | j                          y r   )rn   ro   ra  rP  rm  wrapped_decoderr2  r,  s     r,   ro   z(SpeechT5DecoderWithSpeechPrenet.__init__  rR  r.   r/   r1   r  r	  rt  r3  rq  r  r  r  r4  r5  r  r:   c                 d    | j                  ||      }| j                  ||||||||	|
|||      }|S N)r   r1   r  r	  r3  rq  r  r  r  r4  r5  r  rP  r  )r{   r/   r1   r  r	  rt  r3  rq  r  r  r  r4  r5  r  decoder_hidden_statesr  s                   r,   r   z'SpeechT5DecoderWithSpeechPrenet.forward  sV      !%L:L M&&/)"7#9!5+/!5#) ' 
 r.   )NNNNNNNNNNNNNr}  r   s   @r,   r  r    sG   
~  5959=A=A59,07;=A$(,0/3&*15!u001! !!1!12!  ((9(9:	!
 !))9)9 :! %U\\2! ELL)! 'u||4! "$u'8'8"9:! D>! $D>! 'tn! d^! !.! 
u??	@!r.   r  c                       e Zd ZdZdef fdZd Zd Z	 	 	 	 	 	 	 	 	 	 	 	 ddee	j                     dee	j                     dee	j                     d	ee	j                     d
ee	j                     dee	j                     deee	j                        dee   dee   dee   dee   dee	j                     deeef   fdZ xZS )SpeechT5DecoderWithTextPrenetz{
    Wrapper around SpeechT5Decoder that applies SpeechT5TextDecoderPrenet to convert input tokens to hidden features.
    r|   c                     t         |   |       t        |      | _        t	        |      | _        | j                          y r   )rn   ro   r  rP  rm  r  r2  r,  s     r,   ro   z&SpeechT5DecoderWithTextPrenet.__init__  rZ  r.   c                 6    | j                   j                         S r   r\  r<  s    r,   r]  z2SpeechT5DecoderWithTextPrenet.get_input_embeddings  r^  r.   c                 :    | j                   j                  |       y r   r`  rb  s     r,   ra  z2SpeechT5DecoderWithTextPrenet.set_input_embeddings  rd  r.   r/   r1   r  r	  r3  rq  r  r  r  r4  r5  r  r:   c                 l    | j                  |||      \  }}| j                  |||||||||	|
||      }|S r  r  )r{   r/   r1   r  r	  r3  rq  r  r  r  r4  r5  r  r  r  s                  r,   r   z%SpeechT5DecoderWithTextPrenet.forward  s]     15L.Zi0j-~&&/)"7#9!5+/!5#) ' 
 r.   r|  )r   r   r   r   r   ro   r]  ra  r   r   r_  r^  r   r~  rQ   r   r  r   r   r   r   s   @r,   r  r    s;   ~ 20
 5959=A=A,07;=A$(,0/3&*15 u001  !!1!12   ((9(9:	 
 !))9)9 :  ELL)  'u||4  "$u'8'8"9:  D>  $D>  'tn  d^  !.  
u??	@ r.   r  c                   v    e Zd ZdZdef fdZ	 	 	 	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     deej                     deej                     d	eej                     d
eeej                        dee   dee   dee   dee   deej                     deeef   fdZ xZS )SpeechT5DecoderWithoutPrenetrh  r|   c                 d    t         |   |       t        |      | _        | j	                          y r   )rn   ro   rm  r  r2  r,  s     r,   ro   z%SpeechT5DecoderWithoutPrenet.__init__  rj  r.   r/   r1   r  r	  r3  rq  r  r  r  r4  r5  r  r:   c                 @    | j                  |||||||||	|
||      }|S r  )r  )r{   r/   r1   r  r	  r3  rq  r  r  r  r4  r5  r  r  s                 r,   r   z$SpeechT5DecoderWithoutPrenet.forward  sD     &&&)"7#9!5+/!5#) ' 
 r.   r|  r}  r   s   @r,   r  r    s1   
~  5959=A=A,07;=A$(,0/3&*15u001 !!1!12  ((9(9:	
 !))9)9 : ELL) 'u||4 "$u'8'8"9: D> $D> 'tn d^ !. 
u??	@r.   r  c                        e Zd ZdZdef fdZdej                  dej                  dej                  dej                  fdZ
d	 Zed
        Z xZS )$SpeechT5GuidedMultiheadAttentionLossz
    Guided attention loss from the paper [Efficiently Trainable Text-to-Speech System Based on Deep Convolutional
    Networks with Guided Attention](https://huggingface.co/papers/1710.08969), adapted for multi-head attention.
    r|   c                 f    t         |           |j                  | _        |j                  | _        y r   )rn   ro   guided_attention_loss_sigmasigmaguided_attention_loss_scalescaler,  s     r,   ro   z-SpeechT5GuidedMultiheadAttentionLoss.__init__%  s(    77
77
r.   r@  input_masksoutput_masksr:   c                 F   | j                  |||j                        }|j                  d      |j                  d      z  }|j                  |j                        j                  d      }||z  }t	        j
                  |j                  |            }| j                  |z  S )aY  
        Compute the attention loss.

        Args:
            attentions (`torch.FloatTensor` of shape `(batch_size, layers * heads, output_sequence_length, input_sequence_length)`):
                Batch of multi-head attention weights
            input_masks (`torch.BoolTensor` of shape `(batch_size, input_sequence_length)`):
                Input attention mask as booleans.
            output_masks (`torch.BoolTensor` of shape `(batch_size, output_sequence_length)`):
                Target attention mask as booleans.

        Returns:
            `torch.Tensor` with the loss value
        r%   r   r   )_make_guided_attention_masksr   r   r   r   r  masked_selectr  )r{   r@  r  r  guided_attn_masksmaskslosseslosss           r,   r   z,SpeechT5GuidedMultiheadAttentionLoss.forward*  s    " !==k<YcYjYjk&&r*[-B-B2-FF**+55a8"Z/zz&..u56zzD  r.   c                 r   |j                  d      }|j                  d      }t        j                  t        |      |j                  d   |j                  d   f|      }t        t        ||            D ]0  \  }\  }}	| j                  ||	| j                  |      ||d |	d |f<   2 |j                  d      S )Nr%   r   rH  )
rM   r   rP   rT   r'   rB  rS  _make_guided_attention_maskr  r   )
r{   r  r  r   r_   rL  r  rF  ilenolens
             r,   r  zASpeechT5GuidedMultiheadAttentionLoss._make_guided_attention_masksC  s    #+%))"-!KK[)9<;M;Ma;PR]RcRcdeRf(gpvw!*3}n+M!N 	tC$373S3STXZ^`d`j`jlr3sc5D5%4%/0	t !**1--r.   c                 (   t        j                  t        j                  | |      t        j                  ||      d      \  }}|j                         |z  }|j                         | z  }dt        j                  ||z
  dz   d|dz  z  z        z
  S )NrH  xy)indexingr   r   )r   meshgridrS   r   r   )r@   output_lengthr  r   grid_ygrid_xs         r,   r  z@SpeechT5GuidedMultiheadAttentionLoss._make_guided_attention_maskN  s    LLf5LLv6

 -/,.UYY&6/a!78ANKLLLr.   )r   r   r   r   r   ro   r   r_  
BoolTensorr   r   r  r   r  r   r   s   @r,   r  r    sj    
8~ 8
!++!:?:J:J!Z_ZjZj!	!2	. M Mr.   r  c                        e Zd ZdZdef fdZ	 ddej                  dej                  dej                  dej                  dej                  d	e	ej                     d
ej                  fdZ xZS )SpeechT5SpectrogramLossz;
    Loss computation used by SpeechT5ForTextToSpeech.
    r|   c                 (   t         |           |j                  | _        |j                  | _        |j                  | _        t               | _        t        t        j                  d            | _
        | j                  rt        |      | _        y y )Ng      @)
pos_weight)rn   ro   use_guided_attention_lossguided_attention_loss_num_headsr0   r   l1_criterionr   r   r   bce_criterionr  attn_criterionr,  s     r,   ro   z SpeechT5SpectrogramLoss.__init___  ss    )/)I)I&/5/U/U, & 7 7"H.%,,s:KL))"Fv"ND *r.   r1   r  r  r  labelsrt  r:   c           	      V   |dk7  }|j                  |      }|j                  |      }|j                  |      }| j                  ||      | j                  ||      z   }|d d d d df   }	t        j                  |	 dz  t        j                  |	j                  d      d      j                  |	j                        gd      }
|
d d dd f   j                  |	      }
|j                  |	      }| j                  ||
      }||z   }| j                  rt        j                  |D cg c]  }|d d d | j                  f    c}d      }|dk(  }|d d d d df   }| j                  dkD  r#|d d | j                  dz
  d | j                  f   }| j                  |||      }||z  }|S c c}w )Nr3   r   r   r   r   )r  r  r   r   rV   r   r   r   r  r  r  r0   r  )r{   r1   r  r  r  r  rt  rD  l1_lossr  stop_labelsbce_lossr  xattnr  r  	attn_losss                     r,   r   zSpeechT5SpectrogramLoss.forwardk  s    ' %%l3!7!E!El!S 5 C CL Q ##$96BTEVEVWmouEvv Q1W%ii%#uzz%**Q-/K/N/Nu||/\ ]cde!!QR%(66u=%%e, %%fk: ! ))99Tdeqa#IT%I%I#I IJeklmD(A-K'1a0L$$q(+At/D/Dq/H/aDLaLa/a,ab++D+|LIID fs   $F&r   )r   r   r   r   r   ro   r   r^  r_  r   r   r   r   r   s   @r,   r  r  Z  s    
O~ 
O& 9=)(() !& 1 1)  %00	)
 !!) !!) #5#4#45) 
)r.   r  zv
    The bare SpeechT5 Encoder-Decoder Model outputting raw hidden-states without any specific pre- or post-nets.
    custom_introc            $       b    e Zd Z	 	 ddedeej                     deej                     f fdZd Zd Z	d Z
d Ze	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 dd	eej                     d
eej                     deej                     deej                     deej                      deej                      deej                     deeeej                            deeeej                            dee   deej                      dee   dee   dee   deej                     deeej                      ef   f d       Z xZS )SpeechT5Modelr|   encoderdecoderc                     t         |   |       || _        |t        |      n|| _        |t        |      n|| _        | j                          y)z
        encoder (`PreTrainedModel`, *optional*):
            The encoder model to use.
        decoder (`PreTrainedModel`, *optional*):
            The decoder model to use.
        N)rn   ro   r|   rg  r  r  r  r2  )r{   r|   r  r  r~   s       r,   ro   zSpeechT5Model.__init__  sM     	 ?F3F;T[?F3F;T[ 	r.   c                     t        | j                  t              r| j                  j                         S t        | j                  t
              r| j                  j                         S t        r   )r  r  rX  r]  r  r  NotImplementedErrorr<  s    r,   r]  z"SpeechT5Model.get_input_embeddings  sL    dll$AB<<4466dll$AB<<4466!!r.   c                     t        | j                  t              r| j                  j                  |       t        | j                  t
              r| j                  j                  |       y y r   )r  r  rX  ra  r  r  rb  s     r,   ra  z"SpeechT5Model.set_input_embeddings  sJ    dll$ABLL--e4dll$ABLL--e4 Cr.   c                     | j                   S r   )r  r<  s    r,   get_encoderzSpeechT5Model.get_encoder  s    ||r.   c                     t        | j                  t              r%| j                  j                  j	                          yyz
        Calling this function will disable the gradient computation for the feature encoder so that its parameter will
        not be updated during training.
        N)r  r  rN  rP  r=  r<  s    r,   r=  z$SpeechT5Model.freeze_feature_encoder  s/    
 dll$CDLL668 Er.   r/   r1   decoder_input_valuesdecoder_attention_maskr3  decoder_head_maskrq  encoder_outputsr  r  rt  r  r4  r5  r  r:   c                    ||n| j                   j                  }||n| j                   j                  }|
|
n| j                   j                  }
||n| j                   j                  }|| j                  ||||||      }nI|rGt        |t              s7t        |d   t        |      dkD  r|d   ndt        |      dkD  r|d   nd      }|Qt        | j
                  t              r7| j
                  j                  j                  |d   j                  d   |      }n|}t        | j                  t              rd|i}ni } | j                  d
|||d   ||||	|
||||d|}|s||z   S t        |j                   |j"                  |j$                  |j&                  |j(                  |j                   |j$                  |j&                  	      S )a  
        input_values (`torch.Tensor` of shape `(batch_size, sequence_length)`):
            Depending on which encoder is being used, the `input_values` are either: float values of the input raw
            speech waveform, or indices of input sequence tokens in the vocabulary, or hidden states.
        decoder_input_values (`torch.Tensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Depending on which decoder is being used, the `decoder_input_values` are either: float values of log-mel
            filterbank features extracted from the raw speech waveform, or indices of decoder input sequence tokens in
            the vocabulary, or hidden states.
        decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_values`. Causal mask will
            also be used by default.

            If you want to change padding behavior, you should read [`SpeechT5Decoder._prepare_decoder_attention_mask`]
            and modify to your needs. See diagram 1 in [the paper](https://huggingface.co/papers/1910.13461) for more
            information on the default strategy.
        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        speaker_embeddings (`torch.FloatTensor` of shape `(batch_size, config.speaker_embedding_dim)`, *optional*):
            Tensor containing the speaker embeddings.
        N)r/   r1   r3  r  r4  r5  r   r   r   r>  rt  )r/   r1   r  r	  r3  rq  r  r  r  r4  r5  r  )r?  r  r  decoder_attentionsrt  encoder_last_hidden_stater  encoder_attentionsr*  )r|   r  r4  r  rA  r  r  r   rT   rN  rP  r@  r'   r  r  r   r?  r  r   r@  rt  )r{   r/   r1   r  r  r3  r  rq  r  r  r  rt  r  r4  r5  r  r	  decoder_argsdecoder_outputss                      r,   r   zSpeechT5Model.forward  s   T 2C1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	%0%<k$++B]B] ""ll)-#"3%9' + O O_!M-"1!"4474H14Loa0RV14_1E1I?1-tO %*T\\Cb*c%)\\%8%8%[%["((+^&" &4"dll$CD02DELL&$,, 
-1"1!"4#9'!5+/!5#)
 
  "_44!-??+;;"1"?"?.99,==&5&G&G"1"?"?.99	
 		
r.   r]  NNNNNNNNNNNNNNN)r   r   r   r   r   r   r)  ro   r]  ra  r  r=  r   r   r   r^  r_  r  rQ   r   r   r   r   r   s   @r,   r  r    s    (,'+	 "))$ "))$	("59  04597;=A159=7;EIEI$(:>,0/3&*15!k
u||,k
 !!1!12k
 'u||4	k

 !))9)9 :k
 E--.k
 $E$5$56k
 'u||4k
 "%e.?.?(@"ABk
 "%e.?.?(@"ABk
 D>k
 %U%6%67k
 $D>k
 'tnk
 d^k
  !.!k
" 
uU&&');;	<#k
 k
r.   r  zB
    SpeechT5 Model with a speech encoder and a text decoder.
    c            $           e Zd ZdgZdef fdZd Zd Zd Zd Z	d Z
e	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 dd	eej                     d
eej                     deej                     deej                     deej                     deej                     deej                      deeeej                           deeeej                           dee   dee   dee   dee   deej                     deej                      deeef   f d       Z xZS )SpeechT5ForSpeechToTextz#text_decoder_postnet.lm_head.weightr|   c                     t         |   |       |j                  t        d| j                   d      t        |      }t        |      }t        |||      | _        t        |      | _
        | j                          y )NYou are trying to instantiate a    with a configuration that does not define the vocabulary size of the language model head. Please instantiate the model as follows: `SpeechT5ForSpeechToText.from_pretrained(..., vocab_size=vocab_size)`. or define `vocab_size` of your model's configuration.)rn   ro   r  r)   r~   rN  r  r  r  r  text_decoder_postnetr2  )r{   r|   speech_encodertext_decoderr~   s       r,   ro   z SpeechT5ForSpeechToText.__init__@  s     $00@ A/ /  9@4V<%fnlK$>v$F! 	r.   c                 6    | j                   j                         S r   r  r  r<  s    r,   r  z#SpeechT5ForSpeechToText.get_encoderT      }}((**r.   c                 6    | j                   j                         S r   r  get_decoderr<  s    r,   r  z#SpeechT5ForSpeechToText.get_decoderW  r  r.   c                 T    | j                         j                  j                          yr  r  rP  r=  r<  s    r,   r=  z.SpeechT5ForSpeechToText.freeze_feature_encoderZ      
 	!!88:r.   c                 6    | j                   j                         S r   )r  r  r<  s    r,   r  z-SpeechT5ForSpeechToText.get_output_embeddingsa  s    ((>>@@r.   c                 :    | j                   j                  |       y r   )r  r  r  s     r,   r  z-SpeechT5ForSpeechToText.set_output_embeddingsd  s    !!77Gr.   r/   r1   decoder_input_idsr  r3  r  rq  r  r  r  r  r4  r5  r  r  r:   c                    ||n| j                   j                  }|7|5t        || j                   j                  | j                   j                        }| j                  |||||||||	|
||d|      }| j                  |d         }d}|Ft               } ||j                  d| j                   j                        |j                  d            }|s|f|dd z   }||f|z   S |S t        |||j                  |j                  |j                  |j                  |j                  |j                   |j"                  	      S )a(  
        input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
            Float values of input raw speech waveform. Values can be obtained by loading a *.flac* or *.wav* audio file
            into an array of type `list[float]`, a `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library
            (`pip install torchcodec`) or the soundfile library (`pip install soundfile`).
            To prepare the array into `input_values`, the [`SpeechT5Processor`] should be used for padding
            and conversion into a tensor of type `torch.FloatTensor`. See [`SpeechT5Processor.__call__`] for details.
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`SpeechT5Tokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            SpeechT5 uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If
            `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
            `past_key_values`).
        decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_values`. Causal mask will
            also be used by default.

            If you want to change padding behavior, you should read [`SpeechT5Decoder._prepare_decoder_attention_mask`]
            and modify to your needs. See diagram 1 in [the paper](https://huggingface.co/papers/1910.13461) for more
            information on the default strategy.
        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the language modeling loss. Indices should either be in `[0, ..., config.vocab_size]`
            or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored (masked), the loss is
            only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

            Label indices can be obtained using [`SpeechT5Tokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

        Example:

        ```python
        >>> from transformers import SpeechT5Processor, SpeechT5ForSpeechToText
        >>> from datasets import load_dataset

        >>> dataset = load_dataset(
        ...     "hf-internal-testing/librispeech_asr_demo", "clean", split="validation"
        ... )  # doctest: +IGNORE_RESULT
        >>> dataset = dataset.sort("id")
        >>> sampling_rate = dataset.features["audio"].sampling_rate

        >>> processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_asr")
        >>> model = SpeechT5ForSpeechToText.from_pretrained("microsoft/speecht5_asr")

        >>> # audio file is decoded on the fly
        >>> inputs = processor(audio=dataset[0]["audio"]["array"], sampling_rate=sampling_rate, return_tensors="pt")
        >>> predicted_ids = model.generate(**inputs, max_length=100)

        >>> # transcribe speech
        >>> transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
        >>> transcription[0]
        'mister quilter is the apostle of the middle classes and we are glad to welcome his gospel'
        ```

        ```python
        >>> inputs["labels"] = processor(text_target=dataset[0]["text"], return_tensors="pt").input_ids

        >>> # compute loss
        >>> loss = model(**inputs).loss
        >>> round(loss.item(), 2)
        19.68
        ```
        NT)r/   r1   r  r  r3  r  rq  r  r  r  r  r4  r5  r  r   r%   r   )	r  r  r  r  r  rt  r  r  r  )r|   rA  r-   r"   r#   r  r  r   r   r  r   r  r  r  rt  r  r  r  )r{   r/   r1   r  r  r3  r  rq  r  r  r  r  r4  r5  r  r  r  r  r  loss_fctoutputs                        r,   r   zSpeechT5ForSpeechToText.forwardg  s]   v &1%<k$++B]B] ($6DKK44dkk6X6X%! --%)!2#9/!5++/!5)   
" **71:6')HFKKDKK,B,BCV[[QS_UDY,F)-)9TGf$EvE#33")"?"?&99$55&-&G&G")"?"?&99

 
	
r.   r  )r   r   r   _tied_weights_keysr   ro   r  r  r=  r  r  r   r   r   r_  r^  r   r  rQ   r   r   r   r   r   s   @r,   r  r  8  s    @@~ (++;AH  59598<=A159=7;EIEI$(,0/3&*-115!H
u001H
 !!1!12H
 $E$4$45	H

 !))9)9 :H
 E--.H
 $E$5$56H
 'u||4H
 "%e.?.?(@"ABH
 "%e.?.?(@"ABH
 D>H
 $D>H
 'tnH
 d^H
 ))*H
  !.!H
" 
uo%	&#H
 H
r.   r  modelrt  	thresholdminlenratiomaxlenratiovocoderoutput_cross_attentionsreturn_output_lengthsc
           
      j   |t        d      |+d|| j                  j                  k(  j                         z
  }
n|}
|j	                  d      }| j
                  j                  ||
d      }|j                  }t        | j
                  j                  t              r@| j
                  j                  j                  j                  |d   j                  d   |
      }
t        |j	                  d      |z  | j                  j                  z        }t        |j	                  d      |z  | j                  j                  z        }|j                  |d| j                  j                        }g }g }d }d}i }	 |dz  }| j
                  j                   j                  ||      }| j
                  j                   j#                  |d d dd f   d ||
|d|d      }|r0|j%                  t'        j(                  |j*                  d             |j                  j-                  d      }|j.                  }| j0                  j3                  |      }|j5                  || j                  j                  | j                  j                        }|j%                  |       |d d dd d f   j5                  |d| j                  j                        }t'        j(                  ||fd      }t'        j6                  | j0                  j9                  |            }||k  r||k  rAt'        j:                  |d      |k\  }t'        j<                  |      d   j?                         }ntA        tC        |            }|D cg c]	  }||vs| }}tC        |      dkD  rat'        jD                  |      }|jG                  dd      jI                  dd	      }| j0                  jK                  |      }|D ]
  } ||    || <    tC        |      |k\  rntA        tC        |            D cg c]  }||   	 }}|	s|dk(  r|d   n4t&        jL                  jN                  jP                  jS                  |d
      }|	 ||      }!n|}!|r`t'        j(                  |d	      }|dkD  r@ |j4                  |t        |j	                  d      |z        g|j	                         dd   }|!|f}!|!S g }"tA        |      D ]%  }|"j%                  ||   j	                  d             ' |:t&        jL                  jN                  jP                  jS                  |d
      }||"f}!nyg }#t&        jL                  jN                  jP                  jS                  |d
      } ||      }#|"D cg c]+  }t        |#j	                  d      tU        |"      z        |z  - }$}|#|$f}!|r^t'        j(                  |d	      } |j4                  |t        |j	                  d      |z        g|j	                         dd   }g |!|}!|!S c c}w c c}w c c}w )Na  `speaker_embeddings` must be specified. For example, you can use a speaker embeddings by following
                    the code snippet provided in this link:
                    https://huggingface.co/datasets/Matthijs/cmu-arctic-xvectors
                    r   r   T)r/   r1   r5  r%   )r   r1   r  r	  r  r  r  r5  r   r   )batch_first)+r)   r|   r"   r>   r   r  r  r?  r  rN  rP  r@  r'   r0   r&   rd  r  r  rX   r   r   rt  squeezer  speech_decoder_postnetr  r   sigmoidr  rM   rp  rN   rO   rT   stackr   flattenr  r   r   rnnpad_sequencer?   )%r  r/   rt  r1   r  r  r  r  r  r  r	  r   encoder_outr  maxlenminlenoutput_sequencespectrogramrt  r  rF  result_spectrogramr  decoder_outlast_decoder_outputspectrumnew_spectrogramprobmeet_thresholdsmeet_indexesr  spectrograms
meet_indexr  spectrogram_lengths	waveformswaveform_lengthss%                                        r,   _generate_speechr    s    !
 	
 !"lell6O6O&O%T%T%V!V!/


A
C..((!- ) K !, = = %..((*IJ!&!7!7!>!>!a!aN  #%;"
 *//2[@5<<C`C``aF*//2[@5<<C`C``aF 099#q%,,B[B[\OKO
C
q !& 6 6 = =oOa bnn,,<</237";#9+5 = 	
 ###EIIk.J.JPQ$RS);;CCAF%55 //889LM==ell&C&CU\\E^E^_8$ #1b!8,11#q%,,:S:ST))_o$FAN}}U99BBCVWX< V|"'))Db"9Y"F${{?;A>EEG$SY/'3S!q@R7RASLS< 1$${{;7+55a;CCAqI$;;CCLQ". NJ5A*5M&z2N%&#-i j 49=O9P3QRa&q)RLR ),l1ouxx~~7I7I7V7VWcqu7V7vk*G!G"$yy)9qAQw#8#3#8#8-2215;<$?O?T?T?VWYWZ?[$   01G* N% !s 	@A&&|A';';A'>?	@? 88>>--::<UY:ZL#%89GI 88>>--::<UY:ZL-I_rsZ[INN1$5<O8P$P QTU Uss "23G"$yy)9qA4/44S)..q1C78 ;K;P;P;RSUSV;W  32!12GNW T S4  ts   4	X&>X&X+0X0zB
    SpeechT5 Model with a text encoder and a speech decoder.
    c            (           e Zd ZdZdef fdZedefd       Zd Z	d Z
e	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d!deej                     deej                     d	eej                     d
eej                     deej                     deej                     deej                      deeeej                           deeeej                           dee   dee   dee   dee   deej                     deej                     deej                      deej                      deeef   f$d       Z ej*                         	 	 	 	 	 	 	 	 d"dej                  deej                     deej                     dedededeej0                     dededeej                  eej                  ej                  f   f   fd       Z ej*                         	 	 	 	 	 	 	 	 d"dej                  deej                     deej                     dedededeej0                     dededeej                  eej                  ej                  f   f   fd        Z xZS )#SpeechT5ForTextToSpeechr!   r|   c                     t         |   |       |j                  t        d| j                   d      t        |      }t        |      }t        |||      | _        t        |      | _
        | j                          y )Nr  a    with a configuration that does not define the vocabulary size of the language model head. Please instantiate the model as follows: `SpeechT5ForTextToSpeech.from_pretrained(..., vocab_size=vocab_size)`. or define `vocab_size` of your model's configuration.)rn   ro   r  r)   r~   rX  r  r  r  r  r  r2  )r{   r|   text_encoderspeech_decoderr~   s       r,   ro   z SpeechT5ForTextToSpeech.__init__	  s     $00@ A/ /  5V<8@%flNK&B6&J# 	r.   r:   c                      yr   r*  )clss    r,   can_generatez$SpeechT5ForTextToSpeech.can_generate	  s    
 r.   c                 6    | j                   j                         S r   r  r<  s    r,   r  z#SpeechT5ForTextToSpeech.get_encoder	  r  r.   c                 6    | j                   j                         S r   r  r<  s    r,   r  z#SpeechT5ForTextToSpeech.get_decoder	  r  r.   r1   r  r  r3  r  rq  r  r  r  r  r4  r5  rt  r  r  r  c                 d   ||n| j                   j                  }|>|$t        || j                   j                  |      \  }}| j                   j                  rd}| j                  |||||||||	|
|||d|      }| j                  |d         \  }}}d}|,t        | j                         } |||||||j                        }|s|f|dd z   }||f|z   S |S t        |||j                  |j                  |j                  |j                  |j                  |j                  |j                  	      S )a  
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`SpeechT5Tokenizer`]. See [`~PreTrainedTokenizer.encode`] and
            [`~PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        decoder_input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_mel_bins)`):
            Float values of input mel spectrogram.

            SpeechT5 uses an all-zero spectrum as the starting token for `decoder_input_values` generation. If
            `past_key_values` is used, optionally only the last `decoder_input_values` have to be input (see
            `past_key_values`).
        decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_values`. Causal mask will
            also be used by default.

            If you want to change padding behavior, you should read [`SpeechT5Decoder._prepare_decoder_attention_mask`]
            and modify to your needs. See diagram 1 in [the paper](https://huggingface.co/papers/1910.13461) for more
            information on the default strategy.
        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        speaker_embeddings (`torch.FloatTensor` of shape `(batch_size, config.speaker_embedding_dim)`, *optional*):
            Tensor containing the speaker embeddings.
        labels (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_mel_bins)`, *optional*):
            Float values of target mel spectrogram. Timesteps set to `-100.0` are ignored (masked) for the loss
            computation. Spectrograms can be obtained using [`SpeechT5Processor`]. See [`SpeechT5Processor.__call__`]
            for details.
        stop_labels (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Binary tensor indicating the position of the stop token in the sequence.

        Example:

        ```python
        >>> from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan, set_seed
        >>> import torch

        >>> processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
        >>> model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
        >>> vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

        >>> inputs = processor(text="Hello, my dog is cute", return_tensors="pt")
        >>> speaker_embeddings = torch.zeros((1, 512))  # or load xvectors from a file

        >>> set_seed(555)  # make deterministic

        >>> # generate speech
        >>> speech = model.generate(inputs["input_ids"], speaker_embeddings=speaker_embeddings, vocoder=vocoder)
        >>> speech.shape
        torch.Size([15872])
        ```
        NTr/   r1   r  r  r3  r  rq  r  r  r  rt  r  r4  r5  r  r   r   	r  r  r  r  r  rt  r  r  r  )r|   rA  r6   r0   r  r  r  r  rt  r   r  r  r  r  r  r  )r{   r!   r1   r  r  r3  r  rq  r  r  r  r  r4  r5  rt  r  r  r  r  r  r  r  r  	criterionr  s                            r,   r   zSpeechT5ForTextToSpeech.forward	  s   Z &1%<k$++B]B]#+?WDKK88:P@<$&< {{44$(!--")!5#9/!5++1/!5)   
$ AE@[@[\cde\f@g= 5v/<I&%((D +-;F)-)9TGf$EvE'-#33")"?"?&99$55&-&G&G")"?"?&99

 
	
r.   r  r  r  r  r  r  c
                     |W|j                  d      }|j                  d      |k7  r2|j                  d      dk(  r|j                  |d      }nt        d      t        | |||||||||	
      S )aE  
        Converts a sequence of input tokens into a sequence of mel spectrograms, which are subsequently turned into a
        speech waveform using a vocoder.

        Args:
            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                Indices of input sequence tokens in the vocabulary.

                Indices can be obtained using [`SpeechT5Tokenizer`]. See [`~PreTrainedTokenizer.encode`] and
                [`~PreTrainedTokenizer.__call__`] for details.

                [What are input IDs?](../glossary#input-ids)
            attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                Attention mask from the tokenizer, required for batched inference to signal to the model where to
                ignore padded tokens from the input_ids.
            speaker_embeddings (`torch.FloatTensor` of shape `(batch_size, config.speaker_embedding_dim)`, *optional*):
                Tensor containing the speaker embeddings.
            threshold (`float`, *optional*, defaults to 0.5):
                The generated sequence ends when the predicted stop token probability exceeds this value.
            minlenratio (`float`, *optional*, defaults to 0.0):
                Used to calculate the minimum required length for the output sequence.
            maxlenratio (`float`, *optional*, defaults to 20.0):
                Used to calculate the maximum allowed length for the output sequence.
            vocoder (`nn.Module`, *optional*):
                The vocoder that converts the mel spectrogram into a speech waveform. If `None`, the output is the mel
                spectrogram.
            output_cross_attentions (`bool`, *optional*, defaults to `False`):
                Whether or not to return the attentions tensors of the decoder's cross-attention layers.
            return_output_lengths (`bool`, *optional*, defaults to `False`):
                Whether or not to return the concrete spectrogram/waveform lengths.

        Returns:
            `tuple(torch.FloatTensor)` comprising various elements depending on the inputs:
            - when `return_output_lengths` is False
                - **spectrogram** (*optional*, returned when no `vocoder` is provided) `torch.FloatTensor` of shape
                `(output_sequence_length, config.num_mel_bins)` -- The predicted log-mel spectrogram.
                - **waveform** (*optional*, returned when a `vocoder` is provided) `torch.FloatTensor` of shape
                `(num_frames,)` -- The predicted speech waveform.
                - **cross_attentions** (*optional*, returned when `output_cross_attentions` is `True`)
                `torch.FloatTensor` of shape `(config.decoder_layers, config.decoder_attention_heads,
                output_sequence_length, input_sequence_length)` -- The outputs of the decoder's cross-attention layers.
            - when `return_output_lengths` is True
                - **spectrograms** (*optional*, returned when no `vocoder` is provided) `torch.FloatTensor` of shape
                `(batch_size, output_sequence_length, config.num_mel_bins)` -- The predicted log-mel spectrograms that
                are padded to the maximum length.
                - **spectrogram_lengths** (*optional*, returned when no `vocoder` is provided) `list[Int]` -- A list of
                all the concrete lengths for each spectrogram.
                - **waveforms** (*optional*, returned when a `vocoder` is provided) `torch.FloatTensor` of shape
                `(batch_size, num_frames)` -- The predicted speech waveforms that are padded to the maximum length.
                - **waveform_lengths** (*optional*, returned when a `vocoder` is provided) `list[Int]` -- A list of all
                the concrete lengths for each waveform.
                - **cross_attentions** (*optional*, returned when `output_cross_attentions` is `True`)
                `torch.FloatTensor` of shape `(batch_size, config.decoder_layers, config.decoder_attention_heads,
                output_sequence_length, input_sequence_length)` -- The outputs of the decoder's cross-attention layers.
        r   r   zUThe first dimension of speaker_embeddings must be either 1 or the same as batch_size.r   ro  r)   r  )r{   r!   r1   rt  r  r  r  r  r  r  kwargsr]   s               r,   generatez SpeechT5ForTextToSpeech.generate2
  s    J )"*J!&&q)Z7%**1-2);)B)B:q)Q&$o   #!
 	
r.   c
                     |W|j                  d      }
|j                  d      |
k7  r2|j                  d      dk(  r|j                  |
d      }nt        d      t        | |||||||||	
      S )a  
        Converts a sequence of input tokens into a sequence of mel spectrograms, which are subsequently turned into a
        speech waveform using a vocoder.

        Args:
            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                Indices of input sequence tokens in the vocabulary.

                Indices can be obtained using [`SpeechT5Tokenizer`]. See [`~PreTrainedTokenizer.encode`] and
                [`~PreTrainedTokenizer.__call__`] for details.

                [What are input IDs?](../glossary#input-ids)
            speaker_embeddings (`torch.FloatTensor` of shape `(batch_size, config.speaker_embedding_dim)`, *optional*):
                Tensor containing the speaker embeddings.
            attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing convolution and attention on padding token indices. Mask values selected in
                `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            threshold (`float`, *optional*, defaults to 0.5):
                The generated sequence ends when the predicted stop token probability exceeds this value.
            minlenratio (`float`, *optional*, defaults to 0.0):
                Used to calculate the minimum required length for the output sequence.
            maxlenratio (`float`, *optional*, defaults to 20.0):
                Used to calculate the maximum allowed length for the output sequence.
            vocoder (`nn.Module`, *optional*, defaults to `None`):
                The vocoder that converts the mel spectrogram into a speech waveform. If `None`, the output is the mel
                spectrogram.
            output_cross_attentions (`bool`, *optional*, defaults to `False`):
                Whether or not to return the attentions tensors of the decoder's cross-attention layers.
            return_output_lengths (`bool`, *optional*, defaults to `False`):
                Whether or not to return the concrete spectrogram/waveform lengths.

        Returns:
            `tuple(torch.FloatTensor)` comprising various elements depending on the inputs:
            - when `return_output_lengths` is False
                - **spectrogram** (*optional*, returned when no `vocoder` is provided) `torch.FloatTensor` of shape
                `(output_sequence_length, config.num_mel_bins)` -- The predicted log-mel spectrogram.
                - **waveform** (*optional*, returned when a `vocoder` is provided) `torch.FloatTensor` of shape
                `(num_frames,)` -- The predicted speech waveform.
                - **cross_attentions** (*optional*, returned when `output_cross_attentions` is `True`)
                `torch.FloatTensor` of shape `(config.decoder_layers, config.decoder_attention_heads,
                output_sequence_length, input_sequence_length)` -- The outputs of the decoder's cross-attention layers.
            - when `return_output_lengths` is True
                - **spectrograms** (*optional*, returned when no `vocoder` is provided) `torch.FloatTensor` of shape
                `(batch_size, output_sequence_length, config.num_mel_bins)` -- The predicted log-mel spectrograms that
                are padded to the maximum length.
                - **spectrogram_lengths** (*optional*, returned when no `vocoder` is provided) `list[Int]` -- A list of
                all the concrete lengths for each spectrogram.
                - **waveforms** (*optional*, returned when a `vocoder` is provided) `torch.FloatTensor` of shape
                `(batch_size, num_frames)` -- The predicted speech waveforms that are padded to the maximum length.
                - **waveform_lengths** (*optional*, returned when a `vocoder` is provided) `list[Int]` -- A list of all
                the concrete lengths for each waveform.
                - **cross_attentions** (*optional*, returned when `output_cross_attentions` is `True`)
                `torch.FloatTensor` of shape `(batch_size, config.decoder_layers, config.decoder_attention_heads,
                output_sequence_length, input_sequence_length)` -- The outputs of the decoder's cross-attention layers.
        r   r   zUThe first dimension of speaker_embeddings must be either 1 or the same as batch size.r!  )r{   r!   rt  r1   r  r  r  r  r  r  r]   s              r,   generate_speechz'SpeechT5ForTextToSpeech.generate_speech
  s    R )"*J!&&q)Z7%**1-2);)B)B:q)Q&$o   #!
 	
r.   NNNNNNNNNNNNNNNNNNNg      ?r4   g      4@NFF)r   r   r   r'  r   ro   classmethodrQ   r  r  r  r   r   r   r^  r_  r   r  r   r   r   r   r   r   r)  r#  r%  r   r   s   @r,   r  r  	  s    "O~ ( T  ++  1559<@=A159=7;EIEI$(,0/3&*:>.2.215%D
E,,-D
 !!1!12D
 'u'8'89	D

 !))9)9 :D
 E--.D
 $E$5$56D
 'u||4D
 "%e.?.?(@"ABD
 "%e.?.?(@"ABD
 D>D
 $D>D
 'tnD
 d^D
 %U%6%67D
  **+!D
" ell+#D
$ !.%D
& 
u..	/'D
 D
L U]]_ 6::> !'+(-&+Y
##Y
 !!1!12Y
 %U%6%67	Y

 Y
 Y
 Y
 "))$Y
 "&Y
  $Y
 
u  %(9(95;L;L(L"MM	NY
 Y
v U]]_ ;?59 !'+(-&+]
##]
 %U%6%67]
 !!1!12	]

 ]
 ]
 ]
 "))$]
 "&]
  $]
 
u  %(9(95;L;L(L"MM	N]
 ]
r.   r  zD
    SpeechT5 Model with a speech encoder and a speech decoder.
    c            (            e Zd Zdef fdZd Zd Zd Ze	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d de	e
j                     de	e
j                     de	e
j                     d	e	e
j                     d
e	e
j                     de	e
j                     de	e
j                     de	eee
j                           de	eee
j                           de	e   de	e   de	e   de	e   de	e
j                     de	e
j                     de	e
j                     de	e
j                     deeef   f$d       Z e
j&                         	 	 	 	 	 	 	 	 d!de
j                  de	e
j                     de	e
j                     dededede	ej,                     dedede
j                  fd       Z xZS )"SpeechT5ForSpeechToSpeechr|   c                     t         |   |       t        |      }t        |      }t	        |||      | _        t        |      | _        | j                          y r   )	rn   ro   rN  r  r  r  r  r  r2  )r{   r|   r  r  r~   s       r,   ro   z"SpeechT5ForSpeechToSpeech.__init__
  sM     8@8@%fnnM&B6&J# 	r.   c                 6    | j                   j                         S r   r  r<  s    r,   r  z%SpeechT5ForSpeechToSpeech.get_encoder  r  r.   c                 6    | j                   j                         S r   r  r<  s    r,   r  z%SpeechT5ForSpeechToSpeech.get_decoder  r  r.   c                 T    | j                         j                  j                          yr  r  r<  s    r,   r=  z0SpeechT5ForSpeechToSpeech.freeze_feature_encoder  r  r.   r/   r1   r  r  r3  r  rq  r  r  r  r  r4  r5  rt  r  r  r  r:   c                    ||n| j                   j                  }|&|$t        || j                   j                  |      \  }}| j	                  |||||||||	|
|||d|      }| j                  |d         \  }}}d}|s|f|dd z   }||f|z   S |S t        |||j                  |j                  |j                  |j                  |j                  |j                  |j                  	      S )a  
        input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
            Float values of input raw speech waveform. Values can be obtained by loading a *.flac* or *.wav* audio file
            into an array of type `list[float]`, a `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library
            (`pip install torchcodec`) or the soundfile library (`pip install soundfile`).
            To prepare the array into `input_values`, the [`SpeechT5Processor`] should be used for padding and conversion into
            a tensor of type `torch.FloatTensor`. See [`SpeechT5Processor.__call__`] for details.
        decoder_input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_mel_bins)`):
            Float values of input mel spectrogram.

            SpeechT5 uses an all-zero spectrum as the starting token for `decoder_input_values` generation. If
            `past_key_values` is used, optionally only the last `decoder_input_values` have to be input (see
            `past_key_values`).
        decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_values`. Causal mask will
            also be used by default.

            If you want to change padding behavior, you should read [`SpeechT5Decoder._prepare_decoder_attention_mask`]
            and modify to your needs. See diagram 1 in [the paper](https://huggingface.co/papers/1910.13461) for more
            information on the default strategy.
        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        speaker_embeddings (`torch.FloatTensor` of shape `(batch_size, config.speaker_embedding_dim)`, *optional*):
            Tensor containing the speaker embeddings.
        labels (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_mel_bins)`, *optional*):
            Float values of target mel spectrogram. Spectrograms can be obtained using [`SpeechT5Processor`]. See
            [`SpeechT5Processor.__call__`] for details.
        stop_labels (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Binary tensor indicating the position of the stop token in the sequence.

        Example:

        ```python
        >>> from transformers import SpeechT5Processor, SpeechT5ForSpeechToSpeech, SpeechT5HifiGan, set_seed
        >>> from datasets import load_dataset
        >>> import torch

        >>> dataset = load_dataset(
        ...     "hf-internal-testing/librispeech_asr_demo", "clean", split="validation"
        ... )  # doctest: +IGNORE_RESULT
        >>> dataset = dataset.sort("id")
        >>> sampling_rate = dataset.features["audio"].sampling_rate

        >>> processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_vc")
        >>> model = SpeechT5ForSpeechToSpeech.from_pretrained("microsoft/speecht5_vc")
        >>> vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

        >>> # audio file is decoded on the fly
        >>> inputs = processor(audio=dataset[0]["audio"]["array"], sampling_rate=sampling_rate, return_tensors="pt")

        >>> speaker_embeddings = torch.zeros((1, 512))  # or load xvectors from a file

        >>> set_seed(555)  # make deterministic

        >>> # generate speech
        >>> speech = model.generate_speech(inputs["input_values"], speaker_embeddings, vocoder=vocoder)
        >>> speech.shape
        torch.Size([77824])
        ```
        NTr  r   r   r  )r|   rA  r6   r0   r  r  r   r  r  r  rt  r  r  r  )r{   r/   r1   r  r  r3  r  rq  r  r  r  r  r4  r5  rt  r  r  r  r  r^   r  r  r  r  s                           r,   r   z!SpeechT5ForSpeechToSpeech.forward  s-   h &1%<k$++B]B]#+?WDKK88:P@<$&< --%)!5#9/!5++1/!5)   
$ "&!<!<WQZ!H;!^gabk1F)-)9TGf$EvE'##33")"?"?&99$55&-&G&G")"?"?&99

 
	
r.   r  r  r  r  r  r  c
                 p    |!t        j                  d|j                        }t        | |||||||||	
      S )a'  
        Converts a raw speech waveform into a sequence of mel spectrograms, which are subsequently turned back into a
        speech waveform using a vocoder.

        Args:
            input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
                Float values of input raw speech waveform.

                Values can be obtained by loading a *.flac* or *.wav* audio file into an array of type `list[float]`,
                a `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library (`pip install torchcodec`)
                or the soundfile library (`pip install soundfile`).
                To prepare the array into `input_values`, the [`SpeechT5Processor`] should be used for padding and
                conversion into a tensor of type `torch.FloatTensor`. See [`SpeechT5Processor.__call__`] for details.
            speaker_embeddings (`torch.FloatTensor` of shape `(batch_size, config.speaker_embedding_dim)`, *optional*):
                Tensor containing the speaker embeddings.
            attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing convolution and attention on padding token indices. Mask values selected in
                `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            threshold (`float`, *optional*, defaults to 0.5):
                The generated sequence ends when the predicted stop token probability exceeds this value.
            minlenratio (`float`, *optional*, defaults to 0.0):
                Used to calculate the minimum required length for the output sequence.
            maxlenratio (`float`, *optional*, defaults to 20.0):
                Used to calculate the maximum allowed length for the output sequence.
            vocoder (`nn.Module`, *optional*, defaults to `None`):
                The vocoder that converts the mel spectrogram into a speech waveform. If `None`, the output is the mel
                spectrogram.
            output_cross_attentions (`bool`, *optional*, defaults to `False`):
                Whether or not to return the attentions tensors of the decoder's cross-attention layers.
            return_output_lengths (`bool`, *optional*, defaults to `False`):
                Whether or not to return the concrete spectrogram/waveform lengths.

        Returns:
            `tuple(torch.FloatTensor)` comprising various elements depending on the inputs:
            - when `return_output_lengths` is False
                - **spectrogram** (*optional*, returned when no `vocoder` is provided) `torch.FloatTensor` of shape
                `(output_sequence_length, config.num_mel_bins)` -- The predicted log-mel spectrogram.
                - **waveform** (*optional*, returned when a `vocoder` is provided) `torch.FloatTensor` of shape
                `(num_frames,)` -- The predicted speech waveform.
                - **cross_attentions** (*optional*, returned when `output_cross_attentions` is `True`)
                `torch.FloatTensor` of shape `(config.decoder_layers, config.decoder_attention_heads,
                output_sequence_length, input_sequence_length)` -- The outputs of the decoder's cross-attention layers.
            - when `return_output_lengths` is True
                - **spectrograms** (*optional*, returned when no `vocoder` is provided) `torch.FloatTensor` of shape
                `(batch_size, output_sequence_length, config.num_mel_bins)` -- The predicted log-mel spectrograms that
                are padded to the maximum length.
                - **spectrogram_lengths** (*optional*, returned when no `vocoder` is provided) `list[Int]` -- A list of
                all the concrete lengths for each spectrogram.
                - **waveforms** (*optional*, returned when a `vocoder` is provided) `torch.FloatTensor` of shape
                `(batch_size, num_frames)` -- The predicted speech waveforms that are padded to the maximum length.
                - **waveform_lengths** (*optional*, returned when a `vocoder` is provided) `list[Int]` -- A list of all
                the concrete lengths for each waveform.
                - **cross_attentions** (*optional*, returned when `output_cross_attentions` is `True`)
                `torch.FloatTensor` of shape `(batch_size, config.decoder_layers, config.decoder_attention_heads,
                output_sequence_length, input_sequence_length)` -- The outputs of the decoder's cross-attention layers.
        )r   i   rH  )r   rP   r   r  )
r{   r/   rt  r1   r  r  r  r  r  r  s
             r,   r%  z)SpeechT5ForSpeechToSpeech.generate_speech  sM    T %!&Xl>Q>Q!R#!
 	
r.   r&  r'  )r   r   r   r   ro   r  r  r=  r   r   r   r_  r^  r   r  rQ   r   r   r   r   r   r   r)  r%  r   r   s   @r,   r*  r*  
  s   
~ 
++;  5959<@=A159=7;EIEI$(,0/3&*:>.2.215%
u001
 !!1!12
 'u'8'89	

 !))9)9 :
 E--.
 $E$5$56
 'u||4
 "%e.?.?(@"AB
 "%e.?.?(@"AB
 D>
 $D>
 'tn
 d^
 %U%6%67
  **+!
" ell+#
$ !.%
& 
u..	/'
 
B U]]_ ;?59 !'+(-&+W
''W
 %U%6%67W
 !!1!12	W

 W
 W
 W
 "))$W
 "&W
  $W
 
		W
 W
r.   r*  c                   :     e Zd Zd fd	ZddZd Zd Zd Z xZS )HifiGanResidualBlockc                    t         |           || _        t        j                  t        t        |            D cg c]3  }t        j                  |||d||   | j                  |||               5 c}      | _	        t        j                  t        t        |            D cg c]-  }t        j                  |||dd| j                  |d            / c}      | _
        y c c}w c c}w )Nr   )rl   dilationr   )rn   ro   leaky_relu_sloper   r  rO   rT   rs   get_paddingconvs1convs2)r{   channelsrk   r4  r5  r  r^   r~   s          r,   ro   zHifiGanResidualBlock.__init__  s     0mm s8}-
  		%a[ ,,[(1+F

 mm s8}-
  		 ,,[!<



s   8C$%2C)c                     ||z  |z
  dz  S r   r*  )r{   rk   r4  s      r,   r6  z HifiGanResidualBlock.get_padding  s    h&1a77r.   c                 ,   t         j                  j                  }t        t         j                  j                  d      r$t         j                  j                  j                  }| j
                  D ]
  } ||        | j                  D ]
  } ||        y Nr   )r   r   r   r   r   r7  r8  r{   r   r  s      r,   apply_weight_normz&HifiGanResidualBlock.apply_weight_norm  sp    hh**288,,m<((33??K[[ 	E	[[ 	E	r.   c                     | j                   D ]!  }t        j                  j                  |       # | j                  D ]!  }t        j                  j                  |       # y r   )r7  r   r   remove_weight_normr8  r{   r  s     r,   r@  z'HifiGanResidualBlock.remove_weight_norm  sL    [[ 	/EHH''.	/[[ 	/EHH''.	/r.   c                 ,   t        | j                  | j                        D ]p  \  }}|}t        j                  j                  || j                        } ||      }t        j                  j                  || j                        } ||      }||z   }r |S r   )rS  r7  r8  r   rv  
leaky_relur5  )r{   r   conv1conv2r  s        r,   r   zHifiGanResidualBlock.forward  s    T[[9 	5LE5$HMM44]DDYDYZM!-0MMM44]DDYDYZM!-0M)H4M	5 r.   )r	   )r   r	      g?)r   )	r   r   r   ro   r6  r>  r@  r   r   r   s   @r,   r2  r2    s    
>8/r.   r2  z
    HiFi-GAN vocoder.
    c                        e Zd ZU eed<   dZdef fdZdej                  fdZ	d Z
d Z ed	      dej                  d
ej                  fd       Z xZS )SpeechT5HifiGanr|   r  c                    t         |   |       t        |j                        | _        t        |j
                        | _        t        j                  |j                  |j                  ddd      | _        t        j                         | _        t        t        |j
                  |j                               D ]d  \  }\  }}| j                  j#                  t        j$                  |j                  d|z  z  |j                  d|dz   z  z  ||||z
  dz               f t        j                         | _        t)        t        | j                              D ]p  }|j                  d|dz   z  z  }t        |j                  |j*                        D ]6  \  }}| j&                  j#                  t-        ||||j.                               8 r t        j                  dddd      | _        | j3                  dt5        j6                  |j                               | j3                  dt5        j8                  |j                               | j;                          y )N   r   r	   )rk   rl   r   r   r  r  )rn   ro   rT   resblock_kernel_sizesnum_kernelsupsample_ratesnum_upsamplesr   rs   model_in_dimupsample_initial_channelconv_prer  	upsamplerrB  rS  upsample_kernel_sizesrX   ConvTranspose1d	resblocksrO   resblock_dilation_sizesr2  r5  	conv_postr   r   rP   rV   r2  )r{   r|   r  upsample_raterk   r9  r4  r~   s          r,   ro   zSpeechT5HifiGan.__init__2  s    v;;< !6!67		++
 /8V=R=RTZTpTp9q/r 		+A+{NN!!""331=33a!eE +((=8Q>		 s4>>*+ 	vA661Q<HH),V-I-I6KiKi)j v%X%%&:8[RZ\b\s\s&tuv	v
 8QAaQRSVU[[1D1D%EFWejj1D1D&EF 	r.   r  c                 2   t        |t        j                  t        j                  f      rm|j                  j
                  j                  d| j                  j                         |j                  %|j                  j
                  j                          yyy)zInitialize the weights.r4   r  N)r  r   rs   rT  r   r  r  r|   r  rm   r!  )r{   r  s     r,   r$  zSpeechT5HifiGan._init_weightsX  sl    fryy"*<*<=>MM&&CT[[5R5R&S{{&  &&( ' ?r.   c                    t         j                  j                  }t        t         j                  j                  d      r$t         j                  j                  j                  } || j
                         | j                  D ]
  } ||        | j                  D ]  }|j                            || j                         y r<  )
r   r   r   r   r   rQ  rR  rU  r>  rW  r=  s      r,   r>  z!SpeechT5HifiGan.apply_weight_norm_  s    hh**288,,m<((33??KDMM"^^ 	E	^^ 	&E##%	&DNN#r.   c                 J   t         j                  j                  | j                         | j                  D ]!  }t         j                  j                  |       # | j
                  D ]  }|j                           t         j                  j                  | j                         y r   )r   r   r@  rQ  rR  rU  rW  rA  s     r,   r@  z"SpeechT5HifiGan.remove_weight_normk  sr    
##DMM2^^ 	/EHH''.	/^^ 	'E$$&	'
##DNN3r.   a  
        Converts a log-mel spectrogram into a speech waveform. Passing a batch of log-mel spectrograms returns a batch
        of speech waveforms. Passing a single, un-batched log-mel spectrogram returns a single, un-batched speech
        waveform.
        r  r:   c                    | j                   j                  r|| j                  z
  | j                  z  }|j	                         dk(  }|s|j                  d      }|j                  dd      }| j                  |      }t        | j                        D ]  }t        j                  j                  || j                   j                        } | j                  |   |      } | j                  || j                   z     |      }t        d| j                         D ]*  }| | j                  || j                   z  |z      |      z  }, || j                   z  } t        j                  j                  |      }| j#                  |      }t%        j&                  |      }|s2|j)                  d      j                  dd      j+                  d      }|S |j)                  d      }|S )a  
        spectrogram (`torch.FloatTensor`):
            Tensor containing the log-mel spectrograms. Can be batched and of shape `(batch_size, sequence_length,
            config.model_in_dim)`, or un-batched and of shape `(sequence_length, config.model_in_dim)`.

        Returns:
            `torch.FloatTensor`: Tensor containing the speech waveform. If the input spectrogram is batched, will be of
            shape `(batch_size, num_frames,)`. If un-batched, will be of shape `(num_frames,)`.
        r	   r   r   r   r%   )r|   normalize_beforer  r  r   r   r   rQ  rO   rN  r   rv  rC  r5  rR  rU  rL  rW  r   tanhr  r   )r{   r  
is_batchedr   r  	res_statejwaveforms           r,   r   zSpeechT5HifiGan.forwards  s   " ;;''&2djj@K __&!+
%//2K#--a3m4t))* 	9AMM44]DKKD`D`aM-DNN1-m<M<q4+;+;';<]KI1d../ UET^^A0@0@,@1,DEmTT	U%(8(88M	9 00?}5

=1$,,Q/99!Q?DDRHH
  %,,Q/Hr.   )r   r   r   r    r%  r'  ro   r   r)  r$  r>  r@  r   r   r_  r   r   r   s   @r,   rH  rH  )  sp     "!#O$4 $L)BII )
$4 (5#4#4 (9J9J ((r.   rH  )r  r*  r  r  r  rH  )r   Nr  r'  )hr   r   typingr   r   numpyrH   r   torch.utils.checkpointr   torch.nnr   r   r   activationsr
   cache_utilsr   r   r   
generationr   integrations.deepspeedr   integrations.fsdpr   modeling_attn_mask_utilsr   r   modeling_layersr   modeling_outputsr   r   r   r   r   modeling_utilsr   r   r   r   r   utils.deprecationr   configuration_speecht5r   r    
get_loggerr   ru  _HIDDEN_STATES_START_POSITIONr   r>   r-   r6   r  r   r^  ndarrayrf   rh   r   r   r)  r   r   r   r  r   r  r$  r0  ra  r{  r  r  r  r  r  r  r  r   r  r,  rN  rX  rg  rm  r  r  r  r  r  r  r  r_  rQ   r  r  r*  r2  rH  __all__r*  r.   r,   <module>rv     s     "     @ @ ! C C ) @ 7 e 9  D , 0 I 
		H	% !" %,, c [^ " ei0,,0250KSTYT`T`Ka04 26tc?tt t U--.	t
 t ZZtp#= ,!; 8!; 2A8BII A8J*bii *Zryy 0" "(299 %RYY %R1		 1D")) DN1")) 1h% %P<299 <2		+? ")-		+? )-X&,@ &$b2		 b2J")) 0:5 :zc5 cL (7o (7 (7V|
- |
~"&= "J'$; 'T
#: 
@@
- @
F/&= /d3$; 3l*#: *Z8M299 8Mv:bii :z 
Y
+ Y

Y
x 
s
5 s

s
r 7;15#'$)"'L"L##L !!2!23L U--.	L
 L L L bii L "L  L 5eE$5$5u7H7H$HIIJL^ 
e
5 e

e
P 
t
 7 t

t
n;299 ;| 
to t
tnr.   