
    h                       d Z ddlZddlZddlmZmZ ddlZddlZddlm	Z	 ddl
mZ ddlmZ ddlmZ dd	lmZmZmZ dd
lmZ ddlmZmZ ddlmZ ddlmZmZ ddlm Z m!Z! ddl"m#Z#m$Z$m%Z%m&Z& ddl'm(Z( ddl)m*Z*m+Z+m,Z,m-Z-m.Z. ddl/m0Z0m1Z1m2Z2  e       rddlm3Z3  e&jh                  e5      Z6 G d de	jn                        Z8 G d de8      Z9e8e9dZ: G d de	jn                        Z; G d de      Z<e# G d d e              Z= G d! d"e=e      Z> e#d#$       G d% d&e>             Z? e#d'$       G d( d)e>             Z@ e#d*$       G d+ d,e=             ZA e#d-$       G d. d/e=             ZBg d0ZCy)1zPyTorch BARK model.    N)OptionalUnion)nn)
functional   )DynamicCache)GenerationMixin)#AlternatingCodebooksLogitsProcessor!BarkEosPrioritizerLogitsProcessorSuppressTokensLogitsProcessor)_prepare_4d_attention_mask)!flash_attn_supports_top_left_maskis_flash_attn_available)GradientCheckpointingLayer)CausalLMOutputWithPastMaskedLMOutput)PreTrainedModelget_parameter_device)auto_docstringis_accelerate_availableis_torch_accelerator_availablelogging   )	AutoModel   )BarkCoarseConfig
BarkConfigBarkFineConfigBarkSemanticConfigBarkSubModelConfig)BarkCoarseGenerationConfigBarkFineGenerationConfigBarkSemanticGenerationConfig)_flash_attention_forwardc                   H     e Zd Zd fd	Zd Zd ZddZ	 	 	 	 	 	 ddZ xZS )	BarkSelfAttentionc                    t         |           |j                  | _        t        j                  |j                        | _        t        j                  |j                        | _        |j                  | _        |j                  | _	        | j                  | j                  z  | _
        |j                  |j                  z  dk7  r&t        d| j                   d| j                   d      t        j                  |j                  d|j                  z  |j                        | _        t        j                  |j                  |j                  |j                        | _        || _        || _        |ra|j$                  }t'        j(                  t'        j*                  ||ft,                    j/                  dd||      }| j1                  d	|       y y )
Nr   z;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).r   biasdtyper   r)   )super__init__dropoutr   Dropoutattn_dropoutresid_dropouthidden_size	embed_dim	num_headshead_dim
ValueErrorLinearr)   att_projout_proj	is_causal	layer_idx
block_sizetorchtrilonesboolviewregister_buffer)selfconfigr:   r;   r<   r)   	__class__s         d/var/www/html/eduruby.in/venv/lib/python3.12/site-packages/transformers/models/bark/modeling_bark.pyr-   zBarkSelfAttention.__init__F   sr    ~~JJv~~6ZZ7++))$..8 0 00A5MdnnM] ^NN#2'  		&"4"4a&:L:L6LSYS^S^_		&"4"4f6H6Hv{{[""**J::ejj*j)ANOTTUVXY[egqrD  .     c                 |    |j                         dd ||fz   }|j                  |      }|j                  dddd      S )J
        Splits hidden_size dim into attn_head_size and num_heads
        Nr   r   r   r   )sizerA   permuterC   tensorr4   attn_head_size	new_shapes        rF   _split_headszBarkSelfAttention._split_headse   sC     KKM#2&)^)DD	Y'~~aAq))rG   c                     |j                  dd      j                         }|j                  |j                         dd ||z  fz         }|S )S
        Merges attn_head_size dim and num_attn_heads dim into hidden_size
        r   r   N)	transpose
contiguousrA   rK   rC   rN   r4   rO   s       rF   _merge_headszBarkSelfAttention._merge_headsm   sL     !!!Q'224V[[]3B/9~3M2OOPrG   c                    t        j                  ||j                  dd            dt        j                  | j
                        z  z  }| j                  rz|j                  d      |j                  d      }}|j                  | j                  d d d d ||z
  |d |f   dk(  t        j                  |j                        j                        }|||z   }t        j                  j                  |d      }|j!                  |j                        }| j#                  |      }|||z  }t        j                  ||      }	|	|fS )NrJ   rT         ?r   dim)r=   matmulrU   mathsqrtr5   r:   rK   masked_fillr)   finfor+   minr   r   softmaxtor0   )
rC   querykeyvalueattention_mask	head_maskattn_weightsquery_length
key_lengthattn_outputs
             rF   _attnzBarkSelfAttention._attny   s&   ||E3==R+@AS499UYUbUbKcEcd>>',zz"~sxx|*L (33		!Q
\ 9J FSTXYYL../33L
 %'.8L}},,\r,B#u{{3((6  ')3L ll<7L((rG   c                 `   | j                  |      j                  | j                  d      \  }}	}
| j                  || j                  | j
                        }| j                  |	| j                  | j
                        }	| j                  |
| j                  | j
                        }
|#|j                  |	|
| j                  d|i      \  }	}
| j                  ||	|
||      \  }}| j                  || j                  | j
                        }| j                  |      }| j                  |      }||fS )Nr   r[   cache_position)r8   splitr3   rQ   r4   r5   updater;   rn   rX   r9   r1   )rC   hidden_statesrh   past_key_valuesri   	use_cacheoutput_attentionsrp   re   rf   rg   rm   rj   s                rF   forwardzBarkSelfAttention.forward   s    !MM-8>>t~~ST>UsE!!%GT^^T]]C!!%G&(//UDNNM]_mLnoJC$(JJuc5.R[$\!\''T^^T]]SmmK0((5L((rG   FN)NNNNNFFN)	__name__
__module____qualname__r-   rQ   rX   rn   rw   __classcell__rE   s   @rF   r&   r&   B   s1    />*
)D )rG   r&   c                   B     e Zd ZdZ fdZd Zd Z	 	 	 	 	 	 ddZ xZS )BarkSelfFlashAttention2aH  
    Bark flash attention module. This module inherits from `BarkSelfAttention` as the weights of the module stays
    untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
    flash attention and deal with padding tokens in case the input contains any of them.
    c                 B    t        |   |i | t               | _        y N)r,   r-   r   _flash_attn_uses_top_left_mask)rC   argskwargsrE   s      rF   r-   z BarkSelfFlashAttention2.__init__   s#    $)&)
 /P.Q+rG   c                 X    |j                         dd ||fz   }|j                  |      }|S )rI   NrJ   )rK   rA   rM   s        rF   rQ   z$BarkSelfFlashAttention2._split_heads   s5     KKM#2&)^)DD	Y' rG   c                 X    |j                  |j                         dd ||z  fz         }|S )rS   NrT   )rA   rK   rW   s       rF   rX   z$BarkSelfFlashAttention2._merge_heads   s1     V[[]3B/9~3M2OOPrG   c           
         |j                         \  }}	}
| j                  |      j                  | j                  d      \  }}}| j	                  || j
                  | j                        }| j	                  || j
                  | j                        }| j	                  || j
                  | j                        }|#|j                  ||| j                  d|i      \  }}t        |||||	| j                  r| j                  nd| j                  | j                        }| j                  || j
                  | j                        }| j                  |      }| j!                  |      }|d fS )Nr   r[   rp           )r.   use_top_left_maskr:   )rK   r8   rq   r3   rQ   r4   r5   rr   r;   r$   trainingr.   r   r:   rX   r9   r1   )rC   rs   rh   rt   ri   ru   rv   rp   
batch_size	query_len_re   rf   rg   rm   s                  rF   rw   zBarkSelfFlashAttention2.forward   sC    $1#5#5#7 
Iq !MM-8>>t~~ST>UsE!!%GT^^T]]C!!%G&(//UDNNM]_mLnoJC.$(MMDLLs"AAnn	
 ''T^^T]]SmmK0((5D  rG   ry   )	rz   r{   r|   __doc__r-   rQ   rX   rw   r}   r~   s   @rF   r   r      s1    R %!rG   r   )eagerflash_attention_2c                   $     e Zd Z fdZd Z xZS )BarkMLPc                    t         |           t        j                  |j                  d|j                  z  |j
                        | _        t        j                  d|j                  z  |j                  |j
                        | _        t        j                  |j                        | _	        t        j                         | _        y )N   r(   )r,   r-   r   r7   r2   r)   in_projr9   r/   r.   GELUgelurC   rD   rE   s     rF   r-   zBarkMLP.__init__  s    yy!3!3Q9K9K5KRXR]R]^		!f&8&8"8&:L:LSYS^S^_zz&..1GGI	rG   c                     | j                  |      }| j                  |      }| j                  |      }| j                  |      }|S r   )r   r   r9   r.   )rC   rs   s     rF   rw   zBarkMLP.forward  s@    ]3		-0m4]3rG   rz   r{   r|   r-   rw   r}   r~   s   @rF   r   r     s    rG   r   c                   4     e Zd Zd fd	Z	 	 	 	 	 	 ddZ xZS )	BarkBlockc                    t         |           |rat        j                  |j                  |j
                        | _        t        j                  |j                  |j
                        | _        nHt        j                  |j                        | _        t        j                  |j                        | _        t        |j                     |||      | _
        t        |      | _        y )Nr(   r:   r;   )r,   r-   r   	LayerNormr2   r)   layernorm_1layernorm_2BARK_ATTENTION_CLASSES_attn_implementationattnr   mlp)rC   rD   r:   r;   rE   s       rF   r-   zBarkBlock.__init__  s      "||F,>,>V[[QD!||F,>,>V[[QD!||F,>,>?D!||F,>,>?D*6+F+FGi9
	 6?rG   c           	          | j                  |      }| j                  |||||||      }	|	d   }
|	dd  }||
z   }|| j                  | j                  |            z   }|f|z   S )Nrt   rh   ri   ru   rv   rp   r   r   )r   r   r   r   )rC   rs   rt   rh   ri   ru   rv   rp   intermediary_hidden_statesattn_outputsrm   outputss               rF   rw   zBarkBlock.forward(  s     &*%5%5m%D"yy&+)/) ! 
 #1oqr"%2[%@"%?$((78C
 &
" +,w66rG   rx   ry   r   r~   s   @rF   r   r     s!    #* 7rG   r   c                   b     e Zd ZU eed<   dZdZd Z fdZe	de
j                  fd       Z xZS )BarkPreTrainedModelrD   FTc                    t        |t        j                  f      rm|j                  j                  j                  d| j                  j                         |j                  %|j                  j                  j                          yyt        |t        j                        rz|j                  j                  j                  d| j                  j                         |j                  2|j                  j                  |j                     j                          yyt        |t        j                        rJ|j                  j                  j                          |j                  j                  j                  d       yy)zInitialize the weights.r   )meanstdNrZ   )
isinstancer   r7   weightdatanormal_rD   initializer_ranger)   zero_	Embeddingpadding_idxr   fill_rC   modules     rF   _init_weightsz!BarkPreTrainedModel._init_weightsO  s   fryyl+ MM&&CT[[5R5R&S{{&  &&( '-MM&&CT[[5R5R&S!!-""6#5#56<<> .-KK""$MM$$S) .rG   c                 $    t        |   |i | y r   )r,   r-   )rC   inputsr   rE   s      rF   r-   zBarkPreTrainedModel.__init___  s    &+F+rG   returnc                 :   t        | d      st        |       S | j                         D ]g  }t        |d      st        |j                  d      s'|j                  j                  >t        j                  |j                  j                        c S  t        |       S )
        `torch.device`: The device on which the module is (assuming that all the module parameters are on the same
        device).
        _hf_hookexecution_device)hasattrr   modulesr   r   r=   devicer   s     rF   r   zBarkPreTrainedModel.deviceb  s     tZ('--lln 	FF
+FOO-?@OO44@||FOO$D$DEE	F $D))rG   )rz   r{   r|   r   __annotations__supports_gradient_checkpointing_supports_flash_attnr   r-   propertyr=   r   r}   r~   s   @rF   r   r   I  s>    &+#* , * * *rG   r   c                       e Zd ZU eed<    fdZd Zd Zd Z	 	 	 	 	 	 d fd	Z	e
	 	 	 	 	 	 	 	 	 	 	 	 ddeej                     deeej                        d	eej                     d
eej                     deej                     deej                      deej                     dee   dee   dee   dee   deej                     deeej                     ef   fd       Z xZS )BarkCausalModelrD   c           
         t         |   |       || _        t        j                  |j
                  |j                        | _        t        j                  |j                  |j                        | _	        t        j                  |j                        | _        t        j                  t        |j                        D cg c]  }t!        |d|       c}      | _        |j$                  dk(  | _        t        j(                  |j                  |j*                        | _        t        j.                  |j                  |j0                  d      | _        d| _        | j7                          y c c}w )NTr   r   r(   F)r,   r-   rD   r   r   input_vocab_sizer2   input_embeds_layerr<   position_embeds_layerr/   r.   drop
ModuleListrange
num_layersr   layersr   _use_flash_attention_2r   r)   layernorm_finalr7   output_vocab_sizelm_headgradient_checkpointing	post_init)rC   rD   irE   s      rF   r-   zBarkCausalModel.__init__{  s
     #%,,v/F/FHZHZ"[%'\\&2C2CVEWEW%X"JJv~~.	mm]bcictct]u$vXYYvQR%S$vw&,&A&AEX&X#!||F,>,>V[[Qyy!3!3V5M5MTYZ&+# 	 %ws    E.c                      y r    rC   s    rF   get_output_embeddingsz%BarkCausalModel.get_output_embeddings  s     rG   c                     | j                   S r   r   r   s    rF   get_input_embeddingsz$BarkCausalModel.get_input_embeddings  s    &&&rG   c                     || _         y r   r   rC   new_embeddingss     rF   set_input_embeddingsz$BarkCausalModel.set_input_embeddings  s
    "0rG   c           
      `    t        
|   |f||||||d|}	|	j                  dd       |	d<   |	S )N)rh   inputs_embedsrt   position_idsru   rp   r   input_embeds)r,   prepare_inputs_for_generationpop)rC   	input_idsrh   r   rt   r   ru   rp   r   model_inputsrE   s             rF   r   z-BarkCausalModel.prepare_inputs_for_generation  sW     w<	
)&+%)	
 	
 (4'7'7'N^$rG   r   rt   rh   r   ri   labelsr   ru   rv   output_hidden_statesreturn_dictrp   r   c           
      `   |	|	n| j                   j                  }	|
|
n| j                   j                  }
||n| j                   j                  }||n| j                   j                  }d}|t        d      ||t        d      ||n"|| j                  |      }n|nt        d      |j                         dd }|j                  d   }|d   }||j                  n|j                  }| j                  r%| j                  r|rt        j                  d       d}|r|t        | j                   	      }|r:t!        |t"              r*t        j                  d
       t        j$                  |      }||j'                         nd}|;t)        j*                  |||z   t(        j,                  |      }|j/                  d      }| j1                  |      }|O|dk  rt        d      | j2                  r	d|v r|nd}n*|j5                  |d      }t7        ||j8                  d      }| j;                  || j                   j<                        }| j?                  ||z         }||j                  d      fz   }|	rdnd}|
rdnd}tA        | jB                        D ]0  \  }}|
r||fz   } ||||||   ||	|      }|d   }|	s(||d   fz   }2 | jE                  |      }|j5                  |      }|
r||fz   }| jG                  |      }|st#        d d||||fD              S tI        |||||      S )a  
        input_embeds (`torch.FloatTensor` of shape `(batch_size, input_sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
            Here, due to `Bark` particularities, if `past_key_values` is used, `input_embeds` will be ignored and you
            have to use `input_ids`. If `past_key_values` is not used and `use_cache` is set to `True`, `input_embeds`
            is used in priority instead of `input_ids`.
        NzXTraining is not implemented yet for Bark - ensure you do not pass `labels` to the model.CYou cannot specify both input_ids and input_embeds at the same time4You have to specify either input_ids or input_embedsrJ   r   zZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...F)rD   zPassing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.58.0. You should pass an instance of `DynamicCache` instead, e.g. `past_key_values=DynamicCache.from_legacy_cache(past_key_values)`.r+   r   $batch_size has to be defined and > 0r   tgt_lenr   r   c              3   &   K   | ]	  }||  y wr   r   .0vs     rF   	<genexpr>z*BarkCausalModel.forward.<locals>.<genexpr>=  s      fgfs   )losslogitsrt   rs   
attentions)%rD   rv   r   ru   use_return_dictNotImplementedErrorr6   r   rK   shaper   r   r   loggerwarning_oncer   r   tuplefrom_legacy_cacheget_seq_lengthr=   arangelong	unsqueezer   r   rA   r   r+   get_head_maskr   r   	enumerater   r   r   r   )rC   r   rt   rh   r   ri   r   r   ru   rv   r   r   rp   r   input_shaper   
seq_lengthr   past_lengthposition_embedsrs   output_shapeall_self_attentionsall_hidden_statesr   blockr   r  s                               rF   rw   zBarkCausalModel.forward  s   . 2C1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	%0%<k$++B]B]%j   \%=bcc%/*A "229=L%STT"'')#2.!''*
 _
%.%:!!@S@S&&4==##p "	0*$++>OOU;U
 +<<_MO:I:Uo446[\ <<Z+5MUZU_U_hnoL'11!4L44\B %Q !GHH**343FD!/!4!4Z!D "<NLL^L^hi!j &&y$++2H2HI			,"@A"m&8&8&<%>>$5b4"6BD!$++. 	JHAu#$58H$H! /-#A,#"3-G $AJM &9WQZM&I##	J& ,,];%**<8   1]4D Dm,  &/;LNab   &++*
 	
rG   )NNNNNN)NNNNNNNNNNNN)rz   r{   r|   r    r   r-   r   r   r   r   r   r   r=   Tensorr  FloatTensor
LongTensorr@   r   r   rw   r}   r~   s   @rF   r   r   x  st   *
'1 4  -1>B15/3,0-1/3$(,0/3&*15Q
ELL)Q
 "%(9(9":;Q
 !.	Q

 u||,Q
 ELL)Q
 ))*Q
 u||,Q
 D>Q
 $D>Q
 'tnQ
 d^Q
 !.Q
 
uU\\"$::	;Q
 Q
rG   r   z
    Bark semantic (or text) model. It shares the same architecture as the coarse model.
    It is a GPT-2 like autoregressive model with a language modeling head on top.
    )custom_introc                        e Zd ZU dZeed<   	 	 	 d	dej                  dede	e
eej                  f      de	ej                     dej                  f
 fdZ xZS )
BarkSemanticModelsemanticrD   r   semantic_generation_confighistory_promptrh   r   c           
         |t        d      |j                  d   }|j                  }||j                  z   }|-|j	                  d|z
  j                         |j                        }|E|d   | d }t        j                  j                  |d|t        |      z
  f|j                  d      }nLt        j                  |j                  g|z  t        j                        j                  | j                         }t        j"                  |d   |d	      }t        j                  |j$                  gg|z  t        j                        j                  | j                         }	t        j&                  | j)                  |ddd|f         | j)                  |ddd|dz   f         z   | j)                  |	      gd	      }
t+        t-        |j.                  |j                              }|j1                  t+        t-        |j                  dz   | j2                  j4                                     t7        ||j                   
      }|j9                  d|j:                        }t=        |j>                  ||j                         }tA        |   t        jD                  ||dz   ft        j                  | j                         f|
||g|d|}|dd|dz   df   }|S )a  
        Generates text semantic tokens from an input prompt and an additional optional `Bark` speaker prompt.

        Args:
            input_ids (`Optional[torch.Tensor]` of shape (batch_size, seq_len), *optional*):
                Input ids, i.e tokenized input sentences. Will be truncated up to
                semantic_generation_config.max_input_semantic_length tokens. Note that the output audios will be as
                long as the longest generation among the batch.
            semantic_generation_config (`BarkSemanticGenerationConfig`):
                Generation config indicating how to generate the semantic tokens.
            history_prompt (`Optional[dict[str,torch.Tensor]]`, *optional*):
                Optional `Bark` speaker prompt.
            attention_mask (`Optional[torch.Tensor]`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
        Returns:
            torch.LongTensor: Output semantic tokens.
        N/`semantic_generation_config` has to be providedr   r   semantic_promptconstant)rg   moder*   r[   r   	min_eos_p)eos_token_idr'  r   r   )r   logits_processorgeneration_config)#r6   r  max_input_semantic_lengthtext_encoding_offsetr`   r@   text_pad_tokenr   r   padlensemantic_pad_tokenr=   rN   intrd   r   repeat_interleavesemantic_infer_tokencatr   listr   semantic_vocab_sizeextendrD   r   r   getr'  r   r(  r,   generater?   )rC   r   r  r   rh   r   r   r+  semantic_historyinfer_arrayr   tokens_to_suppress suppress_tokens_logits_processorr'  early_stopping_logits_processorsemantic_outputrE   s                   rF   r9  zBarkSemanticModel.generateT  s   < &-NOO__Q'
$>$X$X! : O OO	%!--q>/A.G.G.IKeKtKtuI%-.?@B[A[A\]!}}00 -4D0EEF0CC	  1    %||+>>?B[[chclcl bo  !223CD3I:[\]ll(==>?*LTYT]T]

"T[[/ 	 yy''	!5O6O5O2O(PQ))*:1>]@Y\]@]>];]*^_`''4
 
 ",@@B\BoBop
 	!!1DDqH$++JgJghi	
 ,II[dmdtdt+u(JJ{,F,P,PQ	*K3@@I^g^n^n+
'  '*JJ
$=$AB%))\`\g\gh
%>@_`8	

 
 *!-F-J-L*LMrG   NNN)rz   r{   r|   base_model_prefixr   r   r=   r  r#   r   dictstrr  r9  r}   r~   s   @rF   r  r  J  s     #
 DH<@15a<<a %Aa !c5<<&7!89	a
 !.a 
		a arG   r  z
    Bark coarse acoustics model.
    It shares the same architecture as the semantic (or text) model. It is a GPT-2 like autoregressive model with a
    language modeling head on top.
    c                   .    e Zd ZU dZeed<   	 ddedededededeee	e
j                  f      fd	Z	 	 	 	 	 dd
e
j                  dedededeee	e
j                  f      dee   dee
j"                  ee
j"                  e
j"                  f   f   f fdZ xZS )BarkCoarseModelcoarse_acousticsrD   max_coarse_historysemantic_to_coarse_ratior   r  codebook_sizer   c           
         |t        j                  |d   d   |d      }|d   j                         }|2t        d|j                  d         D ]  }	||	ddfxx   ||	z  z  cc<    t        j
                  |dd      j                  d      }||j                  z   }t        j                  |d   |d      }t        t        j                  ||z              }
t        |
|j                  d   |j                  d   dz  z
  t        t        j                  |j                  d   |z              g      }t        t        ||z              }|dd| df   j                         }|dd| df   j                         }|dddd	f   }||fS t        j                  g g|z  t         j                  | j                  
      }t        j                  g g|z  t         j                  | j                  
      }||fS )a  
        Preprocess the optional `Bark` speaker prompts before `self.generate`.

        Args:
            max_coarse_history (`int`):
                Maximum size of coarse tokens used.
            semantic_to_coarse_ratio (`int`):
                Ratio of semantic to coarse frequency
            batch_size (`int`):
                Batch size, i.e the number of samples.
            semantic_generation_config (`BarkSemanticGenerationConfig`):
                Generation config indicating how to generate the semantic tokens.
            codebook_size (`int`):
                Codebook channel size, i.e. the size of the output vocabulary per codebook channel.
            history_prompt (`Optional[dict[str,torch.Tensor]]`):
                Optional `Bark` speaker prompt.
        Returns: Returns:
            `tuple(torch.FloatTensor)`:
            - **x_semantic_history** (`torch.FloatTensor` -- Processed semantic speaker prompt.
            - **x_coarse_history** (`torch.FloatTensor`) -- Processed coarse speaker prompt.
        Nr#  r   r[   coarse_promptr   rJ   r   rT   r   )r=   r2  cloner   r  rU   reshaper6  r1  npfloorrb   roundrN   r   )rC   rG  rH  r   r  rI  r   x_semantic_historyx_coarse_historynmax_semantic_historyn_semantic_hist_providedn_coarse_hist_provideds                rF   preprocess_historiesz$BarkCoarseModel.preprocess_histories  s   < %!&!8!8HY9Z[_9`blrs!t-o>DDF (q"2"8"8";< @A$QT*ma.??*@
  %/?AFNNrR/2L2`2``$667G7Mz_`a $'rxx0BE]0]'^#_ '*(&,,Q/2D2J2J12MPQ2QQ!1!7!7!:=U!UVW($ &)/GJb/b)c%d"!3A8P7P7Q4Q!R!V!V!X/4J3J3K0KLPPR/3B37 "#333 "'rdZ.?uyyY]YdYd!e$||RD:,=UYYW[WbWbc!#333rG   r?  coarse_generation_configreturn_output_lengthsr   c           
         |t        d      |t        d      |j                  }|j                  }	|j                  }
|j	                  ||j
                  k(  |j                         |j                  |j                  z  |j                  z  }t        t        j                  |	|z              }||j                  k7  j                  d      }t        j                  ||z  |j                  z        }t        j                  ||j                  z        j                         }t        j                   |      j#                         }|j$                  d   }| j'                  ||	||||      \  }}|j$                  d   }t        j(                  ||g      }t        t        j*                  ||
z              }d}|j$                  d   }t-        |      D ]W  }|t        t        ||z              z   }|ddt        j                   d||z
  g      df   }|ddd|f   }t/        j0                  |d||j$                  d   z
  fd|j                        }t        j(                  |t        j2                  |j4                  gg|z  | j6                  	      |dd|	 df   g      }t9        |j$                  d   |j:                  |      }t=        | |  |f|gtA        |
||z
        |d
|}|j$                  d   }t        j(                  ||dd|df   g      }|j$                  d   |z
  }~Z |dd|df   }|r||fS |S )aW  
        Generates coarse acoustics tokens from input text semantic tokens and an additional optional `Bark` speaker
        prompt.

        Args:
            semantic_output (`torch.Tensor` of shape (batch_size, seq_len), *optional*):
                Input text semantic ids, i.e the output of `BarkSemanticModel.generate`.
            semantic_generation_config (`BarkSemanticGenerationConfig`):
                Generation config indicating how to generate the semantic tokens.
            coarse_generation_config (`BarkCoarseGenerationConfig`):
                Generation config indicating how to generate the coarse tokens.
            codebook_size (`int`, *optional*, defaults to 1024):
                Codebook channel size, i.e. the size of the output vocabulary per codebook channel.
            history_prompt (`Optional[dict[str,torch.Tensor]]`, *optional*):
                Optional `Bark` speaker prompt.
            return_output_lengths (`bool`, *optional*):
                Whether or not to return the output lengths. Useful when batching.
        Returns:
            By default:
                torch.LongTensor: Output coarse acoustics tokens.
            If `return_output_lengths=True`:
                `Tuple(torch.Tensor, torch.Tensor): The output coarse acoustics tokens, and the length of each sample
                of the batch.
        Nr"  -`coarse_generation_config` has to be providedr   r   )r   rG  rH  r   r  rI  rJ   r$  r&  )r)  max_new_tokensr*  )!r6   max_coarse_input_lengthrG  sliding_window_lenmasked_fill_r0  coarse_semantic_pad_tokencoarse_rate_hzsemantic_rate_hzn_coarse_codebooksr1  rN  rO  sumr=   rP  maxitemr  rW  hstackceilr   Fr.  rN   coarse_infer_tokenr   r
   r6  r,   r9  rb   )rC   r?  r  rX  rI  r   rY  r   r]  rG  r^  rH  rT  output_lengthsmax_generated_lenr   rQ  x_coarsebase_semantic_idxn_window_stepstotal_generated_lenlen_coarse_historyr   semantic_idxinput_coarsealternatingLogitsProcessoroutput_coarseinput_coarse_lencoarse_outputrE   s                                rF   r9  zBarkCoarseModel.generate  s   F &-NOO#+LMM":"R"R5HH5HH 	$$9LLL$>>	
 %33(99:&99: 	!
  #288,>AY,Y#Z[)-E-_-__ddefg558P8c8cc
 ^6N6a6a%abffh!IIn5::<$**1-
'+'@'@)1%=!'A' (A (
$H /44Q7,,(:O'LMRWW%69K%KLM%^^A.~& (	A,s59LOg9g3h/iiL +1bffaH\9\5].^.`+`aL'+C,C+C(CDL55+l.@.@.DDE(BB	L !<< LL#;#N#N"O!PS]!]fjfqfqrQ!3 3 445L *M""1%*>>*& "G,"<!="#57HK^7^_":	
 M  ,11!4||X}Q@P@Q=Q/R$STH"*.."36H"HQ(	T !$6$7!78  .00rG   r   )NN   NN)rz   r{   r|   rA  r   r   r1  r   rB  rC  r=   r  rW  r#   r!   r@   r   r  r  r9  r}   r~   s   @rF   rE  rE    s    + =AH4H4 #&H4 	H4
 %(H4 H4 !c5<<&7!89H4Z DH?C!<@04FF %AF #=	F
 F !c5<<&7!89F  (~F 
uu'7'79I9I'I!JJ	KF FrG   rE  z
    Bark fine acoustics model. It is a non-causal GPT-like model with `config.n_codes_total` embedding layers and
    language modeling heads, one for each codebook.
    c                   f    e Zd ZU dZeed<   dZ fdZd Zd Z	d Z
d Zd"d	Z	 	 	 d#d
ee   dee   dedej"                  fdZd Zd Ze	 	 	 	 	 	 	 	 	 d$dedeej.                     deej.                     deej.                     deej.                     deej0                     deej.                     dee   dee   dee   deeej.                     ef   fd       Z ej:                         	 	 	 	 	 d%dej.                  dedede ded ee!e"ej.                  f      dej0                  fd!       Z# xZ$S )&BarkFineModelfine_acousticsrD   codebook_idxc           
         t         |   |       || _        t        j                  t        |j                        D cg c],  }t        j                  |j                  |j                        . c}      | _
        t        j                  |j                  |j                        | _        t        j                  |j                        | _        t        j                  t        |j                         D cg c]  }t#        |d|       c}      | _        |j&                  dk(  | _        t        j*                  |j                        | _        t        j                  t        |j.                  |j                        D cg c].  }t        j0                  |j                  |j2                  d      0 c}      | _        d| _        |j                  | _        | j9                          y c c}w c c}w c c}w )NFr   r   r(   )r,   r-   rD   r   r   r   n_codes_totalr   r   r2   input_embeds_layersr<   r   r/   r.   r   r   r   r   r   r   r   r   n_codes_givenr7   r   lm_headsr   r   )rC   rD   r   r   rE   s       rF   r-   zBarkFineModel.__init__  sx     $&==PUV\VjVjPkl1R\\&1163E3EFl$
  &(\\&2C2CVEWEW%X"JJv~~.	mmFKFL]L]F^_Yv!<_
 '-&A&AEX&X#!||F,>,>? v33V5I5IJ 		&,,f.F.FUS
 ',##11 	/ m `s   1G&4G+>3G0c                     | j                   S r   r  r   s    rF   r   z"BarkFineModel.get_input_embeddings  s    '''rG   c                     || _         y r   r  r   s     rF   r   z"BarkFineModel.set_input_embeddings  s
    #1 rG   c                     | j                   S r   r  r   s    rF   r   z#BarkFineModel.get_output_embeddings  s    }}rG   c                     || _         y r   r  )rC   new_output_embeddingss     rF   set_output_embeddingsz#BarkFineModel.set_output_embeddings  s	    -rG   c                    | j                         }t        j                  |D cg c]  }| j                  ||||       c}      }| j	                  |       |d   j
                  j                  d   }| j                         j| j                  j                  sT| j                         }t        j                  |D cg c]  }| j                  ||       c}      }	| j                  |	       | j                         S c c}w c c}w )Nr   )r   r   r   _get_resized_embeddingsr   r   r  r   rD   tie_word_embeddings_get_resized_lm_headr  )
rC   new_num_tokenspad_to_multiple_ofmean_resizingold_embeddings_listold_embeddingsnew_embeddings_listold_lm_head_listold_lm_headnew_lm_head_lists
             rF   _resize_token_embeddingsz&BarkFineModel._resize_token_embeddings  s    "779 mm ':" ,,^^M_ano
 	!!"56,Q/66<<Q? %%'3DKK<[<[#99;!}}[klK**;Gl  &&'78((**! ms   C=<Dr  r  r  r   c                    | j                  |||      }|||S |d   j                  j                  d   | j                  _        |d   j                  j                  d   | j                  _        |d   j                  j                  d   | _        |d   j                  j                  d   | _        | j                          |S )a  
        Resizes input token embeddings matrix of the model if `new_num_tokens != config.vocab_size`.

        Takes care of tying weights embeddings afterwards if the model class has a `tie_weights()` method.

        Arguments:
            new_num_tokens (`int`, *optional*):
                The number of new tokens in the embedding matrix. Increasing the size will add newly initialized
                vectors at the end. Reducing the size will remove vectors from the end. If not provided or `None`, just
                returns a pointer to the input tokens `torch.nn.Embedding` module of the model without doing anything.
            pad_to_multiple_of (`int`, *optional*):
                If set will pad the embedding matrix to a multiple of the provided value.

                This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
                `>= 7.5` (Volta), or on TPUs which benefit from having sequence lengths be a multiple of 128. For more
                details about this, or help on choosing the correct value for resizing, refer to this guide:
                https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc
            mean_resizing (`bool`):
                Whether to initialize the added embeddings from a multivariate normal distribution that has old embeddings' mean and
                covariance or to initialize them with a normal distribution that has a mean of zero and std equals `config.initializer_range`.

                Setting `mean_resizing` to `True` is useful when increasing the size of the embeddings of causal language models,
                where the generated tokens' probabilities won't be affected by the added embeddings because initializing the new embeddings with the
                old embeddings' mean will reduce the kl-divergence between the next token probability before and after adding the new embeddings.
                Refer to this article for more information: https://nlp.stanford.edu/~johnhew/vocab-expansion.html

        Return:
            `torch.nn.Embedding`: Pointer to the input tokens Embeddings Module of the model.
        r   )r  r   r  rD   r   
vocab_sizetie_weights)rC   r  r  r  model_embedss        rF   resize_token_embeddingsz%BarkFineModel.resize_token_embeddings  s    F 44^EWYfg!&8&@ )5Q(>(>(D(DQ(G%!-a!7!7!=!=a!@!-a!7!7!=!=a!@&q/0066q9 	rG   c                 l   t        | j                  dd      rg | _        | j                         }| j	                         }t        | j                  j                  | j                  j                  z
        D ]<  }| j                  ||   ||dz             | j                  j                  d| d       > y y )Nr  Tr   z	lm_heads.z.weight)
getattrrD   _tied_weights_keysr   r   r   r~  r  _tie_or_clone_weightsappend)rC   output_embeddingsinput_embeddingsr   s       rF   _tie_weightszBarkFineModel._tie_weights  s    4;; 5t<&(D# $ : : <#88:4;;44t{{7P7PPQ G**+<Q+?AQRSVWRWAXY''..1#W/EFG =rG   c                 h    | j                         D ]  }t        |d      s|j                          ! yz
        Tie the weights between the input embeddings list and the output embeddings list.

        If the `torchscript` flag is set in the configuration, can't handle parameter sharing so we are cloning the
        weights instead.
        r  Nr   r   r  r   s     rF   r  zBarkFineModel.tie_weights$  /     lln 	&Fv~.##%	&rG   r   rh   r   ri   r   r   rv   r   r   c           
         ||n| j                   j                  }|	|	n| j                   j                  }	|
|
n| j                   j                  }
d}|t	        d      |dk(  rt        d      ||t        d      ||t        d      |t        | j                        D cg c]&  \  }} ||dddd|f         j                  d      ( }}}t        j                  |d      }|ddddddd|d	z   f   j                  d      }|j                         dd }|j                  d   }|d	   }||j                  n|j                  }|8t        j                  d|t        j                   |
      }|j                  d      }| j#                  |      }|=|dk  rt        d      | j$                  r	d|v r|nd}nt'        ||j(                  d	      }| j+                  || j                   j,                        }| j/                  ||z         }||j                  d      fz   }|rdnd}|	rdnd}t        | j0                        D ]-  \  }}|	r||fz   } |||||   |      }|d   }|s%||d	   fz   }/ | j3                  |      }|j5                  |      }|	r||fz   } | j6                  || j                   j8                  z
     |      }|
st;        d d|||fD              S t=        ||||      S c c}}w )a  
        codebook_idx (`int`):
            Index of the codebook that will be predicted.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            NOT IMPLEMENTED YET.
        input_embeds (`torch.FloatTensor` of shape `(batch_size, input_sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. If
            `past_key_values` is used, optionally only the last `input_embeds` have to be input (see
            `past_key_values`). This is useful if you want more control over how to convert `input_ids` indices into
            associated vectors than the model's internal embedding lookup matrix.
        NzTraining is not implemented yetr   zRCannot predict 0th codebook - 0th codebook should be predicted by the coarse modelr   r   rJ   r[   r   r   r   r   r   )rh   ri   rv   c              3   &   K   | ]	  }||  y wr   r   r   s     rF   r   z(BarkFineModel.forward.<locals>.<genexpr>  s     lq^_^klr   )r   r  rs   r  )rD   rv   r   r  r  r6   r  r  r  r=   r4  rd  rK   r  r   r  r  r   r   r   r+   r  r   r   r   r   rA   r  r  r  r   )rC   r|  r   rh   r   ri   r   r   rv   r   r   r   r   r   r  r   r  r   r  rs   r  r  r  r  r   r  s                             rF   rw   zBarkFineModel.forward/  sa   2 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]%&GHH1qrr \%=bcc!5STT  .7t7O7O-P)A) #9Q1W#56@@DL  !99\r:L'1a1C<!3C1C(CDHHRHPL"'')#2.!''*
 ^
%.%:!!@S@S <<:UZZPVWL'11!4L44\B %Q !GHH**343FD "<NLL^L^hi!j&&y$++2H2HI			,"@A"m&8&8&<%>>$5b4"6BD!$++. 	JHAu#$58H$H!-#A,"3	G $AJM &9WQZM&I#	J  ,,];%**<8   1]4D DH|dkk.G.GGHWlT63DFY$Zlll+*	
 	
Es   &+K7rw  r  rX  fine_generation_configrI  r   c           	         |t        d      |t        d      |t        d      |j                  d|j                        }|j                  }	|j                  }
|j                  |j                  d   d|j                        }t        j                  ||j                  z
  |      }|j                  d   }|)t        j                  |d   j                  d   |d	      }nd}|j                  }t        j                  |d|j                  |z
  fd
|      }|Ct        j                   |dd|	 dddf   |gd	      }|dd|	 dddf   j                  d   }nd}d}|j                  d   |
k  r/|
|j                  d   z
  }t        j                  |ddd|fd
|      }|j                  d   |
|z
  z
  |	z  }t#        t%        j&                  |            }t)        d|      dz   }t+        |      D ]  }t-        ||	z  |j                  d   |
z
  g      }t-        |||	z  z   |j                  d   |	z
  g      }||z
  }|dd|||
z   ddf   }t+        ||j                        D ]  }| j/                  ||      j0                  }||dk(  r%|dd|dd|f   }t        j2                  |d      }nk|ddddd|f   |z  }t        j4                  |d	      dd||
f   }|j7                  d|f      }t        j8                  |d      j                  |d      }|j;                  t        j<                        }||dd|d|f<   ~~ t+        ||j                        D ]  }|dd|d|f   |dd|||
|z
  z   |f<     ~ |j?                  dd      dddd|df   }|dkD  r|ddddd| f   }|j                  d   |j                  d   k7  rt        d      |S )ap  
        Generates fine acoustics tokens from input coarse acoustics tokens and an additional optional `Bark` speaker
        prompt.

        Args:
            coarse_output (`torch.Tensor` of shape (batch_size, seq_len)):
                Input coarse acoustics ids, i.e the output of `BarkCoarseModel.generate`.
            semantic_generation_config (`BarkSemanticGenerationConfig`):
                Generation config indicating how to generate the semantic tokens.
            coarse_generation_config (`BarkCoarseGenerationConfig`):
                Generation config indicating how to generate the coarse tokens.
            fine_generation_config (`BarkFineGenerationConfig`):
                Generation config indicating how to generate the fine tokens.
            codebook_size (`int`, *optional*, defaults to 1024):
                Codebook channel size, i.e. the size of the output vocabulary per codebook channel.
            history_prompt (`Optional[dict[str,torch.Tensor]]`, *optional*):
                Optional `Bark` speaker prompt.
        Returns:
            torch.LongTensor: Output fine acoustics tokens.
        Nr"  r[  z+`fine_generation_config` has to be providedtemperaturer   rJ   fine_promptr[   r$  r   )r%  rg   rZ   )num_samplesr   rT   z-input and output should have the same seq_len) r6   r8  r  max_fine_history_lengthmax_fine_input_lengthrA   r  rc  r=   	remainderr6  r2  Tri  r.  n_fine_codebooksr4  r1  rN  rh  re  r   rb   rw   r  argmaxrc   rM  multinomialrd   int32rU   )rC   rw  r  rX  r  rI  r   r   r  r  r  r   x_fine_historyn_coarse
fine_input	n_historyn_remove_from_endn_loopsn_outer	start_idxstart_fill_idxrel_start_fill_idxinput_buffern_innerr  relevant_logitscodebook_predsprobss                               rF   r9  zBarkFineModel.generate  s   > &-NOO#+LMM!)JKK
 jj0F0R0RS"8"P"P 6 L L &**=+>+>q+A2G_GrGrs 8R8f8f(fhuv"((+
%"44^M5R5T5TUY5Z\flmnN "N+>> UU&77(BC	

 %N17N6N6OQR3R$SU_#`fghJ 'q+B*B*CQ'FGMMaPIIA!66 5
8H8H8K KzAq!5F+Gj`mnJ !&&q)-BY-NOSjjbggg&'a/A%W~ 	GW'>>
@P@PQR@SVk@klmI W'>>>
@P@PQR@SVm@mnN "0)!;%aYAV5V)VXY&YZL +A+R+RS +g|<CC&+*<&,Q0B0C^m^-S&TO%*\\/2%FN&,Q>M>-A&B[&POIIo2>qBTUjBj?jkE!MM2}*=>E%*%6%6u!%L%Q%QR\^`%aN!/!2!25;;!?@NQ 2 3W<=N+$ !+A+R+RS B !$6$7!@A ~:ORd:d(eegnnB =	@  ))!Q/1ij0@A
q #Aq*=,=+=*=$=>JB=#6#6r#::LMMrG   NT)NNT)	NNNNNNNNN)NNNrx  N)%rz   r{   r|   rA  r   r   main_input_namer-   r   r   r   r  r  r   r1  r@   r   r   r  r  r  r   r=   r  r  r   r  r   rw   no_gradr#   r!   r"   rB  rC  r9  r}   r~   s   @rF   rz  rz    s    )$OB(2.+. )-,0"	0 0 %SM0 	0
 
0d	G	&  -115/3,0-1/3,0/3&*w
w
 ELL)w
 !.	w

 u||,w
 ELL)w
 ))*w
 u||,w
 $D>w
 'tnw
 d^w
 
uU\\"N2	3w
 w
r U]]_ DH?C;?!<@F||F %AF #=	F
 !9F F !c5<<&7!89F 
		F FrG   rz  a7  
    The full Bark model, a text-to-speech model composed of 4 sub-models:
    - [`BarkSemanticModel`] (also referred to as the 'text' model): a causal auto-regressive transformer model that
      takes
    as input tokenized text, and predicts semantic text tokens that capture the meaning of the text.
    - [`BarkCoarseModel`] (also referred to as the 'coarse acoustics' model), also a causal autoregressive transformer,
    that takes into input the results of the last model. It aims at regressing the first two audio codebooks necessary
    to `encodec`.
    - [`BarkFineModel`] (the 'fine acoustics' model), this time a non-causal autoencoder transformer, which iteratively
    predicts the last codebooks based on the sum of the previous codebooks embeddings.
    - having predicted all the codebook channels from the [`EncodecModel`], Bark uses it to decode the output audio
      array.

    It should be noted that each of the first three modules can support conditional speaker embeddings to condition the
    output sound according to specific predefined voice.
    c            
       .    e Zd ZU eed<    fdZedefd       Ze	de
j                  fd       Z	 ddee   fdZddZ e
j                          	 	 	 dd	ee
j"                     d
eeee
j"                  f      dee   de
j(                  fd       Zd Z xZS )	BarkModelrD   c                    t         |   |       t        |j                        | _        t        |j                        | _        t        |j                        | _
        t        j                  |j                        | _        || _        y r   )r,   r-   r  semantic_configr  rE  coarse_acoustics_configrF  rz  fine_acoustics_configr{  r   from_configcodec_configcodec_modelrD   r   s     rF   r-   zBarkModel.__init__H  sh     )&*@*@A /0N0N O+F,H,HI$001D1DErG   r   c                      yr  r   )clss    rF   can_generatezBarkModel.can_generateS  s     rG   c                 N   t        | j                  d      st        |       S | j                  j                         D ]g  }t        |d      st        |j                  d      s'|j                  j
                  >t        j                  |j                  j
                        c S  y)r   r   r   N)r   r  r   r   r   r   r=   r   r   s     rF   r   zBarkModel.device[  s}     t}}j1'--mm++- 	FF
+FOO-?@OO44@||FOO$D$DEE	FrG   accelerator_idc                    t               rddlm} nt        d      |j	                  dd      }|dk7  rt        j                  dt               |}d}t               r(t        j                  j                         j                  }t        j                  | d|       }t        t        |      }| j                  j                  dk7  r!| j                  d       |j!                           || j"                  j$                  |      \  | j"                  _        }d	}	| j"                  | j&                  | j(                  fD ]  }
 ||
||	
      \  }}	 |	| _         || j,                  ||	
      \  }}	|	| _        y	)a  
        Offloads all sub-models to CPU using accelerate, reducing memory usage with a low impact on performance. This
        method moves one whole sub-model at a time to the accelerator when it is used, and the sub-model remains in accelerator until the next sub-model runs.

        Args:
            accelerator_id (`int`, *optional*, defaults to 0):
                accelerator id on which the sub-models will be loaded and offloaded. This argument is deprecated.
            kwargs (`dict`, *optional*):
                additional keyword arguments:
                    `gpu_id`: accelerator id on which the sub-models will be loaded and offloaded.
        r   )cpu_offload_with_hookz1`enable_model_cpu_offload` requires `accelerate`.gpu_idzThe argument `gpu_id` is deprecated and will be removed in version 4.54.0 of Transformers. Please use `accelerator_id` instead.cuda:cpuN)prev_module_hook)r   
accelerater  ImportErrorr8  warningswarnFutureWarningr   r=   acceleratorcurrent_acceleratortyper   r  rd   empty_cacher  r   rF  r{  fine_acoustics_hookr  codec_model_hook)rC   r  r   r  r  device_typer   torch_accelerator_moduler   hookcpu_offloaded_models              rF   enable_cpu_offloadzBarkModel.enable_cpu_offloadm  s\     #$8QRRHa(Q;MM R $N)+++??AFFKQ~.>?@#*5+#> ;;u$GGEN$002 /DDMMDdDdfl.m+(!MM!!$
 	`
 ,,?Z^_GAt	` $( '(8(8&SWX4 !%rG   c                    |j                  dd      }| j                  j                  j                  |      }|nt	        ||      D cg c]  \  }}|ddd|f   j                  d      ! }}}|D cg c]+  }| j                  j                  |      j                         - }}|S | j                  j                  |      }|j                  d      }|S c c}}w c c}w )z:Turn quantized audio codes into audio array using encodec.r   r   N)rU   r  	quantizerdecodezipr  decodersqueeze)rC   fine_outputrk  embsamplelout	audio_arrs           rF   codec_decodezBarkModel.codec_decode  s     "++Aq1((//<% BES.AYZ+616!RaR%=**1-ZCZRUV))11&9AACVIV
  ""**3/CAI [Vs   	$C40Cr   r   rY  c           	         t        di | j                  j                  }t        di | j                  j                  }t        di | j                  j                  }|j                  dd      |j                  dd      d}i }	i }
|j                         D ]  \  }}|j                  d      r|t        d      d }|||<   +|j                  d      r|t        d      d }||	|<   P|j                  d      r|t        d      d }||
|<   u||vr|||<   ||	vr||	|<   ||
vs||
|<    d|v r|j                  d        | j                  j                  |f||d	|}d|	v r|	j                  d        | j                  j                  |f|||| j                  j                  |d
|	}d}|r|\  }}||j                  z  }d|
v r|
j                  d        | j                   j                  |f||||| j                  j                  d|
}t#        | dd      D| j$                  j'                          | j(                  j+                  | j,                        | _        | j/                  ||      }t#        | dd      | j0                  j'                          |rH|D cg c]  }t        |       }}t2        j4                  j6                  j9                  |dd      }||fS |S c c}w )a^	  
        Generates audio from an input prompt and an additional optional `Bark` speaker prompt.

        Args:
            input_ids (`Optional[torch.Tensor]` of shape (batch_size, seq_len), *optional*):
                Input ids. Will be truncated up to 256 tokens. Note that the output audios will be as long as the
                longest generation among the batch.
            history_prompt (`Optional[dict[str,torch.Tensor]]`, *optional*):
                Optional `Bark` speaker prompt. Note that for now, this model takes only one speaker prompt per batch.
            kwargs (*optional*): Remaining dictionary of keyword arguments. Keyword arguments are of two types:

                - Without a prefix, they will be entered as `**kwargs` for the `generate` method of each sub-model.
                - With a *semantic_*, *coarse_*, *fine_* prefix, they will be input for the `generate` method of the
                semantic, coarse and fine respectively. It has the priority over the keywords without a prefix.

                This means you can, for example, specify a generation strategy for all sub-models except one.
            return_output_lengths (`bool`, *optional*):
                Whether or not to return the waveform lengths. Useful when batching.
        Returns:
            By default:
                - **audio_waveform** (`torch.Tensor` of shape (batch_size, seq_len)): Generated audio waveform.
            When `return_output_lengths=True`:
                Returns a tuple made of:
                - **audio_waveform** (`torch.Tensor` of shape (batch_size, seq_len)): Generated audio waveform.
                - **output_lengths** (`torch.Tensor` of shape (batch_size)): The length of each waveform in the batch
        Example:

        ```python
        >>> from transformers import AutoProcessor, BarkModel

        >>> processor = AutoProcessor.from_pretrained("suno/bark-small")
        >>> model = BarkModel.from_pretrained("suno/bark-small")

        >>> # To add a voice preset, you can pass `voice_preset` to `BarkProcessor.__call__(...)`
        >>> voice_preset = "v2/en_speaker_6"

        >>> inputs = processor("Hello, my dog is cute, I need him in my life", voice_preset=voice_preset)

        >>> audio_array = model.generate(**inputs, semantic_max_new_tokens=100)
        >>> audio_array = audio_array.cpu().numpy().squeeze()
        ```
        rh   Nr'  )rh   r'  	semantic_coarse_fine_r*  )r   r  )r   r  rX  rI  rY  )r   r  rX  r  rI  r  r  Tr   )batch_firstpadding_valuer   )r#   r*  r  r!   r  r"   r  r   items
startswithr/  r  r9  rF  rI  rc  r{  r  r  offloadr  rd   r   r  r  r   utilsrnnpad_sequence)rC   r   r   rY  r   r  rX  r  kwargs_semantickwargs_coarsekwargs_finerf   rg   r?  rw  rk  outputaudior  s                      rF   r9  zBarkModel.generate  s:   h &B%kDDZDZDjDj%k"#=#o@V@V@n@n#o !9!iD<R<R<h<h!i %jj)94@K6

  ,,. 	-JC~~k*#k*,-',$	*#i.*+%*c"(#g,.)#(C  o-+0OC(m+).M#&k)',K$%	-* /1 340$--00
)'A
 	
 -/126--66
)'A%=00>>"7
 
  ,9)M>+/G/Z/ZZN +-OO/0-$$--
)'A%=#900>>
 
 4.5A $$,,.#//224;;?D !!&.94+T2>!!))+ 8=>fc&k>N>HHLL--eUV-WE.((	 ?s   K%c                 h    | j                         D ]  }t        |d      s|j                          ! yr  r  r   s     rF   r  zBarkModel.tie_weightsK  r  rG   )r   r   r@  )rz   r{   r|   r   r   r-   classmethodr@   r  r   r=   r   r   r1  r  r  r  r  rB  rC  r  r9  r  r}   r~   s   @rF   r  r  3  s    & 	 T   F F F& )*8% 8%t$ U]]_ -1<@04	OELL)O !c5<<&7!89O  (~	O 
		O Ob	&rG   r  )rz  r  rE  r  r   r   )Dr   r^   r  typingr   r   numpyrN  r=   r   torch.nnr   ri  cache_utilsr   
generationr	   generation.logits_processr
   r   r   modeling_attn_mask_utilsr   modeling_flash_attention_utilsr   r   modeling_layersr   modeling_outputsr   r   modeling_utilsr   r   r  r   r   r   r   autor   configuration_barkr   r   r   r   r    generation_configuration_barkr!   r"   r#   r$   
get_loggerrz   r  Moduler&   r   r   r   r   r   r   r  rE  rz  r  __all__r   rG   rF   <module>r      s      "    $ ' ) 
 C h 9 F C     J 
		H	%p)		 p)fG!/ G!V 0 bii  17* 17h **/ ** **\O
)? O
d e eeP To TTn T' TTn $O&# O&%$O&drG   