
    h                        d dl mZmZmZ d dlZd dlZd dlmZ d dl	m
Z
mZ ddlmZ ddlmZmZmZ ddlmZ ddlmZ dd	lmZmZ dd
lmZ ddlmZ ddlmZmZm Z m!Z!m"Z" ddl#m$Z$m%Z% ddl&m'Z'm(Z( ddl)m*Z* ddl+m,Z,m-Z-m.Z. ddl/m0Z0 ddl1m2Z2  G d dejf                        Z4 G d dejf                        Z5dejl                  de7dejl                  fdZ8	 dEdejf                  dejl                  dejl                  dejl                  d eejl                     d!e9d"e9d#e*e,   fd$Z:d% Z;dFd&Z< G d' d(ejf                        Z= G d) d*ejf                        Z> G d+ d,e      Z? G d- d.e      Z@e- G d/ d0e(             ZA G d1 d2eA      ZBe- G d3 d4eA             ZC	 	 dGd5eDe7e7f   d6e9d7e7d eej                     d8e7dej                  fd9ZGe- G d: d;eA             ZHd<ejl                  d=e7d>e7fd?ZI e-d@A       G dB dCeAe             ZJg dDZKy)H    )CallableOptionalUnionN)OutputRecordercheck_model_inputs   )ACT2FN)CacheDynamicCacheEncoderDecoderCache)GenerationMixin)create_causal_mask)_prepare_4d_attention_mask#_prepare_4d_attention_mask_for_sdpa)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPast)BaseModelOutputWithPastAndCrossAttentionsSeq2SeqLMOutputSeq2SeqModelOutput)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tuple)deprecate_kwarg   )MoonshineConfigc                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )MoonshineEncoderMLPc                    t         |           || _        t        |   | _        t        j                  |j                  |j                        | _	        t        j                  |j                  |j                        | _
        y Nsuper__init__configr	   activation_fnnnLinearhidden_sizeintermediate_sizefc1fc2selfr*   
hidden_act	__class__s      n/var/www/html/eduruby.in/venv/lib/python3.12/site-packages/transformers/models/moonshine/modeling_moonshine.pyr)   zMoonshineEncoderMLP.__init__4   s^    #J/99V//1I1IJ99V55v7I7IJ    hidden_statesreturnc                 l    | j                  |      }| j                  |      }| j                  |      }|S r&   )r0   r+   r1   )r3   r8   s     r6   forwardzMoonshineEncoderMLP.forward;   s4    /**=9/r7   __name__
__module____qualname__r)   torchTensorr;   __classcell__r5   s   @r6   r$   r$   3   s$    KU\\ ell r7   r$   c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )MoonshineDecoderMLPc                    t         |           || _        t        |   | _        t        j                  |j                  |j                  dz        | _	        t        j                  |j                  |j                        | _
        y )N   r'   r2   s      r6   r)   zMoonshineDecoderMLP.__init__C   sc    #J/99V//1I1IA1MN99V55v7I7IJr7   r8   r9   c                     | j                  |      }|j                  dd      \  }}| j                  |      |z  }| j                  |      }|S )NrG   dim)r0   chunkr+   r1   )r3   r8   gates      r6   r;   zMoonshineDecoderMLP.forwardJ   sS    /+11!1<t**40=@/r7   r<   rC   s   @r6   rE   rE   B   s$    KU\\ ell r7   rE   r8   n_repr9   c                     | j                   \  }}}}|dk(  r| S | dddddddddf   j                  |||||      } | j                  |||z  ||      S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    r!   N)shapeexpandreshape)r8   rN   batchnum_key_value_headsslenhead_dims         r6   	repeat_kvrW   R   so    
 2?1D1D.Ehz!!Qa"23::5BUW\^bdlmM  (;e(CT8TTr7   modulequerykeyvalueattention_maskscalingdropoutkwargsc                 T   t        || j                        }t        || j                        }	t        j                  ||j	                  dd            |z  }
|#|d d d d d d d |j
                  d   f   }|
|z   }
t        j                  j                  |
dt        j                        j                  |j                        }
t        j                  j                  |
|| j                        }
t        j                  |
|	      }|j	                  dd      j                         }||
fS )NrG   r   rI   )rK   dtype)ptrainingr!   )rW   num_key_value_groupsr@   matmul	transposerP   r,   
functionalsoftmaxfloat32torb   r^   rd   
contiguous)rX   rY   rZ   r[   r\   r]   r^   r_   
key_statesvalue_statesattn_weightscausal_maskattn_outputs                r6   eager_attention_forwardrr   ^   s    3 ; ;<JUF$?$?@L<<z';';Aq'ABWLL!$Q1.D
0@0@0D.D%DE#k1==((2U]](SVVW\WbWbcL==((6??([L,,|\:K''1-88:K$$r7   c                 |    | ddddf   }| ddddf   }t        j                  | |fd      j                  d      S )	z*Rotates half the hidden dims of the input..r   NrG   r!   rI   rJ   ra   )r@   stackflatten)xx1x2s      r6   rotate_halfry   x   sJ    	
319B	
319B;;Ryb)11"55r7   c                    |j                  |      }|j                  |      }|dd|j                  d   dz  f   j                  dd      }|dd|j                  d   dz  f   j                  dd      }|j                  d   }| dd|f   | d|df   }}|dd|f   |d|df   }
}	||z  t        |      |z  z   }|	|z  t        |	      |z  z   }t	        j
                  ||gd      }t	        j
                  ||
gd      }||fS )a  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    .NrI   rG   rJ   )	unsqueezerP   repeat_interleavery   r@   cat)qkcossinposition_idsunsqueeze_dim
rotary_dimq_rotq_passk_rotk_passq_embedk_embeds                r6   apply_rotary_pos_embr      sD   ( --
&C
--
&C c'SYYr]a'''
(
:
:1"
:
EC
c'SYYr]a'''
(
:
:1"
:
EC 2Jc;J;&'3
+;)<6Ec;J;&'3
+;)<6E s{{51C78Gs{{51C78G ii&)r2Gii&)r2GGr7   c                       e Zd ZdZdededededef
 fdZ edd	d
      	 	 	 	 	 dde	j                  deee	j                  e	j                  f      dee	j                     d	ee   dee	j                     dee	j                     dee   dee	j                  ee	j                     eee	j                        f   fd       Z xZS )MoonshineAttentionz=Multi-headed attention from 'Attention Is All You Need' paperr*   	layer_idx	is_causalnum_attention_headsrT   c                 8   t         |           |j                  ||d       || _        || _        t        |d|j                  |j                  z        | _        |j                  |j                  z  | _
        | j                  dz  | _        |j                  | _        || _        t        j                  |j                  |j                  | j                  z  |j                         | _        t        j                  |j                  |j                  | j                  z  |j                         | _        t        j                  |j                  |j                  | j                  z  |j                         | _        t        j                  |j                  | j                  z  |j                  d      | _        | j                  j*                  C| j                  j*                  }|| j                  |z   dz
  |z  z  }|| j                  z
  | _        y d| _        y )N)r   rT   rV   g      ࿩biasFr!   r   )r(   r)   updater*   r   getattrr.   r   rV   rT   re   r]   attention_dropoutr   r,   r-   attention_biasq_projk_projv_projo_projpad_head_dim_to_multiple_ofhead_dim_padding)	r3   r*   r   r   r   rT   target_multipletarget_head_dimr5   s	           r6   r)   zMoonshineAttention.__init__   s    	.AZmno"
F4F4F&JdJd4de$*$>$>&B\B\$\!}}d*!'!9!9"ii : :T]] JQWQfQf
 ii : :T]] JQWQfQf
 ii : :T]] JQWQfQf
 ii : :T]] JFL^L^ejk ;;22>"kkEEO-$--/2QTU2UZi1ijO$3dmm$CD!$%D!r7   past_key_valuepast_key_values4.58new_nameversionr8   position_embeddingsr\   cache_positionkey_value_statesr_   r9   c                 j   |j                   d d \  }}	| j                  |      j                  ||	| j                  j                  | j
                        j                  dd      }
|d u}|Y|j                  j                  | j                        }|r&d|j                  | j                  <   |j                  }n|j                  }||n|}|rK|rIrG|j                  | j                     j                  }|j                  | j                     j                  }n| j                  |      j                  |d| j                  j                  | j
                        j                  dd      }| j!                  |      j                  |d| j                  j                  | j
                        j                  dd      }|r%|#|j#                  ||| j                  d|i      \  }}|s?|\  }}t%        |
|||      \  }
}|'|||d}|j#                  ||| j                  |      \  }}t&        }| j                  j(                  dk7  rt*        | j                  j(                     }| j,                  xr |d u xr |	dkD  }| j.                  dkD  rt0        j2                  j4                  j7                  |
d| j.                  f      }
t0        j2                  j4                  j7                  |d| j.                  f      }t0        j2                  j4                  j7                  |d| j.                  f      } || |
|||f| j8                  sd	n| j:                  | j<                  |d
|\  }}| j.                  dkD  r|dd | j.                   f   }|j?                  ||	d      jA                         }| jC                  |      }||fS )NrI   r!   rG   Tr   )r   r   r   eagerr           )r^   r]   r   .)"rP   r   viewr*   rT   rV   rg   
is_updatedgetr   cross_attention_cacheself_attention_cachelayerskeysvaluesr   r   r   r   rr   _attn_implementationr   r   r   r@   r,   rh   padrd   r   r]   rR   rl   r   )r3   r8   r   r\   r   r   r   r_   bszq_lenquery_statesis_cross_attentionr   current_statesrm   rn   r   r   cache_kwargsattention_interfacer   rq   ro   s                          r6   r;   zMoonshineAttention.forward   s    #(("-
U KK&++C8W8WY]YfYfgqqrsuvw 	 .T9&(3377GJ!=A**4>>:"1"G"G"1"F"F .>-I)}/j(//?DDJ*11$..AHHL N+c2t{{>>N1a  N+c2t{{>>N1a 
 "o&A+:+A+Adnn?OQ_>`,(
L "*HC';L*VY[^'_$L**'*3.Y+:+A+Adnnl,(
L )@;;++w6"9$++:Z:Z"[NNK~'=K%!)	  1$ 88..22<!TEZEZA[\L,,00aAVAV=WXJ 88..22<!TEZEZA[\L$7
%
  $}}C$2H2HLL
%
 
%
!\   1$%c+Cd.C.C-C+C&CDK!))#ub9DDFkk+.L((r7   )NNNNN)r=   r>   r?   __doc__r"   intboolr)   r    r@   rA   r   tupler
   
LongTensorr   r   r;   rB   rC   s   @r6   r   r      s0   G#&#& #& 	#&
 !#& !#&J %0A6R LP15+/5937U)||U) &eELL%,,,F&GHU) !.	U)
 "%U) !!1!12U) #5<<0U) -.U) 
u||Xell3XeELL>Q5RR	SU) SU)r7   r   c                   ~     e Zd ZU ej                  ed<   ddef fdZ ej                         e	d               Z
 xZS )MoonshineRotaryEmbeddinginv_freqr*   c                    t         |           t        |d      rUt        |j                  t
              r;|j                  j                  d|j                  j                  d            | _        nd| _        |j                  | _	        |j                  | _
        || _        t        | j                     | _        | j                  | j                  |      \  }| _        | j                  d|d       | j                   | _        y )Nrope_scaling	rope_typetypedefaultr   F)
persistent)r(   r)   hasattr
isinstancer   dictr   r   max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenr*   r   rope_init_fnattention_scalingregister_bufferr   original_inv_freq)r3   r*   devicer   r5   s       r6   r)   z!MoonshineRotaryEmbedding.__init__-  s    6>*z&:M:Mt/T#0044[&BUBUBYBYZ`BabDN&DN"("@"@$*$B$B!/?+/+<+<T[[&+Q($(ZeD!%r7   c                 b   | j                   d d d d f   j                         j                  |j                  d   dd      j	                  |j
                        }|d d d d d f   j                         }t        |j
                  j                  t              r/|j
                  j                  dk7  r|j
                  j                  nd}t        j                  |d      5  |j                         |j                         z  j                  dd      }t        j                  ||fd	      }|j                         | j                  z  }|j                         | j                  z  }	d d d        j	                  |j                   
      	j	                  |j                   
      fS # 1 sw Y   AxY w)Nr   rI   r!   mpscpuF)device_typeenabledrG   rJ   rb   )r   floatrQ   rP   rk   r   r   r   strr@   autocastrg   r}   r   r   r   rb   )
r3   rv   r   inv_freq_expandedposition_ids_expandedr   freqsembr   r   s
             r6   r;   z MoonshineRotaryEmbedding.forward>  sV    !MM$4-8>>@GGHZHZ[\H]_acdehhijiqiqr ,QaZ 8 > > @'1!((--'E!((--[`J`ahhmmfk^^UC 	5&,,.1F1L1L1NNYYZ[]^_E))UEN3C'')d444C'')d444C		5 vvAGGv$cff177f&;;;	5 	5s    BF%%F.r&   )r=   r>   r?   r@   rA   __annotations__r"   r)   no_gradr   r;   rB   rC   s   @r6   r   r   *  s=    ll/ /" U]]_<  <r7   r   c                   >    e Zd Zdedef fdZ eddd      	 	 	 	 	 	 ddej                  d	e	ej                     d
e	ej                     de	e   de	e   de	ej                     de	eej                  ej                  f      dee   dej                  fd       Z xZS )MoonshineEncoderLayerr*   r   c                 d   t         |           |j                  | _        t        ||d|j                  |j
                        | _        t        ||j                        | _	        t        j                  |j                  d      | _        t        j                  |j                  d      | _        y )NFr*   r   r   r   rT   r   )r(   r)   r.   r   encoder_num_attention_headsencoder_num_key_value_heads	self_attnr$   encoder_hidden_actmlpr,   	LayerNorminput_layernormpost_attention_layernormr3   r*   r   r5   s      r6   r)   zMoonshineEncoderLayer.__init__O  s    !--+ & B B & B B
 'vv/H/HI!||F,>,>UK(*V5G5Ge(T%r7   r   r   r   r   r8   r\   r   	use_cacher   r   r_   r9   c                     |}	| j                  |      } | j                  d|||||||d|\  }}
|	|z   }|}	| j                  |      }| j                  |      }|	|z   }|S )Nr8   r\   r   r   r   r   r    )r   r   r   r   )r3   r8   r\   r   r   r   r   r   r_   residual_s              r6   r;   zMoonshineEncoderLayer.forward_  s     !,,];)4>> 	
')%+) 3	
 	
q !=0 !55mD/ =0r7   )NNNFNN)r=   r>   r?   r"   r   r)   r    r@   rA   r   r   r
   r   r   r   r   r;   rB   rC   s   @r6   r   r   N  s    U U3 U  %0A6R 2637+/$)59KO|| !. u//0	
 "% D> !!1!12 &eELL%,,,F&GH +, 
 Sr7   r   c            !       &    e Zd Zddedee   f fdZ eddd      	 	 	 	 	 	 	 	 	 	 ddej                  d	eej                     d
eej                     deej                     deej                     deej                     dee   dee   deej                     deeej                  ej                  f      deeej                  ej                  f      dee   deej                   eeej                   ej                   f      f   fd       Z xZS )MoonshineDecoderLayerr*   r   c                    t         |           |j                  | _        t        ||d|j                  |j
                        | _        t        ||d|j                  |j
                        | _        t        ||j                        | _
        t        j                  |j                  d      | _        t        j                  |j                  d      | _        t        j                  |j                  d      | _        y )NTr   Fr   )r(   r)   r.   r   decoder_num_attention_headsdecoder_num_key_value_headsr   encoder_attnrE   decoder_hidden_actr   r,   r   r   r   final_layernormr   s      r6   r)   zMoonshineDecoderLayer.__init__  s    !--+ & B B & B B
 / & B B & B B
 'vv/H/HI!||F,>,>UK(*V5G5Ge(T%!||F,>,>UKr7   r   r   r   r   r8   r\   encoder_hidden_statesencoder_attention_maskr   encoder_position_idsr   r   r   encoder_position_embeddingsr_   r9   c                 (   |}| j                  |      } | j                  d||||||	|
d|\  }}||z   }|1|}| j                  |      }| j                  |||||      \  }}||z   }|}| j	                  |      }| j                  |      }||z   }|S )Nr   )r8   r   r\   r   r   r   )r   r   r   r   r   r   )r3   r8   r\   r   r  r   r  r   r   r   r   r  r_   r   r   s                  r6   r;   zMoonshineDecoderLayer.forward  s      !,,];)4>> 	
')%+) 3	
 	
q !=0 ,$H 99-HM#00+!65 /#  1  M1 %}4M ,,];/ =0r7   r&   )
NNNNNNFNNN)r=   r>   r?   r"   r   r   r)   r    r@   rA   r   r
   r   r   r   r   FloatTensorr;   rB   rC   s   @r6   r   r     s   L L8C= L0 %0A6R 268<9=37;?+/$)59KOSW.||. !..  (5	.
 !) 6. u//0. 'u'7'78. "%. D>. !!1!12. &eELL%,,,F&GH. &.eELL%,,4N.O%P. +,. 
u  (51B1BEDUDU1U+V"WW	X. S.r7   r   c                   X    e Zd ZU eed<   dZdZdZddgZdZ	dZ
dZdej                  fdZy	)
MoonshinePreTrainedModelr*   modelinput_valuesTr   r   input_lengthsc                 ~    t        |dz
  dz  dz         }t        |dz
  dz  dz         }t        |dz
  dz  dz         }|S )zH
        Computes the output length of the convolutional layers
           @   r!      r   rG   )r   )r3   r
  output_conv1_lengthoutput_conv2_lengthoutput_conv3_lengths        r6    _get_feat_extract_output_lengthsz9MoonshinePreTrainedModel._get_feat_extract_output_lengths  sZ     "=3#6""<q"@A!#6#:a"?!"CD!#6#:a"?!"CD""r7   N)r=   r>   r?   r"   r   base_model_prefixmain_input_namesupports_gradient_checkpointing_no_split_modules_supports_flash_attn_supports_sdpa_can_compile_fullgraphr@   r   r  r   r7   r6   r  r    sH    $O&*#02IJN!#e>N>N #r7   r  c            
            e Zd ZdZdZeedZdef fdZ	de
j                  fdZde
j                  fd	Ze	 ddej                   d
eej$                     dee   defd       Z xZS )MoonshineEncoderz
    Transformer encoder consisting of *config.num_hidden_layers* layers. Each layer is a [`MoonshineEncoderLayer`]

    Args:
        config: MoonshineConfig
    r	  )
attentionsr8   r*   c           	      b   t         |   |       || _        |j                  }t	        j
                  d|ddd      | _        t	        j
                  |d|z  dd	      | _        t	        j
                  d|z  |dd	      | _        t	        j                  d|d
      | _
        t        |      | _        t	        j                  t        |j                        D cg c]  }t!        ||       c}      | _        t	        j$                  |d      | _        d| _        | j+                          y c c}w )Nr!   r  r  F)kernel_sizestrider   rG   r  r   )r  r  gh㈵>)
num_groupsnum_channelsepsr*   r   )r(   r)   r*   r.   r,   Conv1dconv1conv2conv3	GroupNorm	groupnormr   
rotary_emb
ModuleListrangeencoder_num_hidden_layersr   r   r   
layer_normgradient_checkpointing	post_init)r3   r*   	embed_dimidxr5   s       r6   r)   zMoonshineEncoder.__init__  s     &&	YYq)ReT
YYy!i-QqQ
YYq9}iQqQ
PTU2&Amm;@AaAa;bcC"63/c
 ,,yu=&+#	 ds   D,r9   c                     | j                   S r&   r%  r3   s    r6   get_input_embeddingsz%MoonshineEncoder.get_input_embeddings  s    zzr7   r[   c                     || _         y r&   r4  r3   r[   s     r6   set_input_embeddingsz%MoonshineEncoder.set_input_embeddings  s	    
r7   r\   r_   c                    |j                  d      }t        j                  j                  | j	                  |            }| j                  |      }t        j                  j                  | j                  |            }t        j                  j                  | j                  |            }|j                  ddd      }|| j                  |j                  d         }d}|ddd|f   dd|f   }| j                  j                  dk(  r|d	k(  j                         r|nd}nF| j                  j                  d
k(  rt        ||j                         }nt#        ||j                         }t%        j&                  d|j                  d   |j(                        j                  d      }| j+                  ||      }| j,                  D ]  }	 |	|f|||d|} | j/                  |      }t1        |      S )a.  
        Args:
            input_values (`torch.FloatTensor` of shape `(batch_size, audio_length)`):
                Float values of the raw speech waveform. Raw speech waveform can be
                obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]`, a
                `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library (`pip install torchcodec`) or
                the soundfile library (`pip install soundfile`). To prepare the array into
                `input_values`, the [`AutoFeatureExtractor`] should be used for padding
                and conversion into a tensor of type `torch.FloatTensor`.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding indices in `input_values`. Mask values selected in `[0, 1]`:
                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.
                [What are attention masks?](../glossary#attention-mask)
        r!   r   rG   NrI     .flash_attention_2r   sdpar   )r\   r   r   )last_hidden_state)r{   r,   rh   tanhr%  r)  gelur&  r'  permuter  rP   r*   r   anyr   rb   r   r@   aranger   r*  r   r.  r   )
r3   r	  r\   r_   r8   mask_lendownsample_strider   r   encoder_layers
             r6   r;   zMoonshineEncoder.forward  s   , $--a0**4::l+CD}5**4::m+DE**4::m+DE%--aA6 %<<^=Q=QRT=UVH *+C1D3D1D,DEc9H9nUN{{//3FF4Bc4I3N3N3PVZ11V;!D^UbUhUh!i!;NML_L_!`||A}':':1'=mFZFZ[eefgh"oom\J![[ 	M)-)$7	
 M	 6&+
 	
r7   r&   )r=   r>   r?   r   r  r   r   _can_record_outputsr"   r)   r,   Moduler6  r9  r   r@   r  r   rA   r   r   r   r;   rB   rC   s   @r6   r  r    s     %O(.
 $bii "))   268
''8
 !.8
 +,	8

 
!8
 8
r7   r  c                   |    e Zd ZdZ eedd      e eedd      dZdef fdZ	e
	 	 	 	 	 	 	 	 	 ddeej                     d	eej                     d
eej                     dee   deej                      dee   deej                     deej                      deej                     dee   deeef   fd       Z xZS )MoonshineDecoder	input_idsr!   r   )index
layer_namer   )r  r8   cross_attentionsr*   c           	         t         |   |       |j                  | _        |j                  | _        t        j                  |j                  |j                  | j                        | _        t        j                  t        |j                        D cg c]  }t        ||       c}      | _        t        j                  |j                  d      | _        t!        |      | _        d| _        | j'                          y c c}w )NFr   r#  )r(   r)   pad_token_idpadding_idx
vocab_sizer,   	Embeddingr.   embed_tokensr+  r,  decoder_num_hidden_layersr   r   r   normr   r*  r/  r0  )r3   r*   r2  r5   s      r6   r)   zMoonshineDecoder.__init__P  s     !.. ++LL):):F<N<NPTP`P`amm;@AaAa;bcC"63/c
 LL!3!3%@	2&A&+# 	 ds   Dr\   r   r   inputs_embedsr   r   r   r  r_   r9   c
                    |du |duz  rt        d      || j                  |      }|r6|4t        t        | j                        t        | j                              }|F||j                         nd}t        j                  |||j                  d   z   |j                        }||j                  d      }t        | j                  |||||      }|}| j                  ||      }|	|j                  d   }d	}|	d
dd|f   d
d|f   }	| j                  j                  dk(  r|	dk(  j                         r|	nd}	nb| j                  j                  dk(  r%t        |	|j                   |j                  d         }	n$t#        |	|j                   |j                  d         }	| j$                  D ]  } ||||f|	|||||d|
} | j'                  |      }t)        ||r|      S d      S )a  
        encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
            of the decoder.
        encoder_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding indices in `encoder_hidden_states`. Mask values selected in `[0, 1]`:
            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.
            [What are attention masks?](../glossary#attention-mask)
        Nz:You must specify exactly one of input_ids or inputs_embedsr#  r   r!   r>  )r*   input_embedsr\   r   r   r   ra   r;  .r<  r   r=  )r  r   r   r   r   r   )r?  r   )
ValueErrorrU  r   r   r*   get_seq_lengthr@   rD  rP   r   r{   r   r*  r   rC  r   rb   r   r   rW  r   )r3   rL  r\   r   r   rX  r   r   r   r  r_   past_seen_tokensrp   r8   r   rE  rF  decoder_layers                     r6   r;   zMoonshineDecoder.forward`  sI   0 -t";<YZZ  --i8M01,dkk2RT`hlhshsTtuO!CRC^==?de"\\ "2]5H5H5K"KTaThThN )33A6L(;;&))+%
 &"oom\J!-,2226H *%;CATCTAT<T%UVY[d\d[dVd%e"{{//3FFDZ^aDaCfCfCh)?nr&11V;)L*M,?,?ATATUWAX*& *D*M,?,?ATATUWAX*& "[[ 	M)% (>) /#-$7 M	 		-08+/8O
 	
>B
 	
r7   )	NNNNNNNNN)r=   r>   r?   r  r   r   r   rH  r"   r)   r   r   r@   r   rA   r
   r  r   r   r   r   r   r   r;   rB   rC   s   @r6   rK  rK  G  sF   !O$%7q[Y.*+=QSab    151537+/59$(59=A9=W
E,,-W
 !.W
 u//0	W

 "%W
   1 12W
 D>W
 !!1!12W
  ((9(9:W
 !) 6W
 +,W
 
u--	.W
 W
r7   rK  rP   	mask_probmask_length	min_masksc                    | \  }dk  rt        d      kD  rt        d d d      t        j                  j                  d      j	                         fd}|-|j                         j                  d      j                         nt        |      D cg c]  } c}}t        j                  |ft        	      }	g }
 |      }|d
k(  r|	S |D ]  } ||      }t        j                  j                  t        j                  |dz
  z
        |d      }t        |      d
k(  rdz
  }n|d
   }t        j                  |t        j                  ||z
  t        j                   	      |z  g      }|
j#                  |        t        j$                  |
      }
t        j&                  |
dddddf   ||f      }
|
j)                  ||z        }
t        j                        ddddf   }t        j&                  |||f      j)                  ||z        }|
|z   }
|
j+                         dz
  kD  rdz
  |
|
dz
  kD  <   t        j,                  |	|
dd       |	S c c}w )an  
    Computes random mask spans for a given shape. Used to implement [SpecAugment: A Simple Data Augmentation Method for
    ASR](https://huggingface.co/papers/1904.08779). Note that this method is not optimized to run on TPU and should be run on
    CPU as part of the preprocessing during training.

    Args:
        shape: The shape for which to compute masks. This should be of a tuple of size 2 where
               the first element is the batch size and the second element is the length of the axis to span.
        mask_prob:  The percentage of the whole axis (between 0 and 1) which will be masked. The number of
                    independently generated mask spans of length `mask_length` is computed by
                    `mask_prob*shape[1]/mask_length`. Note that due to overlaps, `mask_prob` is an upper bound and the
                    actual percentage will be smaller.
        mask_length: size of the mask
        min_masks: minimum number of masked spans
        attention_mask: A (right-padded) attention mask which independently shortens the feature axis of
                        each batch dimension.
    r!   z&`mask_length` has to be bigger than 0.zO`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: z and `sequence_length`: `c                     t        | z  z  z         }t        |      }|z  kD  rz  }| dz
  z
  |k  rt        | dz
  z
  d      }|S )z;Given input length, compute how many spans should be maskedr!   r   )r   max)input_lengthnum_masked_spanepsilonr`  r_  ra  sequence_lengths     r6   compute_num_masked_spanz6_compute_mask_indices.<locals>.compute_num_masked_span  so    i,6DwNOoy9 [(?:-<O ;?+o=!,+/"BAFOr7   NrI   r   r   F)replace)r[  nprandomranditemdetachsumtolistr,  zerosr   choicerD  lenconcatenateonesint32appendarraybroadcast_torR   re  put_along_axis)rP   r_  r`  r\   ra  
batch_sizerj  r   r
  spec_aug_maskspec_aug_mask_idxsmax_num_masked_spanrf  rg  spec_aug_mask_idxdummy_mask_idxoffsetsrh  ri  s    `` `            @@r6   _compute_mask_indicesr    s   0 #(JQABB_$]^i]j&&7q:
 	
 iinnQ$$&G $ % 	##B'..0',Z'89!o9  HHj/:$GM1/Ba% 51,? II,,IIlkAo67RW - 
  !Q& -q0N.q1NNN(;o(MUWU]U] ^ao op
 	!!"34/52 "45 1a:&5H+(V ,33J@SVa@ab ii$T4]3Goog
4G'UV^^'+5G ,g5 /A"55GVYZGZ-!0CCD m%7B?w :s   $	I+c                       e Zd Zdef fdZd Zd Zd Zd Z	 dde	j                  dee	j                     fd	Zee	 	 	 	 	 	 	 	 	 	 dd
ee	j                     dee	j                     dee	j                     dee	j                     deeee	j                           deeeee	j                     f      deee	j                        deee	j                        dee   dee	j                     dee   defd              Z xZS )MoonshineModelr*   c                     t         |   |       t        |      | _        t	        |      | _        | j                          y r&   )r(   r)   r  encoderrK  decoderr0  r3   r*   r5   s     r6   r)   zMoonshineModel.__init__4  s2     '/'/r7   c                 .    | j                   j                  S r&   r  rU  r5  s    r6   r6  z#MoonshineModel.get_input_embeddings<  s    ||(((r7   c                 &    || j                   _        y r&   r  r8  s     r6   r9  z#MoonshineModel.set_input_embeddings?  s    $)!r7   c                     | j                   S r&   )r  r5  s    r6   get_encoderzMoonshineModel.get_encoderB  s    ||r7   c                 8    | j                   j                          y)z
        Calling this function will disable the gradient computation for the Moonshine encoder so that its parameters will
        not be updated during training.
        N)r  _freeze_parametersr5  s    r6   freeze_encoderzMoonshineModel.freeze_encoderE  s    
 	'')r7   input_featuresr\   c                 2   t        | j                  dd      s|S |j                         \  }}}| j                  j                  dkD  r| j                  rt        ||f| j                  j                  | j                  j                  || j                  j                        }t        j                  ||j                  t        j                        }|dddf   j                  d|d      }d||<   | j                  j                  dkD  r| j                  rt        ||f| j                  j                  | j                  j                  | j                  j                        }t        j                  ||j                  t        j                        }d||<   |S )	z
        Masks extracted features along time axis and/or along feature axis according to
        [SpecAugment](https://huggingface.co/papers/1904.08779).
        apply_spec_augmentTr   )r_  r`  r\   ra  )r   rb   NrI   )r_  r`  ra  )r   r*   sizemask_time_probrd   r  mask_time_lengthmask_time_min_masksr@   tensorr   r   rQ   mask_feature_probmask_feature_lengthmask_feature_min_masks)r3   r  r\   r}  r.   ri  mask_time_indicesmask_feature_indicess           r6   _mask_input_featuresz#MoonshineModel._mask_input_featuresL  s[    t{{$8$?!! 4B3F3F3H0
K;;%%)dmm 5_-++44 KK88-++99! !&->~G\G\didndn o 1!T' : A A"kSU V01N,-;;((1,#8[)++77 KK;;++<<	$  $)<<0D^MbMbjojtjt#u 34N/0r7   r	  decoder_input_idsdecoder_attention_maskencoder_outputsr   decoder_inputs_embedsdecoder_position_idsr   r   r_   r9   c                 B   | | j                   |fd|i|} | j                  d||||j                  ||||	|
d	|}t        |j                  |j                  |j
                  |j                  |j                  |j                  |j
                  |j                        S )a
  
        input_values (`torch.FloatTensor` of shape `(batch_size, audio_length)`):
            Float values of the raw speech waveform. Raw speech waveform can be
            obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]`, a
            `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library (`pip install torchcodec`) or
            the soundfile library (`pip install soundfile`). To prepare the array into
            `input_values`, the [`AutoFeatureExtractor`] should be used for padding
            and conversion into a tensor of type `torch.FloatTensor`.
        decoder_position_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`):
            Indices of positions of each input sequence tokens in the position embeddings.
            Used to calculate the position embeddings up to `config.decoder_config.max_position_embeddings`

        Example:

        ```python
        >>> import torch
        >>> from transformers import AutoFeatureExtractor, MoonshineModel
        >>> from datasets import load_dataset

        >>> model = MoonshineModel.from_pretrained("UsefulSensors/moonshine-tiny")
        >>> feature_extractor = AutoFeatureExtractor.from_pretrained("UsefulSensors/moonshine-tiny")
        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
        >>> inputs = feature_extractor(ds[0]["audio"]["array"], return_tensors="pt")
        >>> input_values = inputs.input_values
        >>> decoder_input_ids = torch.tensor([[1, 1]]) * model.config.decoder_start_token_id
        >>> last_hidden_state = model(input_values, decoder_input_ids=decoder_input_ids).last_hidden_state
        >>> list(last_hidden_state.shape)
        [1, 2, 288]
        ```
        r\   )	rL  r\   r  r   r   rX  r   r   r   )r?  r   decoder_hidden_statesdecoder_attentionsrO  encoder_last_hidden_stater   encoder_attentionsr   )r  r  r?  r   r   r8   r  rO  )r3   r	  r\   r  r  r  r   r  r  r   r   r_   decoder_outputss                r6   r;   zMoonshineModel.forwardw  s    \ "/;t||L/rYg/rkq/rOEQT\\ F
'1#1"1"C"C+/-)F
 F
 "-??+;;"1"?"?.99,==&5&G&G"1"?"?.99	
 		
r7   r&   )
NNNNNNNNNN)r=   r>   r?   r"   r)   r6  r9  r  r  r@   r  r   r   r  r   r   r   r   r   r   r   r   r   r;   rB   rC   s   @r6   r  r  2  s    )** 6:)))) !!1!12)V  59598<=AEIZ^DHBF$(59E
u001E
 !!1!12E
 $E$4$45	E

 !))9)9 :E
 "%e.?.?(@"ABE
 "%(;U5CTCT=U(U"VWE
  (e.?.?(@AE
 'uU-=-='>?E
 D>E
 !!1!12E
 +,E
 
E
  E
r7   r  rL  rQ  decoder_start_token_idc                     | j                  | j                        }| ddddf   j                         |ddddf<   ||dddf<   |t        d      |j	                  |dk(  |       |S )z1
    Shift input ids one token to the right.
    NrI   r!   r   z1self.model.config.pad_token_id has to be defined.i)	new_zerosrP   cloner[  masked_fill_)rL  rQ  r  shifted_input_idss       r6   shift_tokens_rightr    s}     "++IOO<(CRC0668ae4adLMM""#4#<lKr7   zj
    The Moonshine Model with a language modeling head. Can be used for automatic speech recognition.
    )custom_introc                       e Zd ZdgZdef fdZd Zd Zd Zd Z	de
j                  fd	Zee	 	 	 	 	 	 	 	 	 	 	 dd
eej"                     deej$                     deej$                     deej$                     deeeej"                           deeeeej"                     f      deeej"                        deeej$                        dee   deej$                     deej$                     dee   defd              Z xZS )!MoonshineForConditionalGenerationzproj_out.weightr*   c                     t         |   |       t        |      | _        t	        j
                  |j                  |j                  d      | _        | j                          y )NFr   )
r(   r)   r  r  r,   r-   r.   rS  proj_outr0  r  s     r6   r)   z*MoonshineForConditionalGeneration.__init__  sH     #F+
		&"4"4f6G6GeT 	r7   c                 6    | j                   j                         S r&   )r  r  r5  s    r6   r  z-MoonshineForConditionalGeneration.get_encoder      zz%%''r7   c                 6    | j                   j                         S r&   )r  get_decoderr5  s    r6   r  z-MoonshineForConditionalGeneration.get_decoder  r  r7   c                     | j                   S r&   r  r5  s    r6   get_output_embeddingsz7MoonshineForConditionalGeneration.get_output_embeddings  s    }}r7   c                     || _         y r&   r  )r3   new_embeddingss     r6   set_output_embeddingsz7MoonshineForConditionalGeneration.set_output_embeddings  s	    &r7   r9   c                 6    | j                   j                         S r&   )r  r6  r5  s    r6   r6  z6MoonshineForConditionalGeneration.get_input_embeddings  s    zz..00r7   r	  r\   r  r  r  r   r  r  r   r   labelsr_   c                    |9|7|5t        || j                  j                  | j                  j                        } | j                  |f||||||||	|
d	|}| j                  |j                        }d}|(| j                  ||| j                  j                        }t        |||j                  |j                  |j                  |j                  |j                  |j                  |j                   	      S )a0  
        input_values (`torch.FloatTensor` of shape `(batch_size, audio_length)`):
            Float values of the raw speech waveform. Raw speech waveform can be
            obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]`, a
            `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library (`pip install torchcodec`) or
            the soundfile library (`pip install soundfile`). To prepare the array into
            `input_values`, the [`AutoFeatureExtractor`] should be used for padding
            and conversion into a tensor of type `torch.FloatTensor`.
        decoder_position_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`):
            Indices of positions of each input sequence tokens in the position embeddings.
            Used to calculate the position embeddings up to `config.decoder_config.max_position_embeddings`

        Example:

        ```python
        >>> import torch
        >>> from transformers import AutoProcessor, MoonshineForConditionalGeneration
        >>> from datasets import load_dataset

        >>> processor = AutoProcessor.from_pretrained("UsefulSensors/moonshine-tiny")
        >>> model = MoonshineForConditionalGeneration.from_pretrained("UsefulSensors/moonshine-tiny")

        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")

        >>> inputs = processor(ds[0]["audio"]["array"], return_tensors="pt")
        >>> input_values = inputs.input_values

        >>> generated_ids = model.generate(input_values, max_new_tokens=100)

        >>> transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        >>> transcription
        'Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel.'
        ```N)	r\   r  r  r  r   r  r  r   r   )logitsr  rS  )	lossr  r   r  r  rO  r  r   r  )r  r*   rQ  r  r  r  r?  loss_functionrS  r   r   r  r  rO  r  r   r  )r3   r	  r\   r  r  r  r   r  r  r   r   r  r_   outputsr  r  s                   r6   r;   z)MoonshineForConditionalGeneration.forward  s   f  (-B-J$6DKK44dkk6X6X%! '1djj'
)/+#9+"7!5)'
 '
 w889%%VFt{{OeOe%fD#33")"?"?&99$55&-&G&G")"?"?&99

 
	
r7   )NNNNNNNNNNN)r=   r>   r?   _tied_weights_keysr"   r)   r  r  r  r  r,   rI  r6  r   r   r   r@   r  r   r   r   r   r   r   r   r   r;   rB   rC   s   @r6   r  r    s    ,, (('1bii 1  59598<=AEIZ^DHBF$(59-1T
u001T
 !!1!12T
 $E$4$45	T

 !))9)9 :T
 "%e.?.?(@"ABT
 "%(;U5CTCT=U(U"VWT
  (e.?.?(@AT
 'uU-=-='>?T
 D>T
 !!1!12T
 ))*T
 +,T
 
T
  T
r7   r  )r  r  r  )r   )Nr!   )Nr   )Ltypingr   r   r   numpyrl  r@   torch.nnr,   transformers.utils.genericr   r   activationsr	   cache_utilsr
   r   r   
generationr   masking_utilsr   modeling_attn_mask_utilsr   r   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   r   r   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   utils.deprecationr    configuration_moonshiner"   rI  r$   rE   rA   r   rW   r   rr   ry   r   r   r   r   r   r  r  rK  r   r   ndarrayr  r  r  r  __all__r   r7   r6   <module>r     s}  * - ,    I ! C C ) / g B 9  L F & I I 0 4")) "))  	UU\\ 	U# 	U%,, 	U& %II%<<% 
% <<	%
 U\\*% % % '(%46'T~) ~)B!<ryy !<H16 1hH6 HV # # #._
/ _
D p
/ p
 p
n 26tc?tt t U--.	t
 t ZZtn K
- K
 K
\%,, c [^   
p
(@/ p

p
f ^r7   