
    hA                    0   d Z ddlZddlmZmZ ddlZddlZddlmZ ddlm	Z	m
Z
mZ ddlmZ ddlmZmZmZ dd	lmZ dd
lmZmZ ddlmZ ddlmZmZmZmZmZmZm Z  ddl!m"Z" ddl#m$Z$m%Z% ddl&m'Z' ddl(m)Z)  e%jT                  e+      Z,dejZ                  de.de.fdZ/ G d dej`                        Z1 G d dejd                        Z3 G d de      Z4 G d de      Z5 G d dejd                        Z6 G d  d!ejd                        Z7e$ G d" d#e"             Z8 G d$ d%e8      Z9 G d& d'e8      Z:e$ G d( d)e8             Z; e$d*+       G d, d-e8e             Z< e$d.+       G d/ d0e8             Z=e$ G d1 d2e8             Z> G d3 d4e8      Z? G d5 d6e8e      Z@g d7ZAy)8zPyTorch MVP model.    N)OptionalUnion)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )ACT2FN)CacheDynamicCacheEncoderDecoderCache)GenerationMixin)_prepare_4d_attention_mask!_prepare_4d_causal_attention_mask)GradientCheckpointingLayer)BaseModelOutput)BaseModelOutputWithPastAndCrossAttentions!CausalLMOutputWithCrossAttentionsSeq2SeqLMOutputSeq2SeqModelOutput#Seq2SeqQuestionAnsweringModelOutputSeq2SeqSequenceClassifierOutput)PreTrainedModel)auto_docstringlogging)deprecate_kwarg   )	MvpConfig	input_idspad_token_iddecoder_start_token_idc                     | j                  | j                        }| ddddf   j                         |ddddf<   ||dddf<   |t        d      |j	                  |dk(  |       |S )z1
    Shift input ids one token to the right.
    Nr   r   z1self.model.config.pad_token_id has to be defined.i)	new_zerosshapeclone
ValueErrormasked_fill_)r   r    r!   shifted_input_idss       b/var/www/html/eduruby.in/venv/lib/python3.12/site-packages/transformers/models/mvp/modeling_mvp.pyshift_tokens_rightr+   4   s}     "++IOO<(CRC0668ae4adLMM""#4#<lK    c                   n     e Zd ZdZdedef fdZd	dej                  dedej                  f fdZ xZ	S )
MvpLearnedPositionalEmbeddingzN
    This module learns positional embeddings up to a fixed maximum size.
    num_embeddingsembedding_dimc                 N    d| _         t        | 	  || j                   z   |       y N   )offsetsuper__init__)selfr/   r0   	__class__s      r*   r6   z&MvpLearnedPositionalEmbedding.__init__J   s$     $++5}Er,   r   past_key_values_lengthposition_idsc                 $   |a|j                   dd \  }}t        j                  |||z   t        j                  | j                  j
                        j                  |d      }n|j                  d      }t        | %  || j                  z         S )z3`input_ids' shape is expected to be [bsz x seqlen].Nr3   )dtypedevicer#   r   )r%   torcharangelongweightr=   expand	unsqueezer5   forwardr4   )r7   r   r9   r:   bszseq_lenr8   s         r*   rD   z%MvpLearnedPositionalEmbedding.forwardP   s     $??2A.LC <<&(>(HPUPZPZcgcncncucufS"o  (11!4Lw|dkk9::r,   )r   N)
__name__
__module____qualname____doc__intr6   r>   TensorrD   __classcell__r8   s   @r*   r.   r.   E   sH    Fs F3 F; ;s ;^c^j^j ; ;r,   r.   c                       e Zd ZdZ	 	 	 	 ddededee   dee   dee   dee   f fdZ e	d	d
d      	 	 	 	 	 	 	 dde
j                  dee
j                     d
ee   dee
j                     dee
j                     dee
j                     dedee
j                     dee
j                  ee
j                     eee
j                        f   fd       Z xZS )MvpAttentionz=Multi-headed attention from 'Attention Is All You Need' paper	embed_dim	num_headsdropout
is_decoderbias	layer_idxc                    t         |           || _        || _        || _        ||z  | _        | j
                  |z  | j                  k7  rt        d| j                   d| d      | j
                  dz  | _        || _        || _	        t        j                  |||      | _        t        j                  |||      | _        t        j                  |||      | _        t        j                  |||      | _        y )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).g      ࿩rU   )r5   r6   rQ   rR   rS   head_dimr'   scalingrT   rV   r   Lineark_projv_projq_projout_proj)r7   rQ   rR   rS   rT   rU   rV   r8   s          r*   r6   zMvpAttention.__init__a   s     	""!Y.MMI%$..8MdnnM]$YKr3  }}d*$"ii	94@ii	94@ii	94@		)YTBr,   past_key_valuepast_key_values4.58new_nameversionhidden_stateskey_value_statesattention_masklayer_head_maskattn_promptoutput_attentionscache_positionreturnc	                 .   |du}	|j                         \  }
}}| j                  |      | j                  z  }|St        |t              rA|j
                  j                  | j                        }|	r|j                  }n|j                  }n|}|	r|n|}|	rK|IrGj                  | j                     j                  }|j                  | j                     j                  }n| j                  |      }| j                  |      }|j                  |
d| j                   | j"                        j%                  dd      }|j                  |
d| j                   | j"                        j%                  dd      }|D|	s|nd}j'                  ||| j                  d|i      \  }}|	rd|j
                  | j                  <   |t)        j*                  |d   j-                  |
ddd      |gd      }t)        j*                  |d   j-                  |
ddd      |gd      }|\t)        j.                  |
d||d   j                  d            j1                  |j2                        }t)        j*                  ||gd      }|
| j                   z  d| j"                  f}|j                  |
|| j                   | j"                        j%                  dd      } |j4                  | } |j4                  | } |j4                  | }|j                  d      }t)        j6                  ||j%                  dd            }|j                         |
| j                   z  ||fk7  r/t9        d	|
| j                   z  ||f d
|j                                |{|j                         |
d||fk7  r#t9        d|
d||f d
|j                                |j                  |
| j                   ||      |z   }|j                  |
| j                   z  ||      }t:        j<                  j?                  |d      }||j                         | j                   fk7  r*t9        d| j                   f d
|j                                |j                  dddd      |j                  |
| j                   ||      z  }|j                  |
| j                   z  ||      }|r?|j                  |
| j                   ||      }|j                  |
| j                   z  ||      }nd}t:        j<                  jA                  || j@                  | jB                        }t)        j6                  ||      }|j                         |
| j                   z  || j"                  fk7  r7t9        d|
| j                   || j"                  f d
|j                                |j                  |
| j                   || j"                        }|j%                  dd      }|j5                  |
|| jD                        }| jG                  |      }||fS )z#Input shape: Batch x Time x ChannelNr#   r   r3   rl   Tr   dimz$Attention weights should be of size z	, but is z!Attention mask should be of size z/Head mask for a single layer should be of size ptrainingz `attn_output` should be of size )$sizer^   rZ   
isinstancer   
is_updatedgetrV   cross_attention_cacheself_attention_cachelayerskeysvaluesr\   r]   viewrR   rY   	transposeupdater>   catrB   zerostor=   reshapebmmr'   r   
functionalsoftmaxrS   rs   rQ   r_   )r7   rf   rg   ra   rh   ri   rj   rk   rl   is_cross_attentionrE   tgt_len_query_statesrv   curr_past_key_valuecurrent_states
key_statesvalue_statesprompt_mask
proj_shapesrc_lenattn_weightsattn_weights_reshaped
attn_probsattn_outputs                             r*   rD   zMvpAttention.forward~   s     .T9',,.Wa {{=1DLL@&/+>?,77;;DNNK
%*9*O*O'*9*N*N'&5#-?)]/"=*,33DNNCHHJ.55dnnELLL^4J;;~6L#b$..$--PZZ[\^_`J',,S"dnndmmT^^_`bcdL*7It+>+E+Ednn?OQ_>`,(
L &AEO..t~~>"KN$9$9#r2r$JJ#W]^_J 99k!n&;&;CR&Ll%[abcL)#kk#q';q>;N;Nq;QRUUVdVkVkl!&K+Hr!SDNN*B>
#((gt~~t}}U__`acde+|++Z8'Z''4
+|++Z8//!$yyz/C/CAq/IJ3#7'"JJ6dnn8LgW^7_6` a %%'(* 
 %""$a'(BB 7a'8R7SS\]k]p]p]r\st  (,,S$..'7SVddL',,S4>>-A7GTL}},,\r,B&##%$..):: Et~~FWEX Y',,./1  +//2q!<|?P?PQTVZVdVdfmov?wwL',,S4>>-A7GTL
 %1$5$5c4>>7T[$\!055cDNN6JGU\]L$(!]]**<4<<RVR_R_*`
ii
L9#"6!OO2CRVR_R_3`2a b$$&') 
 "&&sDNNGT]]S!++Aq1 "))#wGmmK0111r,   )        FTN)NNNNNFN)rG   rH   rI   rJ   rK   r   floatboolr6   r   r>   rL   r   tuplerD   rM   rN   s   @r*   rP   rP   ^   s`   G $'%*#$(CC C %	C
 TNC tnC D>C: %0A6R 48+/1526.2"'15|2|||2 #5<<0|2 "%	|2
 !.|2 "%,,/|2 ell+|2  |2 !.|2 
u||Xell3XeELL>Q5RR	S|2 S|2r,   rP   c                        e Zd Zdef fdZ	 d
dej                  dej                  dej                  dej                  dee   de	ej                  eej                     f   fd	Z
 xZS )MvpEncoderLayerconfigc                 f   t         |           |j                  | _        t	        | j                  |j
                  |j                        | _        t        j                  | j                        | _
        |j                  | _        t        |j                     | _        |j                  | _        t        j                   | j                  |j"                        | _        t        j                   |j"                  | j                        | _        t        j                  | j                        | _        y )N)rQ   rR   rS   )r5   r6   d_modelrQ   rP   encoder_attention_headsattention_dropout	self_attnr   	LayerNormself_attn_layer_normrS   r
   activation_functionactivation_fnactivation_dropoutr[   encoder_ffn_dimfc1fc2final_layer_normr7   r   r8   s     r*   r6   zMvpEncoderLayer.__init__   s    %nn44,,

 %'LL$@!~~#F$>$>?"(";";99T^^V-C-CD99V33T^^D "T^^ <r,   rf   rh   ri   self_attn_promptrk   rm   c                    |}| j                  |||||      \  }}t        j                  j                  || j                  | j                        }||z   }| j                  |      }|}| j                  | j                  |            }t        j                  j                  || j                  | j                        }| j                  |      }t        j                  j                  || j                  | j                        }||z   }| j                  |      }|j                  t        j                  k(  rt        j                  |      j                         s#t        j                   |      j                         rEt        j"                  |j                        j$                  dz
  }t        j&                  || |      }||fS )a@  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
                `(encoder_attention_heads,)`.
            self_attn_prompt (`torch.FloatTensor`): prompt of self attention of shape
                `(2, encoder_attention_heads, pro_len, head_dim)`.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        )rf   rh   ri   rj   rk   rq   i  )minmax)r   r   r   rS   rs   r   r   r   r   r   r   r<   r>   float16isinfanyisnanfinfor   clamp)	r7   rf   rh   ri   r   rk   residualr   clamp_values	            r*   rD   zMvpEncoderLayer.forward  s   * !&*nn')+(/ '5 '
#| --mt||VZVcVc-d =011-@ **488M+BC--mt?V?Vaeanan-o/--mt||VZVcVc-d =0--m<%--/KK&**,M0J0N0N0P++m&9&9:>>EK!KKK<[YMl**r,   )F)rG   rH   rI   r   r6   r>   FloatTensorr   r   r   rD   rM   rN   s   @r*   r   r      s    =y =, -2/+((/+ ))/+ **	/+
  ++/+ $D>/+ 
u  (5+<+<"==	>/+r,   r   c            !           e Zd Zddef fdZ eddd      	 	 	 	 	 	 	 	 	 	 	 ddej                  deej                     d	eej                     d
eej                     deej                     deej                     deej                     deej                     dee	   dee
   dee
   deej                     deej                  eeej                  ej                  f      f   fd       Z xZS )MvpDecoderLayerr   c                    t         |           |j                  | _        t	        | j                  |j
                  |j                  d|      | _        |j                  | _        t        |j                     | _        |j                  | _        t        j                  | j                        | _        t	        | j                  |j
                  |j                  d|      | _        t        j                  | j                        | _        t        j$                  | j                  |j&                        | _        t        j$                  |j&                  | j                        | _        t        j                  | j                        | _        y )NT)rQ   rR   rS   rT   rV   )rS   rT   rV   )r5   r6   r   rQ   rP   decoder_attention_headsr   r   rS   r
   r   r   r   r   r   r   encoder_attnencoder_attn_layer_normr[   decoder_ffn_dimr   r   r   )r7   r   rV   r8   s      r*   r6   zMvpDecoderLayer.__init__B  s   %nn44,,
 ~~#F$>$>?"(";";$&LL$@!(NN**,,
 (*||DNN'C$99T^^V-C-CD99V33T^^D "T^^ <r,   r`   ra   rb   rc   rf   rh   encoder_hidden_statesencoder_attention_maskri   cross_attn_layer_head_maskr   cross_attn_promptrk   	use_cacherl   rm   c           	      4   |}| j                  ||	||||
|      \  }}t        j                  j                  || j                  | j                        }||z   }| j                  |      }d}|i|}| j                  ||||||	|
      \  }}t        j                  j                  || j                  | j                        }||z   }| j                  |      }|}| j                  | j                  |            }t        j                  j                  || j                  | j                        }| j                  |      }t        j                  j                  || j                  | j                        }||z   }| j                  |      }|f}|
r|||fz  }|S )aD  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            encoder_hidden_states (`torch.FloatTensor`):
                cross attention input to the layer of shape `(batch, seq_len, embed_dim)`
            encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
                `(encoder_attention_heads,)`.
            cross_attn_layer_head_mask (`torch.FloatTensor`): mask for cross-attention heads in a given layer of
                size `(decoder_attention_heads,)`.
            self_attn_prompt (`torch.FloatTensor`): prompt of self attention of shape
                `(2, decoder_attention_heads, pro_len, head_dim)`.
            cross_attn_prompt (`torch.FloatTensor`): prompt of cross attention of shape
                `(2, decoder_attention_heads, pro_len, head_dim)`.
            past_key_values (`Tuple(torch.FloatTensor)`): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        )rf   ra   rh   ri   rj   rk   rl   rq   N)rf   rg   rh   ri   rj   ra   rk   )r   r   r   rS   rs   r   r   r   r   r   r   r   r   )r7   rf   rh   r   r   ri   r   r   r   ra   rk   r   rl   r   self_attn_weightscross_attn_weightsoutputss                    r*   rD   zMvpDecoderLayer.forward^  s   L ! ,0>>'+)+(/) ,: ,
(( --mt||VZVcVc-d =011-@ " ,$H040A0A+!65 :- /"3 1B 1-M- MM11-4<<Z^ZgZg1hM$}4M 88GM !**488M+BC--mt?V?Vaeanan-o/--mt||VZVcVc-d =0--m< ")+=>>Gr,   N)NNNNNNNNFTN)rG   rH   rI   r   r6   r   r>   rL   r   r   r   r   r   rD   rM   rN   s   @r*   r   r   A  sd   =y =8 %0A6R 268<9=26=A3748+/,1$(15U||U !.U  (5	U
 !) 6U "%,,/U %-U\\$:U #5<<0U $ELL1U "%U $D>U D>U !.U 
u  (51B1BEDUDU1U+V"WW	XU SUr,   r   c                   l     e Zd ZdZdedededef fdZdej                  dej                  fd	Z	 xZ
S )
MvpClassificationHeadz-Head for sentence-level classification tasks.	input_dim	inner_dimnum_classespooler_dropoutc                     t         |           t        j                  ||      | _        t        j
                  |      | _        t        j                  ||      | _        y )Nrr   )r5   r6   r   r[   denseDropoutrS   r_   )r7   r   r   r   r   r8   s        r*   r6   zMvpClassificationHead.__init__  sD     	YYy)4
zzN3		)[9r,   rf   rm   c                     | j                  |      }| j                  |      }t        j                  |      }| j                  |      }| j	                  |      }|S r   )rS   r   r>   tanhr_   )r7   rf   s     r*   rD   zMvpClassificationHead.forward  sN    ]3

=1

=1]3m4r,   )rG   rH   rI   rJ   rK   r   r6   r>   rL   rD   rM   rN   s   @r*   r   r     sL    7
:
: 
: 	
:
 
:U\\ ell r,   r   c                   `     e Zd ZdZ fdZdej                  deej                     fdZ xZ	S )	MvpPromptz)Layer-wise prompt for encoder or decoder.c           	      8   t         |           |j                  | _        || _        || _        |j
                  |z  | _        t        j                  |j                        | _	        t        j                  |j                  |j
                        | _        t        j                  t        j                  |j
                  |j                        t        j                         t        j                  |j                  |dz  |j
                  z              | _        y )Nr   r3   )r5   r6   prompt_length
num_layersrR   r   rY   r   r   rS   	Embeddingprompt_embedding
Sequentialr[   prompt_mid_dimGELUprompt_trans)r7   r   r   rR   r8   s       r*   r6   zMvpPrompt.__init__  s    #11$")3zzFNN3 "V-A-A6>> RMMIIfnnf&;&;<GGIIIf++Z!^fnn-LM
r,   
prompt_idsrm   c                 *   | j                  | j                  |            }|j                  | j                  | j                  dz  | j
                  | j                        }| j                  |      }|j                  g d      j                  d      }|S )Nr3   )r   r3   r   r	   )
r   r   r}   r   r   rR   rY   rS   permutesplit)r7   r   prompts      r*   rD   zMvpPrompt.forward  sw    ""4#8#8#DET//11DdnnVZVcVcdf%-33A6r,   )
rG   rH   rI   rJ   r6   r>   rL   r   rD   rM   rN   s   @r*   r   r     s+    3
%,, 53F r,   r   c                   6    e Zd ZU eed<   dZdZd Zed        Z	y)MvpPreTrainedModelr   modelTc                    | j                   j                  }t        |t        j                        rY|j
                  j                  j                  d|       |j                  %|j                  j                  j                          y y t        |t        j                        rf|j
                  j                  j                  d|       |j                  2|j
                  j                  |j                     j                          y y y )Nr   )meanstd)r   init_stdru   r   r[   rA   datanormal_rU   zero_r   padding_idx)r7   moduler   s      r*   _init_weightsz MvpPreTrainedModel._init_weights  s    kk""fbii(MM&&CS&9{{&  &&( '-MM&&CS&9!!-""6#5#56<<> . .r,   c                     | j                   j                  }t        j                  g ddddd|gg| j                        }|j                  |      |d}|S )N)r      
      r3   r         r3   r=   )rh   r   )r   r    r>   tensorr=   ne)r7   	pad_tokenr   dummy_inputss       r*   r   zMvpPreTrainedModel.dummy_inputs  sW    KK,,	LL"2Q2q)4L!MVZVaVab	'll95"
 r,   N)
rG   rH   rI   r   __annotations__base_model_prefixsupports_gradient_checkpointingr   propertyr    r,   r*   r   r     s-    &*#	?  r,   r   c                       e Zd ZdZ	 ddedeej                     dee   f fdZ		 	 	 	 	 	 	 ddee
j                     dee
j                     dee
j                     d	ee
j                     d
ee   dee   dee   deeef   fdZ xZS )
MvpEncodera  
    Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
    [`MvpEncoderLayer`].

    Args:
        config: MvpConfig
        embed_tokens (nn.Embedding): output embedding
        use_prompt (bool): whether to use prompt
    r   embed_tokens
use_promptc                 2   t         |   |       |j                  | _        |j                  | _        |j
                  }|j                  | _        |j                  | _	        |j                  rt        j                  |      nd| _        ||| _        n0t        j                   |j"                  || j                        | _        t%        |j                  |      | _        t        j(                  t+        |j,                        D cg c]  }t/        |       c}      | _        t        j2                  |      | _        || _        |r7|j8                  | _        t;        ||j,                  |j<                        | _        d| _         | jC                          y c c}w )N      ?F)"r5   r6   rS   encoder_layerdrop	layerdropr   r    r   max_position_embeddingsmax_source_positionsscale_embeddingmathsqrtembed_scaler  r   r   
vocab_sizer.   embed_positions
ModuleListrangeencoder_layersr   rz   r   layernorm_embeddingr  r   r   r   r   gradient_checkpointing	post_init)r7   r   r  r  rQ   r   r8   s         r*   r6   zMvpEncoder.__init__  sD    	 ~~11NN	!..$*$B$B!393I3I499Y/s# ,D "V->->	4K[K[ \D<** 
 mmeFLaLaFb$c_V%<$cd#%<<	#: $!'!5!5D$-%%..%D! ',# %ds   Fr   rh   	head_maskinputs_embedsrk   output_hidden_statesreturn_dictrm   c                    ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }||t	        d      |$|}|j
                  }	|j                  d|	d         }n-| |j                         dd }	|dddddf   }nt	        d      || j                  |      | j                  z  }| j                  |      }
||
z   }| j                  |      }t        j                  j                  || j                  | j                        }| j                   rIt#        j$                  | j&                        j)                  | j*                        }| j-                  |      }|t/        ||j0                        }|rdnd}|rdnd}|_|j                         d   t3        | j4                        k7  r6t	        dt3        | j4                         d	|j                         d    d
      t7        | j4                        D ]|  \  }}|r||fz   }d}| j                  r&t#        j8                  g       }|| j:                  k  rd}|rd}n* ||||||   nd| j                   r|   nd|      }|d   }|st||d   fz   }~ |r||fz   }|st=        d |||fD              S t?        |||      S )a~  
        Args:
            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
                provide it.

                Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
                [`PreTrainedTokenizer.__call__`] for details.

                [What are input IDs?](../glossary#input-ids)
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
                Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:

                - 1 indicates the head is **not masked**,
                - 0 indicates the head is **masked**.

            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        NzDYou cannot specify both input_ids and inputs_embeds at the same timer#   z5You have to specify either input_ids or inputs_embedsrq   r  r   z&The head_mask should be specified for  layers, but it is for .FT)NN)ri   r   rk   r   c              3   &   K   | ]	  }||  y wr   r  .0vs     r*   	<genexpr>z%MvpEncoder.forward.<locals>.<genexpr>  s     eqWXWdes   last_hidden_staterf   
attentions) r   rk   r  use_return_dictr'   r%   r}   rt   r  r  r  r  r   r   rS   rs   r  r>   r?   r   r   r=   r   r   r<   lenrz   	enumeraterandr
  r   r   )r7   r   rh   r  r  rk   r  r  inputinput_shape	embed_posrf   r   r   encoder_statesall_attentionsidxencoder_layerto_dropdropout_probabilitylayer_outputss                        r*   rD   zMvpEncoder.forward6  s   \ 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]  ]%>cdd"E++K!r;r?;I&',,.s3K!!Q(+ETUU  --i84;K;KKM((/	%	100?--mt||VZVcVc-d ??d&8&89<<T[[IJ#44Z@ %7H[H[\N30d  ~~"s4;;'78 <S=M<N O!(+,A/ 
 #,DKK"8 	FC#!/=2B!BG}}&+jjn#&7"G , -!"7@7LYs^RV?C&6s&;TX&7! !.a 0 !/=3C2E!E1	F4  +}.>>Ne]NN$Seee+>Vd
 	
r,   NF)NNNNNNN)rG   rH   rI   rJ   r   r   r   r   r   r6   r>   
LongTensorrL   r   r   r   r   rD   rM   rN   s   @r*   r  r    s     lq$$/7/E$ZbcgZh$P 1515,059,0/3&*@
E,,-@
 !.@
 ELL)	@

   1 12@
 $D>@
 'tn@
 d^@
 
uo%	&@
r,   r  c                       e Zd ZdZ	 ddedeej                     dee   f fdZ		 	 	 	 	 	 	 	 	 	 	 	 	 ddee
j                     dee
j                     dee
j                     d	ee
j                     d
ee
j                     dee
j                     deee
j                        dee
j                     dee   dee   dee   dee   dee
j                     deeef   fdZ xZS )
MvpDecoderz
    Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`MvpDecoderLayer`]

    Args:
        config: MvpConfig
        embed_tokens (nn.Embedding): output embedding
        use_prompt (bool): whether to use prompt
    r   r  r  c           	         t         |   |       |j                  | _        |j                  | _        |j
                  | _        |j                  | _        |j                  rt        j                  |j                        nd| _        ||| _        n:t        j                   |j"                  |j                  | j                        | _        t%        |j                  |j                        | _        t        j(                  t+        |j,                        D cg c]  }t/        ||       c}      | _        t        j2                  |j                        | _        || _        |r]|j8                  | _        t;        ||j,                  |j<                        | _        t;        ||j,                  |j<                        | _         d| _!        | jE                          y c c}w )Nr  )rV   F)#r5   r6   rS   decoder_layerdropr
  r    r   r  max_target_positionsr  r  r  r   r  r  r   r   r  r.   r  r  r  decoder_layersr   rz   r   r  r  r   r   r   r   r   r  r  )r7   r   r  r  ir8   s        r*   r6   zMvpDecoder.__init__  su    	 ~~11!..$*$B$B!8>8N8N499V^^4TW# ,D "V->->PTP`P` aD<**NN 
 mmSXY_YnYnSo$pa_Vq%I$pq#%<<#? $!'!5!5D$-%%..%D!
 &/%%..&D" ',#' %qs   Gr   rh   r   r   r  cross_attn_head_maskra   r  r   rk   r  r  rl   rm   c                 L   |
|
n| j                   j                  }
||n| j                   j                  }|	|	n| j                   j                  }	||n| j                   j                  }||t        d      |$|}|j                  }|j                  d|d         }n-| |j                         dd }|dddddf   }nt        d      || j                  |      | j                  z  }| j                  r%| j                  r|	rt        j                  d       d}	|	rN|L|4t        t!        | j                         t!        | j                               nt!        | j                         }|	r:t#        |t$              r*t        j                  d       t        j&                  |      }||j)                         nd	}t+        ||||      }||t-        ||j.                  |d   
      }| j1                  ||      }||z   }| j3                  |      }t4        j6                  j9                  || j8                  | j                        }| j:                  rZt=        j>                  | j@                        jC                  | jD                        }| jG                  |      }| jI                  |      }|rdnd}|
rdnd}|
r|dnd}tK        ||gddg      D ]j  \  }}|	|j                         d	   tM        | jN                        k7  s3t        d| dtM        | jN                         d|j                         d	    d       tQ        | jN                        D ]  \  }}|r||fz  }| j                  r%t=        jR                  g       }|| jT                  k  r? ||||||||   nd|||   nd| j:                  r|   nd| j:                  r|   nd||
|	|      }|d	   }|
s||d   fz  }|||d   fz  } |r||fz  }|st%        d |||||fD              S tW        |||||      S )a  
        Args:
            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
                provide it.

                Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
                [`PreTrainedTokenizer.__call__`] for details.

                [What are input IDs?](../glossary#input-ids)
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
                Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
                of the decoder.
            encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*):
                Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values
                selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
                Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:

                - 1 indicates the head is **not masked**,
                - 0 indicates the head is **masked**.

            cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
                Mask to nullify selected heads of the cross-attention modules in the decoder to avoid performing
                cross-attention on hidden heads. Mask values selected in `[0, 1]`:

                - 1 indicates the head is **not masked**,
                - 0 indicates the head is **masked**.

            past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
                Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
                shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of
                shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.

                Contains pre-computed hidden-states (key and values in the self-attention blocks and in the
                cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.

                If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those
                that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of
                all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        NzTYou cannot specify both decoder_input_ids and decoder_inputs_embeds at the same timer#   zEYou have to specify either decoder_input_ids or decoder_inputs_embedszZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...F)r   zPassing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.58.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.r   )r   rq   r  r  r?  zThe `z` should be specified for r  r  )	r   ri   r   r   r   ra   rk   r   rl   r   r3   c              3   $   K   | ]  }|| 
 y wr   r  r!  s     r*   r$  z%MvpDecoder.forward.<locals>.<genexpr>  s      = s   )r&  ra   rf   r'  cross_attentions),r   rk   r  r   r(  r'   r%   r}   rt   r  r  r  rs   loggerwarning_oncer   r   ru   r   from_legacy_cacheget_seq_lengthr   r   r<   r  r  r   r   rS   r  r>   r?   r   r   r=   r   r   zipr)  rz   r*  r+  r
  r   )r7   r   rh   r   r   r  r?  ra   r  r   rk   r  r  rl   r,  r-  r9   	positionsrf   r   r   r   all_hidden_statesall_self_attnsall_cross_attentions	attn_mask	mask_namer1  decoder_layerr4  r5  s                                  r*   rD   zMvpDecoder.forward  s   b 2C1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	%0%<k$++B]B]  ]%>stt"E#//K!r;r?;I&',,.s3K!!Q(+Edee  --i84;K;KKM&&4==##p "	0 )4 $L$DlZ^ZeZeFfg!5 
 OU;\
 2CCOTOETE`!?!?!Afg:K8N

 !,1G1S%?&(;(;[QS_&"
 ((0FG	%	100?--mt||VZVcVc-d ??d&8&89<<T[[IJ#44Z@ $ 6 6z B #7BD0d&7<Q<]rdh %(4H(IKYoKp$q 	 Iy$>>#A&3t{{+;<$	{*DSEUDV W%NN,Q/03 	 #,DKK"8 	@C#!m%55!}}&+jjn#&7)%'=3<3H3dI]Ii,@,Eos;???"23"7PT=A__#4S#9RV /"3#-M *!,M =#3"55(4(]1-=,??(9	@>  -!11 ':K^]qr  
 9+++%1
 	
r,   r6  )NNNNNNNNNNNNN)rG   rH   rI   rJ   r   r   r   r   r   r6   r>   r7  rL   r   listr   r   r   rD   rM   rN   s   @r*   r9  r9    s~    lq&&/7/E&ZbcgZh&T 1515=A=A,07;=A59$(,0/3&*15S
E,,-S
 !.S
  ((9(9:	S

 !))9)9 :S
 ELL)S
 'u||4S
 "$u'8'8"9:S
   1 12S
 D>S
 $D>S
 'tnS
 d^S
 !.S
 
u??	@S
r,   r9  c            &       (    e Zd ZdgZddgZdef fdZd Zd Zd Z	d	 Z
e	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 dd
eej                     deej                     deej                     deej                     deej                     deej                     deej                     deeej"                        deeej"                        deej"                     deej"                     dee   dee   dee   dee   deej                     deeef   f"d       Z xZS )MvpModelfinal_logits_biasencoder.embed_tokens.weightdecoder.embed_tokens.weightr   c                 z   t         |   |       |j                  |j                  }}|j                  | _        t        j                  ||j                  |      | _        t        || j                  |j                        | _
        t        || j                  |j                        | _        | j                          y r   )r5   r6   r    r  r  r   r   r   sharedr  encoderr9  decoderr  )r7   r   r   r  r8   s       r*   r6   zMvpModel.__init__  s     "("5"5v7H7HZ ++ll:v~~{K!&$++v7H7HI!&$++v7H7HI 	r,   c                     | j                   S r   )rV  r7   s    r*   get_input_embeddingszMvpModel.get_input_embeddings  s    {{r,   c                 ~    || _         | j                   | j                  _        | j                   | j                  _        y r   )rV  rW  r  rX  r7   values     r*   set_input_embeddingszMvpModel.set_input_embeddings  s)    $(KK!$(KK!r,   c                     | j                   S r   )rW  rZ  s    r*   get_encoderzMvpModel.get_encoder  s    ||r,   c                 *   | j                   sJ d       | j                  d       | j                  j                  j                  d       | j                  j                  j                  d       | j                  j
                  j                  d       y )NzHIf you want to use lightweight tuning, make sure that `use_prompt=True`.FT)r  requires_grad_rW  r   rX  r   rZ  s    r*   set_lightweight_tuningzMvpModel.set_lightweight_tuning  sj    j jjE"%%44T:%%44T:&&55d;r,   r   rh   decoder_input_idsdecoder_attention_maskr  decoder_head_maskr?  encoder_outputsra   r  decoder_inputs_embedsr   rk   r  r  rl   rm   c                 <   |D|B|t        d      t        || j                  j                  | j                  j                        }||n| j                  j
                  }||n| j                  j                  }||n| j                  j                  }||n| j                  j                  }|| j                  ||||
|||      }nI|rGt        |t              s7t        |d   t        |      dkD  r|d   ndt        |      dkD  r|d   nd      }| j                  |||d   ||||	||||||      }|s||z   S t        |j                  |j                   |j"                  |j$                  |j&                  |j                  |j"                  |j$                  	      S )
a*  
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            Mvp uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
            is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).

            For translation and summarization training, `decoder_input_ids` should be provided. If no
            `decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to the right
            for denoising pre-training following the paper.
        decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.

            If you want to change padding behavior, you should read [`modeling_mvp._prepare_decoder_attention_mask`]
            and modify to your needs. See diagram 1 in [the paper](https://huggingface.co/papers/1910.13461) for more
            information on the default strategy.
        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in `[0,
            1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        NzIf no `decoder_input_ids` or `decoder_inputs_embeds` are passed, `input_ids` cannot be `None`. Please pass either `input_ids` or `decoder_input_ids` or `decoder_inputs_embeds`.)r   rh   r  r  rk   r  r  r   r   r3   r%  )r   rh   r   r   r  r?  ra   r  r   rk   r  r  rl   )r&  ra   decoder_hidden_statesdecoder_attentionsrB  encoder_last_hidden_stater   encoder_attentions)r'   r+   r   r    r!   rk   r  r   r(  rW  ru   r   r)  rX  r   r&  ra   rf   r'  rB  )r7   r   rh   re  rf  r  rg  r?  rh  ra   r  ri  r   rk   r  r  rl   decoder_outputss                     r*   rD   zMvpModel.forward  s   f $)>)F  U  !34;;33T[[5W5W! 2C1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	%0%<k$++B]B]""ll#-#+"3%9' + O O_!M-"1!"4474H14Loa0RV14_1E1I?1-tO ,,'1"1!"4#1'!5+//!5#) ' 
  "_44!-??+;;"1"?"?.99,==&5&G&G"1"?"?.99	
 		
r,   NNNNNNNNNNNNNNNN)rG   rH   rI   "_keys_to_ignore_on_load_unexpected_tied_weights_keysr   r6   r[  r_  ra  rd  r   r   r>   r7  rL   rO  r   r   r   r   r   rD   rM   rN   s   @r*   rQ  rQ    s   *=)>&79VWy 0
<  15158<=A,0487;=A=A59=A$(,0/3&*15#t
E,,-t
 !.t
 $E$4$45	t

 !))9)9 :t
 ELL)t
 $ELL1t
 'u||4t
 "$u'8'8"9:t
 "$u'8'8"9:t
   1 12t
  ((9(9:t
 D>t
 $D>t
 'tnt
  d^!t
" !.#t
$ 
u((	)%t
 t
r,   rQ  ze
    The MVP Model with a language modeling head. Can be used for various text generation tasks.
    )custom_introc            (           e Zd Zg dZdef fdZd Zd Z	 d!dede	e   d	e
d
ej                  f fdZded
dfdZd Ze	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d"de	ej$                     de	ej&                     de	ej$                     de	ej$                     de	ej&                     de	ej&                     de	ej&                     de	eej*                        de	eej*                        de	ej*                     de	ej*                     de	ej$                     de	e
   de	e
   de	e
   de	e
   de	ej&                     d
eeef   f$d       Zdej&                  fd Z xZS )#MvpForConditionalGeneration)rS  rT  lm_head.weightr   c                 x   t         |   |       t        |      | _        | j	                  dt        j                  d| j                  j                  j                  f             t        j                  |j                  | j                  j                  j                  d      | _        | j                          y )NrR  r   FrX   )r5   r6   rQ  r   register_bufferr>   r   rV  r/   r   r[   r   lm_headr  r   s     r*   r6   z$MvpForConditionalGeneration.__init__f  s     f%
0%++q$**BSBSBbBb>c2deyy1B1B1Q1QX]^ 	r,   c                 6    | j                   j                         S r   )r   ra  rZ  s    r*   ra  z'MvpForConditionalGeneration.get_encodero      zz%%''r,   c                 6    | j                   j                         S r   )r   get_decoderrZ  s    r*   r}  z'MvpForConditionalGeneration.get_decoderr  r{  r,   Nnew_num_tokenspad_to_multiple_ofmean_resizingrm   c                 L    t         |   |||      }| j                  |       |S r   )r5   resize_token_embeddings_resize_final_logits_bias)r7   r~  r  r  new_embeddingsr8   s        r*   r  z3MvpForConditionalGeneration.resize_token_embeddingsu  s.     8I[]jk&&~6r,   c                 6   | j                   j                  d   }||k  r| j                   d d d |f   }nSt        j                  d||z
  f| j                   j                        }t        j
                  | j                   |gd      }| j                  d|       y )Nr#   r   r   ro   rR  )rR  r%   r>   r   r=   r   rx  )r7   r~  old_num_tokensnew_bias
extra_biass        r*   r  z5MvpForConditionalGeneration._resize_final_logits_bias|  s    //55b9^+--a..@AHa.)H%IRVRhRhRoRopJyy$"8"8*!E1MH0(;r,   c                 n    | j                   j                          | j                  j                  d       y r6  r   rd  ry  rc  rZ  s    r*   rd  z2MvpForConditionalGeneration.set_lightweight_tuning  $    

))+##E*r,   r   rh   re  rf  r  rg  r?  rh  ra   r  ri  labelsr   rk   r  r  rl   c                    ||n| j                   j                  }|R|rt        j                  d       d}|7|5t	        || j                   j
                  | j                   j                        }| j                  |||||||||	|
||||||      }| j                  |d         | j                  z   }d}|Ft               } ||j                  d| j                   j                        |j                  d            }|s|f|dd z   }||f|z   S |S t        |||j                  |j                  |j                   |j"                  |j$                  |j&                  |j(                  	      S )	a  
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            Mvp uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
            is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).

            For translation and summarization training, `decoder_input_ids` should be provided. If no
            `decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to the right
            for denoising pre-training following the paper.
        decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.

            If you want to change padding behavior, you should read [`modeling_mvp._prepare_decoder_attention_mask`]
            and modify to your needs. See diagram 1 in [the paper](https://huggingface.co/papers/1910.13461) for more
            information on the default strategy.
        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in `[0,
            1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example of summarization:

        Fine-tuning a model
        ```python
        >>> import torch
        >>> from transformers import AutoTokenizer, MvpForConditionalGeneration

        >>> tokenizer = AutoTokenizer.from_pretrained("RUCAIBox/mvp")
        >>> model = MvpForConditionalGeneration.from_pretrained("RUCAIBox/mvp")

        >>> inputs = tokenizer(
        ...     "Summarize: You may want to stick it to your boss and leave your job, but don't do it if these are your reasons.",
        ...     return_tensors="pt",
        ... )
        >>> labels = tokenizer("Bad Reasons To Quit Your Job", return_tensors="pt")["input_ids"]

        >>> loss = model(**inputs, labels=labels).loss
        >>> loss.backward()
        ```

        Inference after the model fine-tuned
        ```python
        >>> with torch.no_grad():
        ...     generated_ids = model.generate(**inputs)

        >>> generated_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
        ```
        NzJThe `use_cache` argument is changed to `False` since `labels` is provided.F)rh   re  rh  rf  r  rg  r?  ra   r  ri  r   rk   r  r  rl   r   r#   r   	losslogitsra   rk  rl  rB  rm  r   rn  )r   r(  rC  warningr+   r    r!   r   ry  rR  r   r}   r  r   ra   rk  rl  rB  rm  r   rn  )r7   r   rh   re  rf  r  rg  r?  rh  ra   r  ri  r  r   rk   r  r  rl   r   	lm_logitsmasked_lm_lossloss_fctoutputs                          r*   rD   z#MvpForConditionalGeneration.forward  s   d &1%<k$++B]B]klI (-B-J$6DKK44dkk6X6X%! **)/+#9/!5+'"7/!5#)!  
$ LL,t/E/EE	')H%innR9O9O&PRXR]R]^`RabN\GABK/F3A3M^%.YSYY#33")"?"?&99$55&-&G&G")"?"?&99

 
	
r,   c                 l    t        || j                  j                  | j                  j                        S r   )r+   r   r    r!   )r7   r  s     r*   %prepare_decoder_input_ids_from_labelszAMvpForConditionalGeneration.prepare_decoder_input_ids_from_labels  s%    !&$++*B*BDKKDfDfggr,   )NT)NNNNNNNNNNNNNNNNN)rG   rH   rI   rr  r   r6   ra  r}  rK   r   r   r   r   r  r  rd  r   r>   r7  rL   rO  r   r   r   r   rD   r  rM   rN   s   @r*   ru  ru  ^  s9    jy (( dh!7?}\`	< < <+  15158<=A,0487;=A=A59=A-1$(,0/3&*15%C
E,,-C
 !.C
 $E$4$45	C

 !))9)9 :C
 ELL)C
 $ELL1C
 'u||4C
 "$u'8'8"9:C
 "$u'8'8"9:C
   1 12C
  ((9(9:C
 ))*C
 D>C
 $D>C
  'tn!C
" d^#C
$ !.%C
& 
uo%	&'C
 C
JhELL hr,   ru  z
    Mvp model with a sequence classification/head on top (a linear layer on top of the pooled output) e.g. for GLUE
    tasks.
    c            $           e Zd ZddgZdef fdZd Ze	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddee	j                     dee	j                     dee	j                     d	ee	j                     d
ee	j                     dee	j                     dee	j                     deee	j                        dee	j                     dee	j                     dee	j                     dee   dee   dee   dee   deeef   f d       Z xZS )MvpForSequenceClassificationrS  rT  r   c                     t        |   |fi | t        |      | _        t	        |j
                  |j
                  |j                  |j                        | _        | j                          y r   )
r5   r6   rQ  r   r   r   
num_labelsclassifier_dropoutclassification_headr  )r7   r   kwargsr8   s      r*   r6   z%MvpForSequenceClassification.__init__  sZ    *6*f%
#8NNNN%%	$
  	r,   c                 n    | j                   j                          | j                  j                  d       y r6  )r   rd  r  rc  rZ  s    r*   rd  z3MvpForSequenceClassification.set_lightweight_tuning)  s&    

))+  //6r,   r   rh   re  rf  r  rg  r?  rh  r  ri  r  r   rk   r  r  rm   c                    ||n| j                   j                  }|d}|$|	"t        d| j                  j                         | j                  |||||||||	|
||||      }|d   }|j                  | j                   j                        j                  |j                        }t        t        j                  |j                  d                  dkD  rt        d      ||ddf   j                  |j!                  d      d|j!                  d            dddddf   }| j#                  |      }d}|| j                   j$                  | j                   j&                  dk(  rd	| j                   _        nv| j                   j&                  dkD  rL|j(                  t        j*                  k(  s|j(                  t        j,                  k(  rd
| j                   _        nd| j                   _        | j                   j$                  d	k(  rSt/               }| j                   j&                  dk(  r& ||j1                         |j1                               }n |||      }n| j                   j$                  d
k(  rGt3               } ||j                  d| j                   j&                        |j                  d            }n,| j                   j$                  dk(  rt5               } |||      }|s|f|dd z   }||f|z   S |S t7        |||j8                  |j:                  |j<                  |j>                  |j@                  |jB                  |jD                  	      S )a  
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            Mvp uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
            is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).

            For translation and summarization training, `decoder_input_ids` should be provided. If no
            `decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to the right
            for denoising pre-training following the paper.
        decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.

            If you want to change padding behavior, you should read [`modeling_mvp._prepare_decoder_attention_mask`]
            and modify to your needs. See diagram 1 in [the paper](https://huggingface.co/papers/1910.13461) for more
            information on the default strategy.
        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in `[0,
            1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).

        Example of single-label classification:

        Fine-tuning a model on `num_labels` classes
        ```python
        >>> import torch
        >>> from transformers import AutoTokenizer, MvpForSequenceClassification

        >>> num_labels = 2  # for example, this is a binary classification task
        >>> tokenizer = AutoTokenizer.from_pretrained("RUCAIBox/mvp")
        >>> model = MvpForSequenceClassification.from_pretrained("RUCAIBox/mvp", num_labels=num_labels)

        >>> inputs = tokenizer("Classify: Hello, my dog is cute", return_tensors="pt")
        >>> labels = torch.tensor(1)  # the real label for inputs

        >>> loss = model(**inputs, labels=labels).loss
        >>> loss.backward()
        ```

        Inference after the model fine-tuned
        ```python
        >>> with torch.no_grad():
        ...     logits = model(**inputs).logits

        >>> predicted_class_id = logits.argmax()
        ```
        NFz8Passing input embeddings is currently not supported for rh   re  rf  r  rg  r?  rh  r  ri  r   rk   r  r  r   r   z7All examples must have the same number of <eos> tokens.r#   
regressionsingle_label_classificationmulti_label_classificationr  )#r   r(  NotImplementedErrorr8   rG   r   eqeos_token_idr   r=   r)  r>   unique_consecutivesumr'   r}   rt   r  problem_typer  r<   r@   rK   r   squeezer   r   r   ra   rk  rl  rB  rm  r   rn  )r7   r   rh   re  rf  r  rg  r?  rh  r  ri  r  r   rk   r  r  r   rf   eos_masksentence_representationr  r  r  r  s                           r*   rD   z$MvpForSequenceClassification.forward-  s   Z &1%<k$++B]B]I!:%J4>>KbKbJcd  **)/#9/!5+'"7/!5#  
   
<< 8 89<<]=Q=QRu''Q89A=VWW"/!"<"A"A-BTBTUVBWY[]j]o]opr]s"tr1H#
 ))*AB{{''/;;))Q./;DKK,[[++a/V\\UZZ5OSYS_S_chclclSl/LDKK,/KDKK,{{''<7"9;;))Q.#FNN$4fnn6FGD#FF3D))-JJ+-B0F0F GUWY))-II,./Y,F)-)9TGf$EvE.#33")"?"?&99$55&-&G&G")"?"?&99

 
	
r,   )NNNNNNNNNNNNNNN)rG   rH   rI   rr  r   r6   rd  r   r   r>   r7  rL   rO  r   r   r   r   r   rD   rM   rN   s   @r*   r  r    s    89VWy 7  15158<=A,0487;=A59=A-1$(,0/3&*!T
E,,-T
 !.T
 $E$4$45	T

 !))9)9 :T
 ELL)T
 $ELL1T
 'u||4T
 "$u'8'8"9:T
   1 12T
  ((9(9:T
 ))*T
 D>T
 $D>T
 'tnT
  d^!T
" 
u55	6#T
 T
r,   r  c            &           e Zd ZddgZ fdZd Ze	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     deej                     d	eej                     d
eej                     deej                     deeej                        deej                     deej                     deej                     deej                     dee   dee   dee   dee   deeef   f"d       Z xZS )MvpForQuestionAnsweringrS  rT  c                     t         |   |       d|_        |j                  | _        t        |      | _        t        j                  |j                  |j                        | _        | j                          y r2   )
r5   r6   r  rQ  r   r   r[   hidden_size
qa_outputsr  r   s     r*   r6   z MvpForQuestionAnswering.__init__  s[      ++f%
))F$6$68I8IJ 	r,   c                 n    | j                   j                          | j                  j                  d       y r6  )r   rd  r  rc  rZ  s    r*   rd  z.MvpForQuestionAnswering.set_lightweight_tuning  s$    

))+&&u-r,   r   rh   re  rf  r  rg  r?  rh  start_positionsend_positionsr  ri  r   rk   r  r  rm   c                    ||n| j                   j                  }|	|
d}| j                  ||||||||||||||      }|d   }| j                  |      }|j	                  dd      \  }}|j                  d      j                         }|j                  d      j                         }d}|	|
t        |	j                               dkD  r|	j                  d      }	t        |
j                               dkD  r|
j                  d      }
|j                  d      }|	j                  d|      }	|
j                  d|      }
t        |      } |||	      } |||
      }||z   d	z  }|s||f|dd z   }||f|z   S |S t        ||||j                  |j                  |j                  |j                  |j                   |j"                  |j$                  

      S )a`  
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            Mvp uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
            is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).

            For translation and summarization training, `decoder_input_ids` should be provided. If no
            `decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to the right
            for denoising pre-training following the paper.
        decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.

            If you want to change padding behavior, you should read [`modeling_mvp._prepare_decoder_attention_mask`]
            and modify to your needs. See diagram 1 in [the paper](https://huggingface.co/papers/1910.13461) for more
            information on the default strategy.
        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in `[0,
            1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        Example:

        Fine-tuning a model for extrative question answering, and our model also supports generative question answering
        using `BartForConditionalGeneration`
        ```python
        >>> import torch
        >>> from transformers import AutoTokenizer, MvpForQuestionAnswering

        >>> tokenizer = AutoTokenizer.from_pretrained("RUCAIBox/mvp")
        >>> model = MvpForQuestionAnswering.from_pretrained("RUCAIBox/mvp")

        >>> inputs = tokenizer(
        ...     "Answer the following question: Who was Jim Henson? [SEP] Jim Henson was a nice puppet",
        ...     return_tensors="pt",
        ... )
        >>> target_start_index = torch.tensor([18])
        >>> target_end_index = torch.tensor([19])

        >>> loss = model(**inputs, start_positions=target_start_index, end_positions=target_end_index).loss
        >>> loss.backward()
        ```

        Inference after the model fine-tuned
        ```python
        >>> with torch.no_grad():
        ...     outputs = model(**inputs)

        >>> answer_start_index = outputs.start_logits.argmax()
        >>> answer_end_index = outputs.end_logits.argmax()

        >>> predict_answer_tokens = inputs.input_ids[0, answer_start_index : answer_end_index + 1]
        >>> predict_answer = tokenizer.decode(predict_answer_tokens)
        ```
        NFr  r   r   r#   ro   )ignore_indexr3   )
r  start_logits
end_logitsra   rk  rl  rB  rm  r   rn  )r   r(  r   r  r   r  
contiguousr)  rt   r   r   r   ra   rk  rl  rB  rm  r   rn  )r7   r   rh   re  rf  r  rg  r?  rh  r  r  r  ri  r   rk   r  r  r   sequence_outputr  r  r  
total_lossignored_indexr  
start_lossend_lossr  s                               r*   rD   zMvpForQuestionAnswering.forward  s   f &1%<k$++B]B]&=+DI**)/#9/!5+'"7/!5#  
" "!*1#)<<r<#: j#++B/::<''+668

&=+D?'')*Q."1"9"9""==%%'(1, - 5 5b 9(--a0M-33A}EO)//=AM']CH!,@J
M:H$x/14J F 0:/EZMF*Q6Q2%!#33")"?"?&99$55&-&G&G")"?"?&99
 	
r,   rp  )rG   rH   rI   rr  r6   rd  r   r   r>   rL   r7  rO  r   r   r   r   r   rD   rM   rN   s   @r*   r  r    s   79VW
.  -1158<=A,0487;=A6:4859=A$(,0/3&*#Q
ELL)Q
 !.Q
 $E$4$45	Q

 !))9)9 :Q
 ELL)Q
 $ELL1Q
 'u||4Q
 "$u'8'8"9:Q
 "%"2"23Q
   0 01Q
   1 12Q
  ((9(9:Q
 D>Q
 $D>Q
  'tn!Q
" d^#Q
$ 
u99	:%Q
 Q
r,   r  c                   (     e Zd ZdZ fdZd Z xZS )MvpDecoderWrapperz
    This wrapper class is a helper class to correctly load pretrained checkpoints when the causal language model is
    used in combination with the [`EncoderDecoderModel`] framework.
    c                 D    t         |   |       t        |      | _        y r   )r5   r6   r9  rX  r   s     r*   r6   zMvpDecoderWrapper.__init__u  s     !&)r,   c                 &     | j                   |i |S r   )rX  )r7   argsr  s      r*   rD   zMvpDecoderWrapper.forwardy  s    t||T,V,,r,   )rG   rH   rI   rJ   r6   rD   rM   rN   s   @r*   r  r  o  s    
*-r,   r  c            "           e Zd ZdgZ fdZd Zd Zd Zd Zd Z	e
	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddeej                     d	eej                     d
eej                     deej                     deej                     deej                     deeej                        deej                     deej                     dee   dee   dee   dee   deej                     deeef   fd       Z xZS )MvpForCausalLMrv  c                     d|_         d|_        t        |   |       t	        |      | _        t        j                  |j                  |j                  d      | _
        | j                          y )NTFrX   )rT   is_encoder_decoderr5   r6   r  r   r   r[   r  r  ry  r  r   s     r*   r6   zMvpForCausalLM.__init__  sX     $)! &v.
yy!3!3V5F5FUS 	r,   c                 B    | j                   j                  j                  S r   r   rX  r  rZ  s    r*   r[  z#MvpForCausalLM.get_input_embeddings  s    zz!!...r,   c                 :    || j                   j                  _        y r   r  r]  s     r*   r_  z#MvpForCausalLM.set_input_embeddings  s    */

'r,   c                 &    || j                   _        y r   r   rX  )r7   rX  s     r*   set_decoderzMvpForCausalLM.set_decoder  s    $

r,   c                 .    | j                   j                  S r   r  rZ  s    r*   r}  zMvpForCausalLM.get_decoder  s    zz!!!r,   c                 n    | j                   j                          | j                  j                  d       y r6  r  rZ  s    r*   rd  z%MvpForCausalLM.set_lightweight_tuning  r  r,   r   rh   r   r   r  r?  ra   r  r  r   rk   r  r  rl   rm   c                 D   ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }| j                  j                  |||||||||
|||      }| j                  |d         }d}|	Ft               } ||j                  d| j                   j                        |	j                  d            }|s|f|dd z   }||f|z   S |S t        |||j                  |j                  |j                  |j                        S )a  
        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, MvpForCausalLM

        >>> tokenizer = AutoTokenizer.from_pretrained("RUCAIBox/mvp")
        >>> model = MvpForCausalLM.from_pretrained("RUCAIBox/mvp", add_cross_attention=False)

        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
        >>> outputs = model(**inputs)

        >>> logits = outputs.logits
        >>> list(logits.shape)
        [1, 8, 50267]
        ```N)r   rh   r   r   r  r?  ra   r  r   rk   r  r  r   r#   r   )r  r  ra   rf   r'  rB  )r   rk   r  r(  r   rX  ry  r   r}   r  r   ra   rf   r'  rB  )r7   r   rh   r   r   r  r?  ra   r  r  r   rk   r  r  rl   r   r  r  r  r  s                       r*   rD   zMvpForCausalLM.forward  sF   Z 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B] **$$)"7#9!5+'/!5# % 
 gaj)')HFKKDKK,B,BCV[[QS_UDY,F'+'7D7V#CVC0#33!//))$55
 	
r,   )NNNNNNNNNNNNNN)rG   rH   rI   rr  r6   r[  r_  r  r}  rd  r   r   r>   r7  rL   r   rO  r   r   r   r   rD   rM   rN   s   @r*   r  r  }  s   *+	/0%"+  1515=A>B,07;=A59-1$(,0/3&*15T
E,,-T
 !.T
  ((9(9:	T

 !)):): ;T
 ELL)T
 'u||4T
 "$u'8'8"9:T
   1 12T
 ))*T
 D>T
 $D>T
 'tnT
 d^T
 !.T
  
u77	8!T
 T
r,   r  )r  ru  r  r  rQ  r   )BrJ   r  typingr   r   r>   torch.utils.checkpointr   torch.nnr   r   r   activationsr
   cache_utilsr   r   r   
generationr   modeling_attn_mask_utilsr   r   modeling_layersr   modeling_outputsr   r   r   r   r   r   r   modeling_utilsr   utilsr   r   utils.deprecationr   configuration_mvpr   
get_loggerrG   rC  rL   rK   r+   r   r.   ModulerP   r   r   r   r   r   r  r9  rQ  ru  r  r  r  r  __all__r  r,   r*   <module>r     s     "    A A ! C C ) :   . , 0 ( 
		H	%%,, c [^ ";BLL ;2]2299 ]2@@+0 @+Fs0 snBII 0		 2   6q
# q
hE
# E
P Y
! Y
 Y
x 
mh"4o mh
mh` i
#5 i
i
X e
0 e
 e
R-* -s
' s
lr,   