
    h~                        d Z ddlZddlmZ ddlmZ ddlZddlZddlmZ ddl	m
Z
 ddlmZ dd	lmZmZmZ dd
lmZ ddlmZ ddlmZmZ ddlmZ ddlmZ  ej8                  e      Zee G d de                    Z G d dej@                        Z! G d dej@                        Z" G d dej@                        Z# G d dej@                        Z$ G d dej@                        Z% G d dej@                        Z& G d dej@                        Z' G d  d!e      Z( G d" d#ej@                        Z) G d$ d%ej@                        Z*e G d& d'e             Z+ G d( d)ej@                        Z, G d* d+ej@                        Z-e,e-d,Z. ed-.       G d/ d0e+             Z/ G d1 d2ej@                        Z0 ed3.       G d4 d5e+             Z1g d6Z2y)7zPyTorch TVP Model    N)	dataclass)Optional)nn   )ACT2FN)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPoolingModelOutput)PreTrainedModel)prune_linear_layer)auto_docstringlogging)load_backbone   )	TvpConfigc                       e Zd ZU dZdZeej                     ed<   dZ	eej                     ed<   dZ
eeej                  df      ed<   dZeeej                  df      ed<   y)TvpVideoGroundingOutputa  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
        Temporal-Distance IoU loss for video grounding.
    logits (`torch.FloatTensor` of shape `(batch_size, 2)`):
        Contains start_time/duration and end_time/duration. It is the time slot of the videos corresponding to the
        input texts.
    attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.
    Nlosslogits.hidden_states
attentions)__name__
__module____qualname____doc__r   r   torchFloatTensor__annotations__r   r   tupler        b/var/www/html/eduruby.in/venv/lib/python3.12/site-packages/transformers/models/tvp/modeling_tvp.pyr   r   &   sq    	 )-D(5$$
%,*.FHU&&'.=AM8E%"3"3S"89:A:>Ju00#567>r"   r   c                   :     e Zd ZdZ fdZd Zd Zd Zd Z xZ	S )TvpLossa~  
    This class computes the losses for `TvpForVideoGrounding`. The process happens in two steps: 1) we compute
    hungarian assignment between ground truth boxes and the outputs of the model 2) we supervise each pair of matched
    ground-truth / prediction (supervise class and box).

    Args:
        losses (`list[str]`):
            List of all the losses to be applied.
    c                     t         |           | j                  | j                  | j                  d| _        |D ]  }|| j
                  vst        d| d       || _        y )NioudistancedurationzLoss z not supported)super__init__loss_iouloss_distanceloss_durationloss_map
ValueErrorlosses)selfr2   r   	__class__s      r#   r,   zTvpLoss.__init__E   sj    ==****

  	?D4==( 5n!=>>	? r"   c                     t        j                  ||      t        j                  ||      z
  }t        j                  ||      t        j                  ||      z
  }d|j                  d      |z  z
  }|S )z6
        Measure the intersection over union.
        r   r   min)r   r7   maxclamp)	r3   
start_timeend_timecandidates_start_timecandidates_end_timer*   interunionr(   s	            r#   r-   zTvpLoss.loss_iouR   si     		-x8599EZ\f;gg		-x8599EZ\f;gg%++!+$u,,
r"   c                 P   t        j                  t        j                  ||      d      }t        j                  t        j                  ||      d      }t        j                  t        j                  ||      t        j                  ||      z
  |      j                  d      }|S )z5
        Measure the distance of mid points.
        g       @g?r6   )r   divaddr8   r7   r9   )	r3   r:   r;   r<   r=   r*   mid_candidatesmid_groundtruthdistance_diffs	            r#   r.   zTvpLoss.loss_distance\   s     599-BDW#XZ]^))EIIj($CSI		IIno6>Sb9ccem

%C%. 	 r"   c                     t        j                  ||      }t        j                  ||      }t        j                  t        j                  t        j                  ||      |            }|j	                  d      }|S )z5
        Measure the difference of duration.
        g?r6   )r   subsquarerA   r9   )	r3   r:   r;   r<   r=   r*   duration_candidatesduration_groundtruthduration_diffs	            r#   r/   zTvpLoss.loss_durationh   sh     $ii(;=RS$yy:>UYYuyy9LNb/cem%no%+++4r"   c                    |\  }}}t        j                  ||      }|dddf   j                         |dddf   j                         }}i }	| j                  D ],  }
|	j	                  |
 | j
                  |
   |||||      i       . |	S )am  
        This performs the loss computation.

        Args:
            logits (`torch.FloatTensor`):
                The output logits of head module.
            labels (`list[torch.FloatTensor]`):
                List of tensors ([start, end, duration]), which contains start time, end time of the video corresponding to the text, and also the duration.
        Nr   r   )r   mulfloatr2   updater0   )r3   r   labelsr*   r:   r;   
candidatesr<   r=   losses_dictr   s              r#   forwardzTvpLoss.forwards   s     *0&*hYYvx0
5?15E5K5K5MzZ[]^Z^O_OeOeOg2KK 	D*t}}T*:xAVXkmuvw	
 r"   )
r   r   r   r   r,   r-   r.   r/   rS   __classcell__r4   s   @r#   r%   r%   :   s!    
	r"   r%   c                   $     e Zd Z fdZd Z xZS )TvpVisionModelc           	      \   t         |           t        |      | _        |j                  |j                  j
                  d   }nt        | j                  d      rDt        | j                  j                  d      r$| j                  j                  j
                  d   }nbt        | j                  d      rAt        | j                  j                  d      r!| j                  j                  j                  }nt        d      t        j                  ||j                  ddddd	      | _        y )
Nconfighidden_sizeshidden_sizezBackbone config not foundr   r   F)kernel_sizestridepaddinggroupsbias)r+   r,   r   backbonebackbone_configr[   hasattrrZ   r\   r1   r   Conv2dgrid_encoder_conv)r3   rZ   in_channelsr4   s      r#   r,   zTvpVisionModel.__init__   s    %f-!!- 00==bAKT]]H-'$--:N:NP^2_--..;;B?KT]]H-'$--:N:NP]2^--..::K899!#"
r"   c                    |j                   \  }}}}}|j                  ||z  |||      }| j                  |      d   d   }| j                  |      }t        j
                  j                  |dd      }t        j
                  j                  |d      }|j                   dd  \  }	}
}|j                  |||	|
|      }|j                  ddd	d
d      }|S )Nfeature_mapsr      )r]   r^   T)inplacer   r      )	shapeviewrb   rf   r   
functional
max_pool2drelupermute)r3   pixel_values
batch_size
num_framesnum_channelsheightwidthgrid_feat_outputsgridnew_channel
new_height	new_widths               r#   rS   zTvpVisionModel.forward   s    >J>P>P;
Jfe#((j)@,PVX]^ MM,7GJ%%&78}}''!A'F}}!!$!5-1ZZ_*ZyyZj)T||Aq!Q*r"   r   r   r   r,   rS   rT   rU   s   @r#   rW   rW      s    
.r"   rW   c                   ~     e Zd ZdZ fdZdej                  dededej                  fdZdde	fd	Z
dde	fd
Z xZS )TvpVisualInputEmbeddingz;
    Takes input of both image and video (multi-frame)
    c                 r   t         |           t        j                  |j                  |j
                        | _        t        j                  |j                  |j
                        | _        t        j                  |j                  |j
                        | _
        t        j                  d|j
                        | _        t        j                  |j
                  |j                        | _        t        j                  |j                         | _        |j                  | _        |j                  | _	        y )Nr   eps)r+   r,   r   	Embeddingmax_position_embeddingsr\   position_embeddings max_grid_row_position_embeddingsrow_position_embeddings max_grid_col_position_embeddingscol_position_embeddingstoken_type_embeddings	LayerNormlayer_norm_eps
layer_normDropouthidden_dropout_probdropoutr3   rZ   r4   s     r#   r,   z TvpVisualInputEmbedding.__init__   s    #%<<0N0NPVPbPb#c ')||F4[4[]c]o]o'p$')||F4[4[]c]o]o'p$%'\\!V5G5G%H",,v'9'9v?T?TUzz&"<"<=060W0W-060W0W-r"   	embeddingrx   ry   returnc                     dx}}|| j                   kD  r|| j                   z  }|| j                  kD  r|| j                  z  }|j                  dddd      }t        j                  j                  |||fdd      }|j                  dddd      }|S )z
        This method allows to interpolate the pre-trained pad weights , to be able to use the model on collection of high
        resolution images (high resolution videos).

        r   r   r   rj   bicubicFscale_factormodealign_corners)r   r   rs   r   rp   interpolate)r3   r   rx   ry   h0w0s         r#   interpolate_pos_encodingz0TvpVisualInputEmbedding.interpolate_pos_encoding   s     RD999$???B4888>>>B%%aAq1	MM--b	 . 
	 %%aAq1	r"   r   c                    |j                   \  }}}}t        | j                  |      }t        j                  |t        j
                  |j                        }| j                  |      }	dt        |j                         dz
  z  |d|fz   }
 |	j                  |
 }	t        | j                  |      }t        j                  |t        j
                  |j                        }| j                  |      }|d||f} |j                  | }|	|z   }|r6|| j                  kD  s|| j                  kD  r|| j                  |||      z   }|S ||z   }|S )af  
        Args:
            grid: (batch_size, height, width, hidden_dim)
            interpolate_pos_encoding: (`bool`, *optional*, defaults to `False`):
                Whether to interpolate the pre-trained position encodings.
        Returns:
            grid + col_position_embeddings.view(*col_shape): (batch_size, *, height, width, hidden_dim)
        dtypedevice)r   r   r   )rn   r7   r   r   arangelongr   r   lenro   r   r   r   )r3   r{   r   ru   rx   ry   
hidden_dim
row_heightrow_position_idsr   	row_shape	row_widthcol_position_idsr   	col_shapepositional_embeddingss                   r#   add_2d_positional_embeddingsz4TvpVisualInputEmbedding.add_2d_positional_embeddings   sQ    15

-
FE: >>G
 <<
%**T[[Y"&">">?O"PC

Oa/0J:3NN	">"9">">	"J ==uE	 <<	DKKX"&">">?O"PIz:	">"9">">	"J 7:Q Q $T:::edFkFk>k$778MvW\]]D  //Dr"   c                    |j                   \  }}}}}|j                  d      }| j                  ||      }|j                  |d|      }|j                   dd }	|j                  }
t        j                  |	t
        j                  |
      }| j                  |      }||z   }| j                  |      }| j                  |      }|S )a  
        Args:
            grid: Array of shape (batch_size, num_frames, height, width, num_channels).
                It contains processed frames extracted from videos, and is generated by Tvp image preprocessor. Note,
                num_frames can be 1
            interpolate_pos_encoding: (bool, *optional*, defaults to `False`):
                Whether to interpolate the pre-trained position encodings.

        Returns:
            embeddings: The embedding of grid with size (batch_size, height*width, num_channels)

        r   r   rY   Nr   )rn   meanr   ro   r   r   zerosr   r   r   r   )r3   r{   r   ru   rv   rx   ry   rw   visual_tokensvisual_tokens_shaper   token_type_idsr   
embeddingss                 r#   rS   zTvpVisualInputEmbedding.forward  s     ?Cjj;
J|yy|00Ph0i		*b,?+11#26%% %8

SYZ $ : :> J"%::
__Z0
\\*-
r"   F)r   r   r   r   r,   r   Tensorintr   boolr   rS   rT   rU   s   @r#   r   r      sT    
X%,,  TW \a\h\h .'4 'Rd r"   r   c                   *     e Zd ZdZ fdZddZ xZS )TvpTextInputEmbeddingszGConstruct the embeddings from word, position and token_type embeddings.c                    t         |           t        j                  |j                  |j
                  |j                        | _        t        j                  |j                  |j
                        | _	        t        j                  |j                  |j
                        | _        t        j                  |j
                  |j                        | _        t        j                  |j                         | _        y )N)padding_idxr   )r+   r,   r   r   
vocab_sizer\   pad_token_idword_embeddingsr   r   type_vocab_sizer   r   r   r   r   r   r   r   s     r#   r,   zTvpTextInputEmbeddings.__init__&  s    !||F,=,=v?Q?Q_e_r_rs#%<<0N0NPVPbPb#c %'\\&2H2H&J\J\%]",,v'9'9v?T?TUzz&"<"<=r"   c                 .   ||j                         }n|j                         d d }|d   }||j                  n|j                  }|Ft        j                  |t        j                  |      }|j                  d      j                  |      }|&t        j                  |t        j                  |      }|| j                  |      }| j                  |      }| j                  |      }	||z   |	z   }
| j                  |
      }
| j                  |
      }
|
S )NrY   r   r   r   )sizer   r   r   r   	unsqueezeexpandr   r   r   r   r   r   )r3   	input_idsr   position_idsinputs_embedsinput_shape
seq_lengthr   r   r   r   s              r#   rS   zTvpTextInputEmbeddings.forward.  s    #..*K',,.s3K ^
%.%:!!@T@T <<
%**VTL'11!4;;KHL!"[[EJJvVN  00;M"66|D $ : :> J"%88;PP
__Z0
\\*-
r"   )NNNNr   r   r   r   r,   rS   rT   rU   s   @r#   r   r   #  s    Q>r"   r   c                   f     e Zd Z fdZd Zdej                  dedefdZ	 	 	 d	de	e
   fdZ xZS )
TvpAttentionc                    t         |           |j                  |j                  z  dk7  r1t	        |d      s%t        d|j                   d|j                         |j                  | _        t        |j                  |j                  z        | _        | j                  | j                  z  | _        t        j                  |j                  | j                        | _        t        j                  |j                  | j                        | _        t        j                  |j                  | j                        | _        t        j                  |j                        | _        t        j                  |j                  |j                        | _        t        j$                  |j                  |j&                        | _        t        j                  |j*                        | _        t/               | _        y )Nr   embedding_sizezThe hidden size z4 is not a multiple of the number of attention heads r   )r+   r,   r\   num_attention_headsrd   r1   r   attention_head_sizeall_head_sizer   Linearquerykeyvaluer   attention_probs_dropout_probattn_dropoutdenser   r   r   r   r   setpruned_headsr   s     r#   r,   zTvpAttention.__init__H  s    : ::a?PVXhHi"6#5#5"66jkq  lF  lF  kG  H  $*#=#= #&v'9'9F<V<V'V#W !558P8PPYYv1143E3EF
99V//1C1CDYYv1143E3EF
JJv'J'JKYYv1163E3EF
,,v'9'9v?T?TUzz&"<"<=Er"   c                 N   t        |      dk(  ry t        j                  | j                  | j                        }t        |      | j                  z
  }|D ](  t        fd| j                  D              z
  d|<   * |j                  d      j                         j                  d      }t        j                  t        |            |   j                         }t        | j                  |      | _        t        | j                  |      | _        t        | j                   |      | _        t        | j"                  |d      | _        | j                  t        |      z
  | _        | j                  | j                  z  | _        | j                  j'                  |      | _        y )Nr   c              3   0   K   | ]  }|k  rd nd  yw)r   r   Nr!   ).0hheads     r#   	<genexpr>z+TvpAttention.prune_heads.<locals>.<genexpr>d  s     Nq1t8a2Ns   rY   r   dim)r   r   onesr   r   r   r   sumro   
contiguouseqr   r   r   r   r   r   r   r   r?   )r3   headsmaskindexr   s       @r#   prune_headszTvpAttention.prune_heads]  sN   u:?zz$22D4L4LME
T... 	D#ND<M<MNNNDDJ	 yy}''),,Q/SY'-224 (

E:
%dhh6'

E:
'

EqA
 $(#;#;c%j#H !558P8PP --33E:r"   tensorsequence_lengthru   c                     |j                  ||| j                  | j                        j                  dd      j	                         S )Nr   rj   )ro   r   r   	transposer   )r3   r   r   ru   s       r#   _reshapezTvpAttention._reshapet  s7    KK
OT5M5MtOgOghYq!_Z\	
r"   output_attentionsc                 :   |j                   d d \  }}| j                  |      }| j                  |      }| j                  |      }	| j	                  |||      }
| j	                  |||      }| j	                  |	||      }t        j                  |
|j                  dd            }|t        j                  | j                        z  }|||z   }t        j                  j                  |d      }| j                  |      }|||z  }t        j                  ||      }|j                  dd      j                         }|j!                  ||| j"                        }| j%                  |      }| j'                  |      }| j)                  ||z         }|r||f}|S |f}|S )Nrj   rY   r   r   )rn   r   r   r   r   r   matmulr   mathsqrtr   r   rp   softmaxr   r   reshaper   r   r   r   )r3   r   attention_mask	head_maskr   ru   r   mixed_query_layermixed_key_layermixed_value_layerquery_layer	key_layervalue_layerattention_scoresattention_probsattn_outputoutputss                    r#   rS   zTvpAttention.forward{  s    '4&9&9"1&=#
O JJ}5((=1 JJ}5mm$5
SMM/?JO	mm$5
S !<<Y5H5HR5PQ+dii8P8P.QQ%/.@ --//0@b/I ++O<  -	9Oll?K@!++Aq1<<>!))*otGYGYZjj-ll;/ookM&AB4E;0 MX>r"   NNN)r   r   r   r,   r   r   r   r   r   r   r   rS   rT   rU   s   @r#   r   r   G  sI    "*;.
u|| 
c 
s 
 ,0+
 $D>+r"   r   c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )TvpIntermediatec                    t         |           t        j                  |j                  |j
                        | _        t        |j                  t              rt        |j                     | _        y |j                  | _        y N)r+   r,   r   r   r\   intermediate_sizer   
isinstance
hidden_actstrr   intermediate_act_fnr   s     r#   r,   zTvpIntermediate.__init__  s]    YYv1163K3KL
f''-'-f.?.?'@D$'-'8'8D$r"   r   r   c                 J    | j                  |      }| j                  |      }|S r  )r   r  )r3   r   s     r#   rS   zTvpIntermediate.forward  s&    

=100?r"   r   r   r   r,   r   r   rS   rT   rU   s   @r#   r	  r	    s#    9U\\ ell r"   r	  c                   n     e Zd Z fdZdej
                  dej
                  dej
                  fdZ xZS )TvpOutputLayerc                 (   t         |           t        j                  |j                  |j
                        | _        t        j                  |j
                  |j                        | _	        t        j                  |j                        | _        y )Nr   )r+   r,   r   r   r  r\   r   r   r   r   r   r   r   r   s     r#   r,   zTvpOutputLayer.__init__  s`    YYv779K9KL
,,v'9'9v?T?TUzz&"<"<=r"   r   input_tensorr   c                 r    | j                  |      }| j                  |      }| j                  ||z         }|S r  )r   r   r   )r3   r   r  s      r#   rS   zTvpOutputLayer.forward  s7    

=1]3(DEr"   r  rU   s   @r#   r  r    s1    >U\\  RWR^R^ r"   r  c                   8     e Zd Z fdZ	 	 	 ddee   fdZ xZS )TvpEncodeLayerc                     t         |           t        |      | _        t	        |      | _        t        |      | _        y r  )r+   r,   r   	attentionr	  intermediater  outputr   s     r#   r,   zTvpEncodeLayer.__init__  s3    %f-+F3$V,r"   r   c                     | j                  ||||      }|d   }|dd  }| j                  |      }| j                  ||      }	|	f|z   }|S )N)r   r   r   )r  r  r  )
r3   r   r   r   r   self_attention_outputsattention_outputr  intermediate_outputlayer_outputs
             r#   rS   zTvpEncodeLayer.forward  sr     "&/	 "0 "
 2!4(,"//0@A{{#68HI/G+r"   r  )r   r   r   r,   r   r   rS   rT   rU   s   @r#   r  r    s&    - ,0
 $D>r"   r  c            
       n     e Zd Z fdZ	 	 	 	 	 ddeej                     dee   dee   dee   fdZ xZ	S )
TvpEncoderc                     t         |           || _        t        j                  t        |j                        D cg c]  }t        |       c}      | _        d| _	        y c c}w )NF)
r+   r,   rZ   r   
ModuleListrangenum_hidden_layersr  layergradient_checkpointing)r3   rZ   _r4   s      r#   r,   zTvpEncoder.__init__  sN    ]]E&JbJbDc#dqN6$:#de
&+# $es   A#r   r   output_hidden_statesreturn_dictc                    ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }d}d}t	        | j
                        D ],  \  }	}
|r||fz   } |
||||	   |      }|d   }|s$||d   fz   }. |r||fz   }|s|f}|r||fz   }|r||fz   }|S t        ||r|nd |r|      S d       S )Nr!   r   r   )last_hidden_stater   r   )rZ   r-  r   r,  	enumerater)  r	   )r3   r   r   r   r   r,  r-  all_hidden_statesall_attentionsilayer_modulelayer_outputsr  s                r#   rS   zTvpEncoder.forward  s2    &1%<k$++BYBY1B1N-TXT_T_TqTq$8$D $++JjJj 	 (4 	FOA|#$58H$H!(	RSVghM)!,M !/=3C2E!E	F   1]4D D$&G#!%6$88 !^$55N+/C+):~
 	
 AE
 	
r"   )NNNNN)
r   r   r   r,   r   r   r   r   rS   rT   rU   s   @r#   r$  r$    s]    , 15,0/3&*+
 E--.	+

 $D>+
 'tn+
 d^+
r"   r$  c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )	TvpPoolerc                     t         |           t        j                  |j                  |j                        | _        t        j                         | _        y r  )r+   r,   r   r   r\   r   Tanh
activationr   s     r#   r,   zTvpPooler.__init__  s9    YYv1163E3EF
'')r"   r   r   c                 \    |d d df   }| j                  |      }| j                  |      }|S )Nr   )r   r:  )r3   r   first_token_tensorpooled_outputs       r#   rS   zTvpPooler.forward  s6     +1a40

#566r"   r  rU   s   @r#   r7  r7    s#    $
U\\ ell r"   r7  c                   @    e Zd ZU eed<   dZdZdej                  fdZ	y)TvpPreTrainedModelrZ   modelTmodulec                 D   t        |t        j                  t        j                  f      r=|j                  j
                  j                  d| j                  j                         nt        |t        j                        rJ|j                  j
                  j                          |j                  j
                  j                  d       nt        |t        j                        rct        j                  j                  |j                  dd       |j                  dt        j                  j!                  |j                  d       n9t        |t"              r)t        j                  j                  |j$                         t        |t        j                        r0|j                  $|j                  j
                  j                          t'        |d	      r)t        j                  j                  |j(                         t'        |d
      r)t        j                  j                  |j*                         t'        |d      r)t        j                  j                  |j,                         t'        |d      r*t        j                  j                  |j.                         yy)zInitialize the weights        )r   stdg      ?fan_outrr   )r   nonlinearityNr   pad_uppad_downpad_left	pad_right)r  r   r   r   weightdatanormal_rZ   initializer_ranger   ra   zero_fill_re   initkaiming_normal_	constant_TvpModeltext_promptrd   rG  rH  rI  rJ  )r3   rA  s     r#   _init_weightsz TvpPreTrainedModel._init_weights.  s   fryy",,78 MM&&CT[[5R5R&S-KK""$MM$$S)		*GG##FMM	PV#W{{&!!&++q1)GGOOF../fbii(V[[-DKK""$68$GGOOFMM*6:&GGOOFOO,6:&GGOOFOO,6;'GGOOF,,- (r"   N)
r   r   r   r   r   base_model_prefixsupports_gradient_checkpointingr   ModulerV  r!   r"   r#   r?  r?  (  s$    &*#.BII .r"   r?  c                   (     e Zd ZdZ fdZd Z xZS )TvpFrameDownPadPrompterz>
    Pad frames extracted from videos only at the bottom.
    c           	      |   |j                   dvrt        d      t        |           |j                  | _        |j
                  | _        |j                  | _        |j                   | _         t        j                  t        j                  d|j
                  d|j                  |j                  g            | _        y )NrB   replaceremove9`visual_prompter_apply` must be in (add, replace, remove)r   r   )visual_prompter_applyr1   r+   r,   visual_prompt_size	frame_nummax_img_sizer   	Parameterr   randnrH  r   s     r#   r,   z TvpFrameDownPadPrompter.__init__O  s    ''/KKXYY"(";";))"//%+%A%A"KKF,,a1J1JFL_L_`a
r"   c                    | j                   dk7  rst        j                  | j                  | j                  g|j                  |j
                        }d|| j                  | j                  z
  | j                  d d f<   ||z  }| j                   dk7  rt        j                  |j                  d   |j                  d   d| j                  | j                  g|j
                        }| j                  | j                  z
  }| j                  |d d d d d d || j                  d d f<   ||j                  |j                        z  }|S )	NrB   r   rC  r_  r   r   r   r   )ra  r   r   rd  r   r   rb  r   rn   rH  to)r3   rt   visual_prompt_maskpromptstart_points        r#   rS   zTvpFrameDownPadPrompter.forward]  s1   %%.!&""D$5$56l>P>PYeYlYl" fit0043J3JJTM^M^^`aab..L%%1[[##A&(:(:1(=q$BSBSUYUfUfg#**F ++d.E.EEKBF--F1aK$*;*;;Q>?FIIl&8&899Lr"   r   rU   s   @r#   r[  r[  J  s    
r"   r[  c                   p     e Zd ZdZ fdZdej                  dededej                  fdZd
de	fd	Z
 xZS )TvpFramePadPrompterz?
    Pad frames extracted from videos in the surroundings.
    c           
         |j                   dvrt        d      t        |           |j                  | _        |j
                  | _        |j                   | _         |j
                  |j                  dz  z
  | _        t        j                  t        j                  d|j                  d|j                  |j
                  g            | _        t        j                  t        j                  d|j                  d|j                  |j
                  g            | _        t        j                  t        j                  d|j                  d|j
                  |j                  dz  z
  |j                  g            | _        t        j                  t        j                  d|j                  d|j
                  |j                  dz  z
  |j                  g            | _        y )Nr]  r`  rj   r   r   )ra  r1   r+   r,   rv   rd  rb  	base_sizer   re  r   rf  rG  rH  rI  rJ  r   s     r#   r,   zTvpFramePadPrompter.__init__t  s   ''/KKXYY ++"//%+%A%A",,v/H/H1/LLllKKF--q&2K2KVM`M`ab
 KKF--q&2K2KVM`M`ab
 KK%%''&*C*Ca*GG--

 KK%%''&*C*Ca*GG--

r"   rk  rx   ry   r   c                    || j                   z  || j                   z  }}|j                  \  }}}}	}
|j                  ||z  ||	|
      }t        j                  j                  |||fdd      }|j                  |||||      }|S )z
        This method allows to interpolate the pre-trained pad weights, to be able to use the model on collection of high
        resolution images (high resolution videos).

        r   Fr   )rd  rn   r   r   rp   r   )r3   rk  rx   ry   r   r   batchrv   channelsprompt_heightprompt_widths              r#   interpolate_pad_encodingz,TvpFramePadPrompter.interpolate_pad_encoding  s     $+++UT5F5F-FBCI<<@z8]L 
 2Hm\Z**b	 + 
 z8VUKr"   rv  c                 Z   |r|j                   d   |j                   d   fn| j                  | j                  f\  }}| j                  dvrt        d| j                         | j                  dv r3t	        j
                  ||g|j                  |j                        }||z  }| j                  dv rt	        j                  d| j                  d	| j                  | j                  |j                  
      }t	        j                  | j                  || j                  gd      }t	        j                  | j                  || j                  gd	      }t	        j                  |j!                  d      |gz        }|r| j#                  |||      }||j%                  |j                        z   }|S )Nr   rY   )rB   r_  r^  z$Invalid visual_prompter_apply value )r^  r_  r   )r^  rB   r   r   rh  rm   r   r   )rn   rd  ra  r1   r   r   r   r   r   rv   rp  catrI  rJ  rG  rH  r   rv  ri  )r3   rt   rv  rx   ry   rj  baserk  s           r#   rS   zTvpFramePadPrompter.forward  s{    ( #\%7%7%;<##T%6%67 	
 %%-IICDD^D^C_`aa%%)>>!&VUO<CUCU^j^q^q!r..L%%);;;;q$//1dnndnn]i]p]pqDYYtT^^D!LFYYVT]]CKFYY|003vh>?F'66vvuM'&))L4F4F*GGLr"   r   )r   r   r   r   r,   r   r   r   rv  r   rS   rT   rU   s   @r#   rn  rn  o  sG    $
Lu|| S QT Y^YeYe 0d r"   rn  )framedownpadframepadzw
    The bare Tvp Model transformer outputting BaseModelOutputWithPooling object without any specific head on top.
    )custom_introc                        e Zd Z fdZd Zd Zd Ze	 	 	 	 	 	 	 	 ddee	j                     dee	j                     dee	j                     dee	j                     d	ee   d
ee   dee   defd       Z xZS )rT  c                 "   t         |   |       || _        t        |      | _        t        |      | _        t        |      | _        t        |      | _
        t        |      | _        t        j                  t        j                   dd|j"                  g            | _        t        j&                  |j(                        | _        |j,                  t.        vrt1        d      t/        |j,                     |      | _        | j5                          y )Nr   
   z:`visual_prompter_type` must be in (framedownpad, framepad))r+   r,   rZ   rW   vision_modelr   r   r   visual_embeddingsr$  encoderr7  poolerr   re  r   rf  r\   rU  r   r   r   visual_prompter_typeTVP_PROMPTER_CLASSES_MAPPINGr1   visual_prompter	post_initr   s     r#   r,   zTvpModel.__init__  s     *6208!8!@!&)'<<QF<N<N4O(PQzz&"<"<=&&.JJYZZ;F<W<WXY_`r"   c                 .    | j                   j                  S r  r   r   )r3   s    r#   get_input_embeddingszTvpModel.get_input_embeddings  s    ...r"   c                 &    || j                   _        y r  r  )r3   r   s     r#   set_input_embeddingszTvpModel.set_input_embeddings  s    */'r"   c                     |j                         D ]7  \  }}| j                  j                  |   j                  j	                  |       9 y)zPrunes heads of the model.
        heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base class PreTrainedModel
        N)itemsr  r)  r  r   )r3   heads_to_pruner)  r   s       r#   _prune_headszTvpModel._prune_heads  sE     +002 	CLE5LLu%//;;EB	Cr"   r   rt   r   r   r   r,  r-  r   c	                 "   ||n| j                   j                  }| j                  | j                  ||            }| j	                  |      }	| j                  ||      }
||j                  |
j                  dd       }t        j                  |j                  d   d      j                  |j                  |j                        }t        j                  |||gd	
      }| j                  ||j                               j                  |j                        }| j                   j#                  |	j                  d   d	d	      }t        j                  ||	|
gd
      }| j%                  ||| j'                  || j                   j(                        |||      }|r|j*                  n|d   }| j-                  |      }| j/                  |      }| j/                  |      }|s
||f|dd z   S t1        |||j2                  |j4                        S )a  
        Examples:
        ```python
        >>> import torch
        >>> from transformers import AutoConfig, AutoTokenizer, TvpModel

        >>> model = TvpModel.from_pretrained("Jiqing/tiny-random-tvp")

        >>> tokenizer = AutoTokenizer.from_pretrained("Jiqing/tiny-random-tvp")

        >>> pixel_values = torch.rand(1, 1, 3, 448, 448)
        >>> text_inputs = tokenizer("This is an example input", return_tensors="pt")
        >>> output = model(text_inputs.input_ids, pixel_values, text_inputs.attention_mask)
        ```N)rv  )r   r   rj   r   r  )r   r   rY   r   r   )r   r   r   r,  r-  )r/  pooler_outputr   r   )rZ   r-  r  r  r   r  new_onesrn   r   r   ri  r   r   rx  get_extended_attention_maskr   rU  r   r  get_head_maskr(  r/  r  r   r
   r   r   )r3   r   rt   r   r   r   r,  r-  r   text_embedding_outputvisual_embedding_outputvisual_attention_maskpt_maskrU  embedding_outputencoder_outputsr/  r=  s                     r#   rS   zTvpModel.forward  s    4 &1%<k$++BYBY((  H` a
 !%) D"&"8"83K #9 #
 %$2$;$;<S<Y<YZ\[\<]$^!jj!5!5a!8"=@@%,,N4H4H A G #YYAV'W]_`N "==ninnN^_bbclcscstN&&--.C.I.I!.LbRTU 99k3HJa%bhij,,)((DKK4Q4QR/!5# ' 
 BMO==RabcRd$56 LL):;]3%}58KKK)/')77&11	
 	
r"   )NNNNNNNF)r   r   r   r,   r  r  r  r   r   r   
LongTensorr   r   rS   rT   rU   s   @r#   rT  rT    s     /0C  15485915,0/3&*).F
E,,-F
 u001F
 !!1!12	F

 E--.F
 $D>F
 'tnF
 d^F
 #'F
 F
r"   rT  c                   $     e Zd Z fdZd Z xZS )TvpVideoGroundingHeadc                 :   t         |           t        j                  |j                  |j                  dz        | _        t        j                  |j                  dz  d      | _        t        j                         | _        t        j                         | _
        y )Nrj   )r+   r,   r   r   r\   layer_0layer_1ReLUactivation_0Sigmoidactivation_1r   s     r#   r,   zTvpVideoGroundingHead.__init__=  sj    yy!3!3V5G5G!5KLyy!3!3a!7;GGIJJLr"   c                     | j                  | j                  |            }| j                  | j                  |            }|S r  )r  r  r  r  )r3   r  r   s      r#   rS   zTvpVideoGroundingHead.forwardD  s9    ""4<<#>?""4<<#78r"   r   rU   s   @r#   r  r  <  s    )r"   r  zb
    Tvp Model with a video grounding head on top computing IoU, distance, and duration loss.
    c                       e Zd Z fdZe	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     dee	ej                        deej                     dee   dee   d	ee   d
efd       Z xZS )TvpForVideoGroundingc                     t         |   |       || _        t        |      | _        t        |      | _        | j                          y r  )r+   r,   rZ   rT  r@  r  video_grounding_headr  r   s     r#   r,   zTvpForVideoGrounding.__init__P  s:     f%
$9&$A!r"   r   rt   r   rP   r   r   r,  r-  r   c
           
         ||n| j                   j                  }| j                  ||||||||	      }
|
d   }| j                  |      }d}|pt	        g d      }|j                  | j                          |||      }|d   | j                   j                  |d   z  z   | j                   j                  |d   z  z   }|s|f|
dd z   }
||f|
z   }
|
S t        |||
j                  |
j                  	      S )
a  
        labels (`torch.FloatTensor` of shape `(batch_size, 3)`, *optional*):
            The labels contains duration, start time, and end time of the video corresponding to the text.

        Examples:
        ```python
        >>> import torch
        >>> from transformers import AutoConfig, AutoTokenizer, TvpForVideoGrounding

        >>> model = TvpForVideoGrounding.from_pretrained("Jiqing/tiny-random-tvp")

        >>> tokenizer = AutoTokenizer.from_pretrained("Jiqing/tiny-random-tvp")

        >>> pixel_values = torch.rand(1, 1, 3, 448, 448)
        >>> text_inputs = tokenizer("This is an example input", return_tensors="pt")
        >>> output = model(text_inputs.input_ids, pixel_values, text_inputs.attention_mask)
        ```N)r   r   r,  r-  r   r   r'   r(   r)   r*   rj   )r   r   r   r   )rZ   r-  r@  r  r%   ri  r   distance_loss_weightduration_loss_weightr   r   r   )r3   r   rt   r   rP   r   r   r,  r-  r   r  r  r   r   	criterion	loss_dicts                   r#   rS   zTvpForVideoGrounding.forwardX  s,   < &1%<k$++BYBY**/!5#%=  	
  
**=9 ?@ILL%!&&1I% ++22Yz5JJK++22Yz5JJK 
 i'!"+-G'G+N&!//))	
 	
r"   )	NNNNNNNNF)r   r   r   r,   r   r   r   r  r   r    r   r   rS   rT   rU   s   @r#   r  r  J  s      1548590415,0/3&*).@
E,,-@
 u001@
 !!1!12	@

 u||,-@
 E--.@
 $D>@
 'tn@
 d^@
 #'@
 @
r"   r  )rT  r?  r  )3r   r   dataclassesr   typingr   r   torch.utils.checkpointr   activationsr   modeling_layersr   modeling_outputsr	   r
   r   modeling_utilsr   pytorch_utilsr   utilsr   r   utils.backbone_utilsr   configuration_tvpr   
get_loggerr   loggerr   rY  r%   rW   r   r   r   r	  r  r  r$  r7  r?  r[  rn  r  rT  r  r  __all__r!   r"   r#   <module>r     s     !     ! 9 X X - / , 1 ( 
		H	% ?k ?  ?$Mbii M`%RYY %Pnbii nb!RYY !H_299 _Fbii RYY / 82
 2
l		  . . .B"bii "JW")) Wv ,#   
e
! e

e
PBII  
J
- J

J
Z Er"   