
    hU                    B   d dl Z d dlZd dlmZ d dlmZmZmZ d dlZd dlm	Z	m
Z
 ddlmZ ddlmZmZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZmZmZmZmZ ddlmZm Z  ddl!m"Z"m#Z#m$Z$m%Z% ddl&m'Z' ddl(m)Z)m*Z* ddl+m,Z,m-Z-m.Z. ddl/m0Z0 ddl1m2Z2m3Z3 ddl4m5Z5m6Z6 d Z7 G d de
jp                        Z9d Z:d Z; G d de
jp                        Z<	 	 d`de
jp                  dej                  dej                  d ej                  d!eej                     d"e=d#e=d$eej                     d%e'e,   fd&Z> G d' d(e
jp                        Z? G d) d*e
jp                        Z@ G d+ d,e
jp                        ZAd- ZB G d. d/e
jp                        ZC G d0 d1e
jp                        ZD G d2 d3e      ZE G d4 d5e
jp                        ZF G d6 d7e
jp                        ZGe- G d8 d9e$             ZH G d: d;eH      ZI G d< d=e
jp                        ZJ G d> d?e
jp                        ZK G d@ dAe
jp                        ZLee- G dB dCe                    ZM G dD dEe
jp                        ZN G dF dGe
jp                        ZO edH       G dI dJe
jp                               ZP G dK dLe
jp                        ZQ G dM dNe
jp                        ZRdO ZSdadPZTdQej                  dReUdSej                  fdTZV G dU dVe
jp                        ZW G dW dXe      ZXe- G dY dZe$             ZY G d[ d\eY      ZZ G d] d^eYe      Z[g d_Z\y)b    N)	dataclass)CallableOptionalUnion)Tensornn   )ACT2FN)CacheDynamicCache)GenerationMixin)use_kernel_forward_from_hub)create_causal_mask)GradientCheckpointingLayer)"BaseModelOutputWithCrossAttentionsBaseModelOutputWithPast,BaseModelOutputWithPoolingAndCrossAttentionsCausalLMOutputWithPastModelOutput)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSModuleUtilsMixinPreTrainedModelget_parameter_dtype)Unpack) find_pruneable_heads_and_indicesprune_linear_layer)TransformersKwargsauto_docstringcan_return_tuple)deprecate_kwarg)OutputRecordercheck_model_inputs   )EvollaConfigSaProtConfigc                     | j                  |      j                         }t        j                  |d      j	                  |      |z  }|j                         |z   S )a  
    Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
    are ignored. This is modified from fairseq's `utils.make_positions`.

    Args:
        x: torch.Tensor x:

    Returns: torch.Tensor
    r%   dim)neinttorchcumsumtype_aslong)	input_idspadding_idxmaskincremental_indicess       h/var/www/html/eduruby.in/venv/lib/python3.12/site-packages/transformers/models/evolla/modeling_evolla.py"create_position_ids_from_input_idsr6   5   sP     <<$((*D,,t3;;DADH##%33    c                   8     e Zd ZdZ fdZ	 	 	 	 ddZd Z xZS )EvollaSaProtEmbeddingszV
    Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
    c                    t         |           t        j                  |j                  |j
                  |j                        | _        |j                  r1t        j                  |j
                  |j                        | _        nd | _        t        j                  |j                        | _        t        |dd      | _        | j#                  dt%        j&                  |j(                        j+                  d      d       |j                  | _        | j                   dk(  r;t        j                  |j(                  |j
                  | j,                        | _        |j0                  | _        |j2                  | _        d | _        y )	N)r2   epsposition_embedding_typeabsoluteposition_ids)r%   F
persistent)super__init__r   	Embedding
vocab_sizehidden_sizepad_token_idword_embeddingsemb_layer_norm_before	LayerNormlayer_norm_eps
layer_normDropouthidden_dropout_probdropoutgetattrr=   register_bufferr-   arangemax_position_embeddingsexpandr2   position_embeddingstoken_dropoutmask_token_idr?   selfconfig	__class__s     r5   rD   zEvollaSaProtEmbeddings.__init__J   s2   !||F,=,=v?Q?Q_e_r_rs'' ll6+=+=6CXCXYDO"DOzz&"<"<='.v7PR\']$ELL)G)GHOOPWXej 	 	
 "..'':5')||..0B0BPTP`P`(D$ $11#11 r7   c                    |*|t        || j                        }n| j                  |      }|| j                  |      }|}| j                  r||j                  || j                  k(  j                  d      d      }d}||j                  d      n|j                  d   }|| j                  k(  j                  d      j                         |z  }|d|z
  z  d|z
  d d d d f   z  j                  |j                        }| j                  dk(  r| j                  |      }	||	z   }| j                  | j                  |      }|-||j                  d      z  j                  |j                        }|S )Nr@           gQ?r%   r>   )r6   r2   &create_position_ids_from_inputs_embedsrI   rW   masked_fillrX   	unsqueezesumshapefloattodtyper=   rV   rM   )
rZ   r1   attention_maskr?   inputs_embeds
embeddingsmask_ratio_trainsrc_lengthsmask_ratio_observedrV   s
             r5   forwardzEvollaSaProtEmbeddings.forwardc   s    $A)TM]M]^#JJ=Y  00;M #
 )"7#//d>P>P1P0[0[\^0_adeJ)4B4N.,,R0T]TcTcdeTfK#,0B0B#B"G"G"K"Q"Q"SVa"a$,<(<=EXAXZ[]acgZg@hhll  J '':5"&":":<"H#&99J??&4J%$~'?'?'CCGG
HXHXYJ r7   c                    |j                         dd }|d   }t        j                  | j                  dz   || j                  z   dz   t        j                  |j
                        }|j                  d      j                  |      S )z
        We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.

        Args:
            inputs_embeds: torch.Tensor

        Returns: torch.Tensor
        Nr@   r%   rf   devicer   )sizer-   rS   r2   r0   rp   ra   rU   )rZ   rh   input_shapesequence_lengthr?   s        r5   r_   z=EvollaSaProtEmbeddings.create_position_ids_from_inputs_embeds   s     $((*3B/%a.||q /D4D4D"Dq"HPUPZPZcpcwcw
 %%a(//<<r7   NNNN)__name__
__module____qualname____doc__rD   rm   r_   __classcell__r\   s   @r5   r9   r9   E   s&    !6 /b=r7   r9   c                 b    | j                  dd      \  }}t        j                  | |fd      S )N   r@   r)   )chunkr-   catxx1x2s      r5   rotate_half_esmr      s/    WWQBWFB99rc2YB''r7   c                     |d d d d d | j                   d   d d f   }|d d d d d | j                   d   d d f   }| |z  t        |       |z  z   S )N)rc   r   )r   cossins      r5   apply_rotary_pos_emb_esmr      sY    
aMaggbkM1$
%C
aMaggbkM1$
%CG*S011r7   c                        e Zd ZU dZej
                  ed<   def fdZd
dZ	dej
                  dej
                  de
ej
                  ej
                  f   fd	Z xZS )EvollaSaProtRotaryEmbeddingz
    Rotary position embeddings based on those in
    [RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer). Query and keys are transformed by rotation
    matrices which depend on their relative positions.
    inv_freqr*   c                     t         |           ddt        j                  d|dt        j                        j                         |z  z  z  }|}| j                  d|       d | _        d | _        d | _	        y )N      ?i'  r   r|   rf   r   )
rC   rD   r-   rS   int64rd   rR   _seq_len_cached_cos_cached_sin_cached)rZ   r*   r   r\   s      r5   rD   z$EvollaSaProtRotaryEmbedding.__init__   sl    %ELLC%++$N$T$T$VY\$\]^Z2#r7   c                 t   |j                   |   }|| j                  k7  s#| j                  j                  |j                  k7  r|| _        t	        j
                  |j                   |   |j                        j                  | j                        }t	        j                  || j                        }t	        j                  ||fd      j                  |j                        }|j                         d d d d d d f   | _        |j                         d d d d d d f   | _        | j                  | j                  fS )Nrp   r@   r)   )rc   r   r   rp   r-   rS   r/   r   outerr~   re   r   r   r   )rZ   r   seq_dimensionseq_lentfreqsembs          r5   _update_cos_sin_tablesz2EvollaSaProtRotaryEmbedding._update_cos_sin_tables   s    ''-( d***d.>.>.E.E.Q#*D QWW]3AHHEMMdmm\AKK4==1E))UEN366qxx@C"wwytQ)9:D"wwytQ)9:D!1!111r7   qkreturnc                 .   | j                  |d      \  | _        | _        t        || j                  | j                        j	                  |j
                        t        || j                  | j                        j	                  |j
                        fS )Nr   )r   r   )r   r   r   r   re   rf   )rZ   r   r   s      r5   rm   z#EvollaSaProtRotaryEmbedding.forward   s    -1-H-HZ\-H-]*$* %Q(8(8$:J:JKNNUVU\U\N]$Q(8(8$:J:JKNNUVU\U\N]
 	
r7   )r|   )ru   rv   rw   rx   r-   r   __annotations__r,   rD   r   tuplerm   ry   rz   s   @r5   r   r      sY     ll	 C 	 2 
 
%,, 
5u||A[;\ 
r7   r   modulequerykeyvaluerg   scalingrP   	head_maskkwargsc                    t        j                  ||j                  dd            |z  }	t        | d      rN| j                  dv r?|j
                  d   }
t        j                  |
t         j                  |	j                        j                  dd      }t        j                  |
t         j                  |	j                        j                  dd      }||z
  }| j                  || j                  z   dz
        }|j                  |j                        }| j                  d	k(  rt        j                  d
||      }nB| j                  dk(  r3t        j                  d
||      }t        j                  d||      }||z   }|	z   }	|#|d d d d d d d |j
                  d   f   }|	|z   }	t        j                   j#                  |	dt         j$                        j                  |j                        }	t        j                   j'                  |	|| j(                        }	||	|z  }	t        j                  |	|      }|j                  dd      j+                         }||	fS )Nr|   r	   r=   relative_keyrelative_key_queryro   r@   r%   r   r   zbhld,lrd->bhlrr   zbhrd,lrd->bhlrr   )r*   rf   )ptraining)r-   matmul	transposehasattrr=   rc   rS   r0   rp   viewdistance_embeddingrT   re   rf   einsumr   
functionalsoftmaxfloat32rP   r   
contiguous)r   r   r   r   rg   r   rP   r   r   attn_weights
seq_lengthposition_ids_lposition_ids_rdistancepositional_embeddingrelative_position_scoresrelative_position_scores_queryrelative_position_scores_keycausal_maskattn_outputs                       r5   eager_attention_forwardr      s#    <<s}}Q':;gELv01f6T6T Y 7 [[^
j

<K^K^_ddegijkj

<K^K^_ddefhjk!N2%88FDbDb9bef9fg366U[[6I))^;',||4DeMa'b$++/CC-2\\:JESg-h*+0<<8H#Oc+d('EHd'd$#&>>!$Q1o		"o%=>#k1==((2U]](SVVW\WbWbcL==((6??([L#i/,,|U3K''1-88:K$$r7   c                        e Zd Zd
 fd	Z	 	 	 	 ddej
                  deej                     deej                     deej                     deej                     dee	   de
ej
                     fd	Z xZS )EvollaSaProtSelfAttentionc                 v   t         |           || _        |j                  |j                  z  dk7  r2t        |d      s&t        d|j                   d|j                   d      |j                  | _        t        |j                  |j                  z        | _        | j                  | j                  z  | _	        t        j                  |j                  | j                        | _        t        j                  |j                  | j                        | _        t        j                  |j                  | j                        | _        |j                  | _        |xs t#        |dd      | _        d | _        | j$                  dk(  s| j$                  d	k(  rG|j(                  | _        t        j*                  d
|j(                  z  dz
  | j                        | _        n*| j$                  dk(  rt/        | j                        | _        |j0                  | _        || _        d| _        | j0                  xr | | _        y )Nr   embedding_sizezThe hidden size (z6) is not a multiple of the number of attention heads ()r=   r>   r   r   r|   r%   rotaryr)   r   )rC   rD   r[   rG   num_attention_headsr   
ValueErrorr,   attention_head_sizeall_head_sizer   Linearr   r   r   attention_probs_dropout_probrP   rQ   r=   rotary_embeddingsrT   rE   r   r   
is_decoder	layer_idxr   	is_causal)rZ   r[   r=   r   is_cross_attentionr\   s        r5   rD   z"EvollaSaProtSelfAttention.__init__  s    : ::a?PVXhHi#F$6$6#7 8 445Q8 
 $*#=#= #&v'9'9F<V<V'V#W !558P8PPYYv1143E3EF
99V//1C1CDYYv1143E3EF
::'> (
'-zC
$ "&''>9T=Y=Y]q=q+1+I+ID(&(ll1v7U7U3UXY3Y[_[s[s&tD#))X5%@TE]E]%^D" ++"C1C-Cr7   hidden_statesrg   r   encoder_hidden_statesencoder_attention_maskr   r   c                    |j                   d d \  }}||d| j                  f}	| j                  |      j                  |	      j	                  dd      }
|d u}|r|n|}|r|n|}| j                  |      j                  |	      j	                  dd      }| j                  |      j                  |	      j	                  dd      }|
| j                  dz  z  }
| j                  dk(  r| j                  |
|      \  }
}t        }| j                  j                  dk7  r[| j                  dv r0t        d| j                  j                   d	| j                   d
      t        | j                  j                     } || |
|||f| j                  sdn| j                  | j                   |d|\  }}|j#                  ||d      j%                         }||fS )Nr@   r%   r|         r   eagerr   zESM z attention does not support z^ embeddings. Set attention explicitly to 'eager' with `model.set_attn_implementation('eager')`r^   )rP   r   r   )rc   r   r   r   r   r   r   r=   r   r   r[   _attn_implementationr   r   r   rP   r   reshaper   )rZ   r   rg   r   r   r   r   
batch_sizer   hidden_shapequery_layerr   current_states	key_layervalue_layerattention_interfacer   r   s                     r5   rm   z!EvollaSaProtSelfAttention.forward4  s    "/!4!4Sb!9
J"JD4L4LMjj/44\BLLQPQR2$>2D.-3E/>HH^,11,?II!QO	jj055lCMMaQRS "D$<$<d$BB''83%)%;%;K%S"K(?;;++w6++/UU 4;;;;<<XY]YuYuXv wh h  #:$++:Z:Z"[$7
%
  $}}C$,,LL
%
 
%
!\ "))*j"EPPRL((r7   )NNFrt   )ru   rv   rw   rD   r-   r   r   FloatTensorr   r   r   rm   ry   rz   s   @r5   r   r     s     DJ 7;15=A>B3)||3) !!2!233) E--.	3)
  ((9(9:3) !)):): ;3) +,3) 
u||	3)r7   r   c                   $     e Zd Z fdZd Z xZS )EvollaSaProtSelfOutputc                     t         |           t        j                  |j                  |j                        | _        t        j                  |j                        | _        y N)	rC   rD   r   r   rG   denserN   rO   rP   rY   s     r5   rD   zEvollaSaProtSelfOutput.__init__k  sB    YYv1163E3EF
zz&"<"<=r7   c                 T    | j                  |      }| j                  |      }||z   }|S r   r   rP   rZ   r   input_tensors      r5   rm   zEvollaSaProtSelfOutput.forwardp  .    

=1]3%4r7   ru   rv   rw   rD   rm   ry   rz   s   @r5   r   r   j      >
r7   r   c                   B     e Zd Zd fd	Zd Z	 	 	 	 ddee   fdZ xZS )EvollaSaProtAttentionc                     t         |           t        |||      | _        t	        |      | _        t               | _        t        j                  |j                  |j                        | _	        y )N)r   r   r;   )rC   rD   r   rZ   r   outputsetpruned_headsr   rK   rG   rL   )rZ   r[   r   r   r\   s       r5   rD   zEvollaSaProtAttention.__init__x  sS    -f	^pq	,V4Ef&8&8f>S>STr7   c                 >   t        |      dk(  ry t        || j                  j                  | j                  j                  | j
                        \  }}t        | j                  j                  |      | j                  _        t        | j                  j                  |      | j                  _        t        | j                  j                  |      | j                  _	        t        | j                  j                  |d      | j                  _        | j                  j                  t        |      z
  | j                  _        | j                  j                  | j                  j                  z  | j                  _        | j
                  j                  |      | _        y )Nr   r%   r)   )lenr   rZ   r   r   r   r   r   r   r   r   r   r   union)rZ   headsindexs      r5   prune_headsz!EvollaSaProtAttention.prune_heads  s   u:?749900$))2O2OQUQbQb
u
 -TYY__eD		*499==%@		,TYY__eD		.t{{/@/@%QO )-		(E(EE
(R		%"&))"?"?$))B_B_"_		 --33E:r7   r   c                     | j                  |      } | j                  |f||||d|\  }}	| j                  ||      }|S )Nrg   r   r   r   )rK   rZ   r   )
rZ   r   rg   r   r   r   r   hidden_states_lnr   _s
             r5   rm   zEvollaSaProtAttention.forward  s]      >>-8"
)"7#9
 
Q kk+}=r7   )NFrt   )	ru   rv   rw   rD   r   r   r   rm   ry   rz   s   @r5   r   r   w  s1    U;* "# +,r7   r   c                 j    | dz  dt        j                  | t        j                  d      z        z   z  S )zz
    This is the gelu implementation from the original EVOLLA_SA_PROT repo. Using F.gelu yields subtly wrong results.
    g      ?r   g       @)r-   erfmathsqrt)r   s    r5   gelur    s.     s7cEIIa$))C.&899::r7   c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )EvollaSaProtIntermediatec                     t         |           t        j                  |j                  |j
                        | _        y r   )rC   rD   r   r   rG   intermediate_sizer   rY   s     r5   rD   z!EvollaSaProtIntermediate.__init__  s,    YYv1163K3KL
r7   r   r   c                 >    | j                  |      }t        |      }|S r   )r   r  )rZ   r   s     r5   rm   z EvollaSaProtIntermediate.forward  s     

=1]+r7   ru   rv   rw   rD   r-   r   rm   ry   rz   s   @r5   r  r    s$    MU\\ ell r7   r  c                   $     e Zd Z fdZd Z xZS )EvollaSaProtOutputc                     t         |           t        j                  |j                  |j
                        | _        t        j                  |j                        | _	        y r   )
rC   rD   r   r   r
  rG   r   rN   rO   rP   rY   s     r5   rD   zEvollaSaProtOutput.__init__  sB    YYv779K9KL
zz&"<"<=r7   c                 T    | j                  |      }| j                  |      }||z   }|S r   r   r   s      r5   rm   zEvollaSaProtOutput.forward  r   r7   r   rz   s   @r5   r  r    r   r7   r  c                   @     e Zd Z fdZ	 	 	 	 ddee   fdZd Z xZS )EvollaSaProtLayerc                    t         |           |j                  | _        d| _        t	        |      | _        |j                  | _        |j                  | _        | j                  r,| j                  st        |  d      t	        |d      | _	        t        |      | _        t        |      | _        t        j                  |j                   |j"                        | _        y )Nr%   z> should be used as a decoder model if cross attention is addedT)r   r;   )rC   rD   chunk_size_feed_forwardseq_len_dimr   	attentionr   add_cross_attentionRuntimeErrorcrossattentionr  intermediater  r   r   rK   rG   rL   rY   s     r5   rD   zEvollaSaProtLayer.__init__  s    '-'E'E$.v6 ++#)#=#= ##??"dV+i#jkk"7SW"XD4V<(0f&8&8f>S>STr7   r   c                      | j                   |f||d|}| j                  r5|3t        | d      st        d|  d       | j                  |f||||d|}| j                  |      }|S )N)rg   r   r  z'If `encoder_hidden_states` are passed, z` has to be instantiated with cross-attention layers by setting `config.add_cross_attention=True`r   )r  r   r   AttributeErrorr  feed_forward_chunk)	rZ   r   rg   r   r   r   r   attention_outputlayer_outputs	            r5   rm   zEvollaSaProtLayer.forward  s     *4>>
)
 	
 ??4@4!12$=dV D` ` 
  3t22  -#&;'=    ../?@r7   c                 n    | j                  |      }| j                  |      }| j                  ||      }|S r   )rK   r  r   )rZ   r  attention_output_lnintermediate_outputr  s        r5   r  z$EvollaSaProtLayer.feed_forward_chunk  s<    "nn-=>"//0CD{{#68HIr7   rt   )	ru   rv   rw   rD   r   r   rm   r  ry   rz   s   @r5   r  r    s2    U$ "#! +,!Fr7   r  c                   D     e Zd Z fdZe	 	 	 	 ddee   fd       Z xZS )EvollaSaProtEncoderc                 0   t         |           || _        t        j                  t        |j                        D cg c]  }t        |       c}      | _        t        j                  |j                  |j                        | _        d| _        y c c}w )Nr;   F)rC   rD   r[   r   
ModuleListrangenum_hidden_layersr  layerrK   rG   rL   emb_layer_norm_aftergradient_checkpointing)rZ   r[   r  r\   s      r5   rD   zEvollaSaProtEncoder.__init__  sn    ]]uVMeMeGf#g!$5f$=#gh
$&LL1C1CI^I^$_!&+# $hs   Br   c           	          t        | j                        D ]  \  }}|||   nd }	 ||f||	||d|} | j                  r| j                  |      }t        |      S )Nr   )last_hidden_state)	enumerater)  r*  r   )
rZ   r   rg   r   r   r   r   ilayer_modulelayer_head_masks
             r5   rm   zEvollaSaProtEncoder.forward  s~      )4 		OA|.7.CilO(-)&;'= M		 $$ 55mDM1MRRr7   rt   )	ru   rv   rw   rD   r!   r   r   rm   ry   rz   s   @r5   r$  r$     s=    ,  "#S +,S Sr7   r$  c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )EvollaSaProtPoolerc                     t         |           t        j                  |j                  |j                        | _        t        j                         | _        y r   )rC   rD   r   r   rG   r   Tanh
activationrY   s     r5   rD   zEvollaSaProtPooler.__init__$  s9    YYv1163E3EF
'')r7   r   r   c                 \    |d d df   }| j                  |      }| j                  |      }|S )Nr   )r   r6  )rZ   r   first_token_tensorpooled_outputs       r5   rm   zEvollaSaProtPooler.forward)  s6     +1a40

#566r7   r  rz   s   @r5   r3  r3  #  s#    $
U\\ ell r7   r3  c                   d    e Zd ZU eed<   dgZdZdZdZe	 e
edd      g e
edd      gdZd	 Zy
)EvollaSaProtPreTrainedModelr[   r  Tr%   r  )r   
layer_namer  )r   
attentionscross_attentionsc                    | j                   j                  }t        |t        j                        rY|j
                  j                  j                  d|       |j                  %|j                  j                  j                          yyt        |t        j                        rf|j
                  j                  j                  d|       |j                  2|j
                  j                  |j                     j                          yyt        |t        j                        rJ|j                  j                  j                          |j
                  j                  j                  d       yy)zInitialize the weightsr^   meanstdNr   )r[   initializer_range
isinstancer   r   weightdatanormal_biaszero_rE   r2   rK   fill_)rZ   r   rB  s      r5   _init_weightsz)EvollaSaProtPreTrainedModel._init_weightsB  s    kk++fbii(MM&&CS&9{{&  &&( '-MM&&CS&9!!-""6#5#56<<> .-KK""$MM$$S) .r7   N)ru   rv   rw   r'   r   _no_split_modules_supports_flash_attn_supports_sdpa_supports_attention_backendr  r#   r   _can_record_outputsrK   r7   r5   r;  r;  2  sX    ,-N"& +%&?qU`ab4AJZ[
*r7   r;  c                        e Zd Zdef fdZd Zd Zd Ze	 dde	e
j                     de	e
j                     deee
j                     ef   fd	       Z	 dded
ee   de
j"                  de
j$                  def
dZ xZS )EvollaSaProtProteinEncoderr[   c                 d    t         |   |       t        |      | _        t	        |      | _        y r   )rC   rD   r9   ri   r$  encoderrY   s     r5   rD   z#EvollaSaProtProteinEncoder.__init__S  s(     08*62r7   c                 .    | j                   j                  S r   ri   rI   rZ   s    r5   get_input_embeddingsz/EvollaSaProtProteinEncoder.get_input_embeddingsX  s    ...r7   c                 &    || j                   _        y r   rW  rZ   r   s     r5   set_input_embeddingsz/EvollaSaProtProteinEncoder.set_input_embeddings[  s    */'r7   c                     |j                         D ]7  \  }}| j                  j                  |   j                  j	                  |       9 y)z
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        N)itemsrU  r)  r  r   )rZ   heads_to_pruner)  r   s       r5   _prune_headsz'EvollaSaProtProteinEncoder._prune_heads^  sE    
 +002 	CLE5LLu%//;;EB	Cr7   r1   rg   r   c                 N   |j                         }|\  }}|j                  }|t        j                  ||f|      }| j	                  ||      }| j                  ||      }| j                  ||      }	|	d   }
t        |
|	j                  |	j                  |	j                        S )Nr   r1   rg   )rg   r   )r-  r   r=  r>  )rq   rp   r-   onesri   get_extended_attention_maskrU  r   r   r=  r>  )rZ   r1   rg   rr   r   r   rp   rh   extended_attention_maskencoder_outputssequence_outputs              r5   rm   z"EvollaSaProtProteinEncoder.forwardf  s      nn&!,
J!!!"ZZ*j)A6RN)N["&"B"B>S^"_,,}E\,])!,;-)77&11,==	
 	
r7   rr   rp   rf   c                 4   |t        |       }|j                         dk(  r| j                  j                  s|t	        j
                  dt               |j                         dk(  r|dddddddf   }nk|j                         dk(  r<| j                  j                  rt        j                  |||      }n*|ddddddf   }nt        d| d|j                   d      |j                  |      }d	|z
  t        j                  |      j                  z  }|S )
a  
        Makes broadcastable attention and causal masks so that future and masked tokens are ignored.

        Arguments:
            attention_mask (`torch.Tensor`):
                Mask with ones indicating tokens to attend to, zeros for tokens to ignore.
            input_shape (`Tuple[int]`):
                The shape of the input to the model.

        Returns:
            `torch.Tensor` The extended attention mask, with a the same dtype as `attention_mask.dtype`.
        Nr|   zNThe `device` argument is deprecated and will be removed in v5 of Transformers.r	   z!Wrong shape for input_ids (shape z) or attention_mask (shape r   r   r   )r   r*   r[   r   warningswarnFutureWarningr   *create_extended_attention_mask_for_decoderr   rc   re   r-   finfomin)rZ   rg   rr   rp   rf   re  s         r5   rd  z6EvollaSaProtProteinEncoder.get_extended_attention_mask  s"    ='-E""$)dkk.D.D!dfs
 1$&4Qa]&C#!Q& {{%%*:*e*e+' +9D$9I*J'3K=@[\j\p\p[qqrs  #:"<"<5"<"I#&)@#@EKKPUDVDZDZ"Z&&r7   r   NN)ru   rv   rw   r'   rD   rY  r\  r`  r$   r   r-   r   r   r   r   rm   r,   rp   rd   rd  ry   rz   s   @r5   rS  rS  R  s    3| 3
/0C  26
ELL)
 !.
 
uU\\"$PP	Q	
 
2 rv2'$2'38:2'GL||2'chcncn2'	2'r7   rS  c                   &     e Zd Zd fd	Zd Z xZS )!EvollaSequenceCompressorAttentionc                 j   t         |           |dz  | _        || _        ||z  }t	        j
                  |      | _        t	        j
                  |      | _        t	        j                  ||d      | _	        t	        j                  ||dz  d      | _
        t	        j                  ||d      | _        y )Nr   FrH  r|   )rC   rD   scaler   r   rK   
norm_medianorm_latentsr   to_qto_kvto_out)rZ   r*   dim_headr   	inner_dimr\   s        r5   rD   z*EvollaSequenceCompressorAttention.__init__  s    t^

u$	,,s+LL-IIc959	YYsIM>
ii	3U;r7   c                 F   | j                  |      }| j                  |      }| j                  }| j                  |      }t	        j
                  ||fd      }| j                  |      j                  dd      \  }}|j                  |j                  d      |j                  d      |d      j                  dddd      }|j                  |j                  d      |j                  d      |d      j                  dddd      }|j                  |j                  d      |j                  d      |d      j                  dddd      }|| j                  z  }t	        j                  ||j                  dd            }	|	|	j                  dd	      j                         z
  }	|	j                   \  }
}}}t	        j"                  ||      j%                  |j&                        }|d
d
d
d
d
d
f   }|d
d
d
d
d
d
f   }||z  }|	j)                  d|z
  j+                         d      }	|	j-                  d      }t	        j                  ||      }|j                  dddd      }|j/                  |j                  d      |j                  d      d      }| j1                  |      S )z
        Args:
            x (torch.Tensor): image features
                shape (b, n1, D)
            latent (torch.Tensor): latent features
                shape (b, n2, D);  n2: num of latent tokens
        r   r)   r|   r@   r   r%   r	   Tr*   keepdimNg     )ru  rv  r   rw  r-   r~   rx  r}   r   rq   permutert  r   r   amaxdetachrc   rc  re   rp   r`   boolr   r   ry  )rZ   r   latentsr3   hr   kv_inputr   vsimbsnhskdokdrc  mask_expones_expattnouts                      r5   rm   z)EvollaSequenceCompressorAttention.forward  sB    OOA##G,JJIIg99a\r2zz(#))2 * 
1 FF166!9affQiB/771aCFF166!9affQiB/771aCFF166!9affQiB/771aC

N ll1akk"b12CHHTH299;;99BSzz"c"%%dkk24q()aD()("ooq4xoo/6{{r{"ll4#kk!Q1% kk#((1+sxx{B7{{3r7   )@      r   rz   s   @r5   rq  rq    s    <) r7   rq  c                   &     e Zd Zd fd	Zd Z xZS )EvollaFeedForwardc                    t         |           t        ||z        }t        j                  |      | _        t        j                  ||d      | _        t        j                         | _	        t        j                  ||d      | _
        y NFrs  )rC   rD   r,   r   rK   normr   fc1GELUr6  fc2)rZ   r*   multr{  r\   s       r5   rD   zEvollaFeedForward.__init__  s`    d
O	LL%	99S)%8'')99Y%8r7   c           	      ~    | j                  | j                  | j                  | j                  |                        S r   )r  r6  r  r  )rZ   r   s     r5   rm   zEvollaFeedForward.forward  s+    xx1(>?@@r7   )   r   rz   s   @r5   r  r    s    9Ar7   r  c                   *     e Zd Zdef fdZd Z xZS )!EvollaSequenceCompressorResamplerr[   c           
         t         |           |j                  j                  }|j                  | _        t        j                  t        j                  | j
                  |      d      | _
        t        j                  g       | _        t        |j                        D ]g  }| j                  j                  t        j                  t!        ||j"                  |j$                        t'        ||j(                        g             i t        j*                  |j                        | _        t        j.                  ||j                        | _        y )NT)requires_grad)r*   rz  r   )r*   r  )rC   rD   protein_encoder_configrG   resampler_num_latentsnum_latentsr   	Parameterr-   randnr  r&  layersr'  resampler_depthappendrq  resampler_dim_headresampler_headsr  resampler_ff_multrK   r  r   protein_projector)rZ   r[   protein_repr_dimr  r\   s       r5   rD   z*EvollaSequenceCompressorResampler.__init__  s   !88DD!77||EKK0@0@BR$ScghmmB'v--. 
	AKK9 06;T;T\b\r\r *.>VE]E]^		
	 LL!3!34	!#+;V=O=O!Pr7   c                 j   |j                   d   }|j                   \  }}t        j                  || j                        j	                  |j
                        }t        j                  ||fd      }t        j                  |      j	                  | j                  j
                        }| j                  d    |j                  ddd      z  }|j	                  |j                        }| j                  D ]  \  }	}
 |	|||      |z   } |
|      |z   } | j                  |      }| j                  |      S )Nr   r%   r)   r@   )rc   r-   rc  r  re   rp   r~   r  r   rf   r  r  r  )rZ   embedsr3   br  r  latent_maskrc  r  r  fftransformed_features               r5   rm   z)EvollaSequenceCompressorResampler.forward  s   LLO

AjjT%5%5699$++Fyy$,!4 zz!} 3 34,,t$tyyQ'::**V\\* 	,HD"67D1G;GkG+G	, #44W=yy,--r7   )ru   rv   rw   r&   rD   rm   ry   rz   s   @r5   r  r    s    Q| Q*.r7   r  c                       e Zd ZU dZej
                  ed<   dZeej
                     ed<   dZ	ee
ej
                  df      ed<   dZee
ej
                  df      ed<   y)EvollaProteinEncoderModelOutputNsequence_compressor_outputr-  .r   r=  )ru   rv   rw   r  r-   r   r   r-  r   r   r   r=  rQ  r7   r5   r  r  &  si     59 1 1859x 1 129=AM8E%"3"3S"89:A:>Ju00#567>r7   r  c                   f     e Zd Zdef fdZedej                  dej                  fd       Z	 xZ
S )EvollaProteinEncoderr[   c                 z    t         |           t        |j                        | _        t        |      | _        y )Nr[   )rC   rD   rS  r  modelr  sequence_compressor_resamplerrY   s     r5   rD   zEvollaProteinEncoder.__init__0  s.    /v7T7TU
-NV\-]*r7   r1   rg   c                     | j                  ||      }|j                  }| j                  ||      }t        ||j                        S )Nrb  )r  r-  )r  r-  r  r  )rZ   r1   rg   r   protein_outputprotein_embedssequence_reprs          r5   rm   zEvollaProteinEncoder.forward5  sJ    iW'99::>>Z.'4,>>
 	
r7   )ru   rv   rw   r&   rD   r!   r-   
LongTensorr   rm   ry   rz   s   @r5   r  r  /  s?    ^| ^
 
!1!1 
5CTCT 
 
r7   r  c                   ~     e Zd Z	 	 	 ddee   dee   dee   f fdZd Z eddd	      	 	 	 	 	 	 	 dd
       Z xZ	S )#EvollaSequenceAlignerCrossAttentionprotein_encoder_dimstructure_encoder_dimmsa_encoder_dimc                    t         |           |j                  | _        |j                  | _        | j                  dz  | _        t        | j                  | j                  z        | _        | j                  | j                  z  | _        |j                  }|j                  }|j                  }t        j                  | j                  | j                        | _        |Kt        j                  || j                        | _        t        j                  || j                        | _        nd | _        d | _        |Kt        j                  || j                        | _        t        j                  || j                        | _        nd | _        d | _        |Kt        j                  || j                        | _        t        j                  || j                        | _        nd | _        d | _        t)        | j                        | _        t        j,                  |      | _        t        j                  | j                  | j                  |      | _        t3        | j                  |      | _        t        j6                  t9        j:                  dg            | _        t        j6                  t9        j:                  dg            | _        y )Nr   rs  r^   ) rC   rD   rG   r   rt  r,   r   r   $aligner_attention_probs_dropout_probaligner_enable_biasaligner_ffn_multr   r   r   key_proteinvalue_proteinkey_structurevalue_structurekey_msa	value_msaEvollaRMSNormattention_normrN   rP   out_projr  r  r  r-   tensorgate_attentiongate_ffw)	rZ   r[   r  r  r  r   enable_biasffn_multr\   s	           r5   rD   z,EvollaSequenceAlignerCrossAttention.__init__B  s    	!--#)#=#= --t3
#&t'7'7$:R:R'R#S !558P8PP'-'R'R$00**YYt//1C1CD
*!yy)<d>P>PQD!#+>@R@R!SD#D!%D ,!#+@$BTBT!UD#%99-BDDVDV#WD !%D#'D &99_d6H6HIDLYY8J8JKDNDL!DN+D,<,<=zz">?		$"2"2D4D4D;W#D$4$4h? ll5<<+>?U\\3%%89r7   c	                    |||g}	|	D 
cg c]  }
|
|
	 }	}
|	st        d      t        j                  |	d      }	| j                  |      }| j	                  |      }| j
                  @| j                  4|j                  |      }| j                  |      }| j                  |      }nd}d}| j                  @| j                  4|j                  |      }| j                  |      }| j                  |      }nd}d}| j                  @| j                  4|j                  |      }| j                  |      }| j                  |      }nd}d}|||g}|D 
cg c]  }
|
|
	 }}
t        j                  |d      }|||g}|D 
cg c]  }
|
|
	 }}
t        j                  |d      }|j                         dd | j                  | j                  fz   } |j                  | j!                  dddd      }|j                         dd | j                  | j                  fz   } |j                  | j!                  dddd      }|j                         dd | j                  | j                  fz   } |j                  | j!                  dddd      }|| j"                  z  }|Mt        j$                  |j                  d      |j                  d            j                  |j&                        }|ddddddf   |	ddddddf   z  }t        j(                  ||j+                  dd	            }||j-                  dd
      j/                         z
  }|j1                  d|z
  j3                         t        j4                  |j6                        j8                        } t;        j<                  d      |      }t        j(                  ||      }|j!                  dddd      j?                         }|j                         dd	 | j@                  fz   } |j                  | }| jC                  |      }|S c c}
w c c}
w c c}
w )z
        query_states: text
        key_value_states: protein
        query_states: [bs, query_seq_len, dim]
        key_value_states: [bs, kv_seq_len, dim]
        query_attn_mask: [bs, query_seq_len]
        kv_attn_mask: [bs, kv_seq_len]
        Nz=At least one modality should be provided for cross attention.r%   r)   r@   r   r|   r	   r   Tr}  )"r   r-   r~   r  r   r  r  re   r  r  r  r  rq   r   r   r   r  rt  rc  rp   r   r   r  r  r`   r  rm  rf   rn  r   Softmaxr   r   r  )rZ   query_statesprotein_key_value_statesstructure_key_value_statesmsa_key_value_statesquery_attn_maskprotein_kv_attn_maskstructure_kv_attn_maskmsa_kv_attn_maskkv_attn_maskr  r   key_layer_proteinvalue_layer_proteinkey_layer_structurevalue_layer_structurekey_layer_msavalue_layer_msar   r   new_query_layer_shapenew_key_layer_shapenew_value_layer_shaperg   r   attention_scoresattention_probscontext_layernew_context_layer_shapes                                r5   cross_attentionz3EvollaSequenceAlignerCrossAttention.cross_attentionu  si   * -.DFVW#/Aa1=AA\]]yy15)),7 jj-'D,>,>,J'?'B'B<'P$ $ 0 01I J"&"4"45M"N $"&)d.B.B.N)C)F)F|)T&"&"4"45O"P$($8$89S$T!"&$(!<<#(B#7#:#:<#H  LL)=>M"nn-ABO M"O&(;]K	 );1Q]Q;	;IIiQ/	*,A?S"-?Qq??ii3 + 0 0 23B 7$$$$;
 !
 'k&&(=>FFq!QPQR'nn.s3$$$$7
 
 #INN$78@@Aq!L	 + 0 0 23B 7$$$$;
 !
 'k&&(=>FFq!QPQR!DJJ. "#jj):):1)=|?P?PQR?STWWXdXkXklO(D!T)9:\!TSWYZJZ=[[||K1D1DR1LM#l&7&7B&7&M&T&T&VV'33%%'\5G5G)H)L)L
 -"**,-=> _kB%--aAq9DDF"/"4"4"6s";t?Q?Q>S"S***,CDm4q BL < @s"   P5P5P:P:P?P?past_key_valuepast_key_values4.58new_nameversionc           
      ^   |z|j                   \  }}}|jt        j                  ||      j                  |	j                        |	j                  ||f      j                  z  j                  |j                        }nd }|z|j                   \  }}}|jt        j                  ||      j                  |	j                        |
j                  ||f      j                  z  j                  |j                        }nd }|z|j                   \  }}}|jt        j                  ||      j                  |	j                        |j                  ||f      j                  z  j                  |j                        }nd }|}||j                         s$||j                         s||j                         rz|}| j                  ||||||||      }t        j                  | j                        |z  }||z   }|}| j                  |      t        j                  | j                        z  }||z   }|S )N)rq   )r  r  r  r  r  r  r  r  )rc   r-   rc  re   rp   rU   Tanyr  tanhr  r  r  )rZ   r  protein_kv_statesstructure_kv_statesmsa_kv_statesr  r  r  r  protein_batch_maskstructure_batch_maskmsa_batch_maskr  r  protein_kv_seq_lenr*   structure_kv_seq_lenmsa_kv_seq_lenr   residuals                       r5   rm   z+EvollaSequenceAlignerCrossAttention.forward  sL     (*;*A*A'B"C#+JJr#5699:L:S:ST(//6H"5M/NPPQ"&--. %
 $( *,?,E,E)B$c%-JJr#78;;<N<U<UV*118Lb7Q1RTTU"(//0 '
 &*"$&3&9&9#B'JJr>2556H6O6OP$++."1E+FHHI"]))* !
  $$ */C/G/G/I#/4J4N4N4P).>.B.B.D$H 00*):+>%2 /%9'=!1 1 	M "JJt':':;mKM$}4M$H GGM2UZZ5NNM$}4Mr7   )NNNNNNNNNN)
ru   rv   rw   r   r,   rD   r  r"   rm   ry   rz   s   @r5   r  r  A  s~     .2/3)-1: &c]1:  (}	1:
 "#1:fn` %0A6R "#!G SGr7   r  RMSNormc                   ,     e Zd Zd fd	Zd Zd Z xZS )r  c                     t         |           t        j                  t	        j
                  |            | _        || _        y)z<
        EvollaRMSNorm is equivalent to T5LayerNorm
        N)rC   rD   r   r  r-   rc  rE  variance_epsilon)rZ   rG   r<   r\   s      r5   rD   zEvollaRMSNorm.__init__2  s1     	ll5::k#:; #r7   c                 "   |j                   }|j                  t        j                        }|j	                  d      j                  dd      }|t        j                  || j                  z         z  }| j                  |j                  |      z  S )Nr|   r@   T)r~  )	rf   re   r-   r   powrA  rsqrtr
  rE  )rZ   r   input_dtypevariances       r5   rm   zEvollaRMSNorm.forward:  sy    #))%((7 $$Q',,R,>%Ht?T?T4T(UU{{]--k:::r7   c                 ^    t        | j                  j                         d| j                   S )Nz, eps=)r   rE  rc   r
  rX  s    r5   
extra_reprzEvollaRMSNorm.extra_reprA  s*    ))*+6$2G2G1HIIr7   )gư>)ru   rv   rw   rD   rm   r  ry   rz   s   @r5   r  r  0  s    $;Jr7   r  c                   ~     e Zd ZU ej                  ed<   ddef fdZ ej                         e	d               Z
 xZS )EvollaRotaryEmbeddingr   r[   c                    t         |           t        |d      rUt        |j                  t
              r;|j                  j                  d|j                  j                  d            | _        nd| _        |j                  | _	        |j                  | _
        || _        t        | j                     | _        | j                  | j                  |      \  }| _        | j                  d|d       | j                   | _        y )Nrope_scaling	rope_typetypedefaultr   FrA   )rC   rD   r   rD  r  dictgetr  rT   max_seq_len_cachedoriginal_max_seq_lenr[   r   rope_init_fnattention_scalingrR   r   original_inv_freq)rZ   r[   rp   r   r\   s       r5   rD   zEvollaRotaryEmbedding.__init__H  s    6>*z&:M:Mt/T#0044[&BUBUBYBYZ`BabDN&DN"("@"@$*$B$B!/?+/+<+<T[[&+Q($(ZeD!%r7   c                 b   | j                   d d d d f   j                         j                  |j                  d   dd      j	                  |j
                        }|d d d d d f   j                         }t        |j
                  j                  t              r/|j
                  j                  dk7  r|j
                  j                  nd}t        j                  |d      5  |j                         |j                         z  j                  dd      }t        j                  ||fd	      }|j                         | j                  z  }|j                         | j                  z  }	d d d        j	                  |j                   
      	j	                  |j                   
      fS # 1 sw Y   AxY w)Nr   r@   r%   mpscpuF)device_typeenabledr|   r)   r   )r   rd   rU   rc   re   rp   rD  r  strr-   autocastr   r~   r   r  r   rf   )
rZ   r   r?   inv_freq_expandedposition_ids_expandedr#  r   r   r   r   s
             r5   rm   zEvollaRotaryEmbedding.forwardY  sV    !MM$4-8>>@GGHZHZ[\H]_acdehhijiqiqr ,QaZ 8 > > @'1!((--'E!((--[`J`ahhmmfk^^UC 	5&,,.1F1L1L1NNYYZ[]^_E))UEN3C'')d444C'')d444C		5 vvAGGv$cff177f&;;;	5 	5s    BF%%F.r   )ru   rv   rw   r-   r   r   r&   rD   no_gradr   rm   ry   rz   s   @r5   r  r  E  s=    ll/| /" U]]_<  <r7   r  c                   $     e Zd Z fdZd Z xZS )	EvollaMLPc                    t         |           || _        |j                  | _        |j                  | _        t        j                  | j                  | j                  |j                        | _        t        j                  | j                  | j                  |j                        | _	        t        j                  | j                  | j                  |j                        | _
        t        |j                     | _        y )Nrs  )rC   rD   r[   rG   r
  r   r   mlp_bias	gate_projup_proj	down_projr
   
hidden_actact_fnrY   s     r5   rD   zEvollaMLP.__init__j  s    !--!'!9!94#3#3T5K5KRXRaRabyy!1!143I3IPVP_P_`4#9#94;K;KRXRaRabV../r7   c                     | j                  | j                  | j                  |            | j                  |      z        }|S r   )r0  r2  r.  r/  )rZ   r   r0  s      r5   rm   zEvollaMLP.forwardt  s6    NN4;;t~~a/@#ADLLQRO#ST	r7   r   rz   s   @r5   r+  r+  i  s    0r7   r+  c                     | dd| j                   d   dz  f   }| d| j                   d   dz  df   }t        j                  | |fd      S )z*Rotates half the hidden dims of the input..Nr@   r|   r)   )rc   r-   r~   r   s      r5   rotate_halfr5  y  sZ    	
3"!''"+"""	#B	
3q ""	#B99rc2YB''r7   c                     |j                  |      }|j                  |      }| |z  t        |       |z  z   }||z  t        |      |z  z   }||fS )a  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    )ra   r5  )r   r   r   r   r?   unsqueeze_dimq_embedk_embeds           r5   apply_rotary_pos_embr:    sY    ( --
&C
--
&C3w;q>C/0G3w;q>C/0GGr7   r   n_repr   c                     | j                   \  }}}}|dk(  r| S | dddddddddf   j                  |||||      } | j                  |||z  ||      S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    r%   N)rc   rU   r   )r   r;  batchnum_key_value_headsslenhead_dims         r5   	repeat_kvrA    so    
 2?1D1D.Ehz!!Qa"23::5BUW\^bdlmM  (;e(CT8TTr7   c                   *    e Zd ZdZdedef fdZ eddd      	 	 dd	ej                  d
e
ej                  ej                  f   deej                     dee   deej                     dee   de
ej                  ej                  f   fd       Z xZS )EvollaAttentionz=Multi-headed attention from 'Attention Is All You Need' paperr[   r   c                 d   t         |           || _        || _        t	        |d|j
                  |j                  z        | _        |j                  |j                  z  | _	        | j                  dz  | _
        |j                  | _        d| _        t        j                  |j
                  |j                  | j                  z  |j                        | _        t        j                  |j
                  |j                  | j                  z  |j                        | _        t        j                  |j
                  |j                  | j                  z  |j                        | _        t        j                  |j                  | j                  z  |j
                  |j                        | _        y )Nr@  r   Trs  )rC   rD   r[   r   rQ   rG   r   r@  r>  num_key_value_groupsr   attention_dropoutr   r   r   attention_biasq_projk_projv_projo_projrZ   r[   r   r\   s      r5   rD   zEvollaAttention.__init__  sM   "
F4F4F&JdJd4de$*$>$>&B\B\$\!}}d*!'!9!9ii : :T]] JQWQfQf
 ii : :T]] JQWQfQf
 ii : :T]] JQWQfQf
 ii&&68J8JQWQfQf
r7   r  r  r  r  r   rV   rg   cache_positionr   r   c                 4   |j                   d d }g |d| j                  }| j                  |      j                  |      j	                  dd      }	| j                  |      j                  |      j	                  dd      }
| j                  |      j                  |      j	                  dd      }|\  }}t        |	|
||      \  }	}
|'|||d}|j                  |
|| j                  |      \  }
}t        }| j                  j                  dk7  rt        | j                  j                     } || |	|
||f| j                  sdn| j                  | j                   d|\  }} |j"                  g |d j%                         }| j'                  |      }||fS )Nr@   r%   r|   )r   r   rM  r   r^   )rP   r   )rc   r@  rH  r   r   rI  rJ  r:  updater   r   r[   r   r   r   rF  r   r   r   rK  )rZ   r   rV   rg   r  rM  r   rr   r   r  
key_statesvalue_statesr   r   cache_kwargsr   r   r   s                     r5   rm   zEvollaAttention.forward  s    $))#2.88b8$--8{{=166|DNNqRST[[/44\BLLQPQR
{{=166|DNNqRST&S#7jRUWZ#[ j&#&snUL'6'='=j,X\XfXfht'u$J(?;;++w6"9$++:Z:Z"[$7	%
  $}}C$2H2HLL	%
 	%
!\ *k));;;;FFHkk+.L((r7   ro  )ru   rv   rw   rx   r&   r,   rD   r"   r-   r   r   r   r   r  r   r   rm   ry   rz   s   @r5   rC  rC    s    G
| 
 
. %0A6R ,059))||)) #5<<#=>)) !.	))
 "%)) !!1!12)) +,)) 
u||U\\)	*)) S))r7   rC  c                        e Zd Zdedef fdZ eddd      	 	 	 	 	 	 	 	 	 	 	 	 ddej                  d	e	ej                  ej                  f   d
e
ej                     de
ej                     de
e   de
e   de
ej                     de
ej                     de
ej                     de
ej                     de
ej                     de
ej                     de
ej                     de
ej                     dej                  fd       Z xZS )EvollaDecoderLayerr[   r   c                    t         |           |j                  | _        t        ||      | _        t        |      | _        t        |j                  |j                        | _	        t        |j                  |j                        | _
        |dz   t        |j                  |j                  z  d      z  dk(  rt        ||j                        | _        y y )Nr[   r   r;   r%   r   )r  )rC   rD   rG   rC  	self_attnr+  mlpr  rms_norm_epsinput_layernormpost_attention_layernormmaxr(  aligner_num_add_layersr  adapterrL  s      r5   rD   zEvollaDecoderLayer.__init__  s    !--()LV$,V-?-?VEXEXY(5f6H6HfNaNa(b%MS!9!9V=Z=Z!Z\]^^bcc>$*$6$6DL dr7   r  r  r  r  r   rV   rg   r?   	use_cacherM  r  r  r  r  r   r  r  r   c                    |}| j                  |      } | j                  d|||||||d|\  }}||z   }|}| j                  |      }| j                  |      }||z   }t	        | d      r| j                  |||	|
||||      }|S )N)r   rg   r?   r  r_  rM  rV   r^  )r  r  r  r  r  r  r   r  rQ  )rZ  rW  r[  rX  r   r^  )rZ   r   rV   rg   r?   r  r_  rM  r  r  r  r  r   r  r  r   r  r  s                     r5   rm   zEvollaDecoderLayer.forward  s    & !,,]; *4>> 	
')%+) 3	
 	
q !=0 !55mD/ =04# LL*"3$7+ /#5%9- ) 	M r7   )NNNFNNNNNNNN)ru   rv   rw   r&   r,   rD   r"   r-   r   r   r   r  r   r  rm   ry   rz   s   @r5   rT  rT    sp   |   %0A6R
 2637+/$)59486:04597;15265||5 #5<<#=>5 !.	5
 u//05 "%5 D>5 !!1!125 $ELL15 &ell35  -5 %U\\25 'u||45 !.5 "%,,/5" 
#5 S5r7   rT  c                   ^     e Zd ZU eed<   dZdZg dZdgZdZ	dZ
dZdZdZeedZ fdZ xZS )	EvollaPreTrainedModelr[   r  T)rT  r  r  r  F)r   r=  c                    | j                   j                  }t        |   |       t	        |t
              rd|j                  j                          |j                  j                          |j                  j                  j                  j                  d       y t	        |t              r(|j                  j                  j                  d|       y y )Nr   r^   r@  )r[   rC  rC   rK  rD  r  r  rI  r  r  rE  rF  rJ  r  r  rG  )rZ   r   rB  r\   s      r5   rK  z#EvollaPreTrainedModel._init_weightsM  s    kk++f%fAB!!'')OO!!#!!((--33C8 ABNN''Sc': Cr7   )ru   rv   rw   r&   r   base_model_prefixsupports_gradient_checkpointingrL  _skip_keys_device_placementrM  rN  _supports_flex_attn_can_compile_fullgraphrO  rT  rC  rP  rK  ry   rz   s   @r5   rb  rb  7  s]    &*#
 $5"5 N!"'+%
; ;r7   rb  c            !           e Zd Zdef fdZd Zd Zee	 	 	 	 	 	 	 	 	 	 	 	 	 dde	j                  dee	j                     dee	j                     dee   d	ee	j                     d
ee   dee	j                     dee	j                     dee	j                     dee	j                     dee	j                     dee	j                     dee	j                     deeef   fd              Z xZS )EvollaModelr[   c           	      F   t         |   |       |j                  | _        |j                  | _        t        j                  | j                  |j                  | j                        | _        t        |      | _
        t        j                  t        |j                        D cg c]  }t        ||       c}      | _        t!        |j                  |j"                        | _        t'        |      | _        t+        |dd      | _        | j/                          y c c}w )Nr  rV  r;   r+  F)rC   rD   rH   r2   rF   r   rE   rG   embed_tokensr  protein_encoderr&  r'  r(  rT  r  r  rY  r  r  
rotary_embrQ   r+  	post_initrL  s      r5   rD   zEvollaModel.__init__Y  s     !.. ++LL&:L:LdN^N^_36Bmm "'v'?'?!@
 	 #!'
 "&"4"4&:M:MN	/v>&-f6NPU&V#s   $Dc                     | j                   S r   rl  rX  s    r5   rY  z EvollaModel.get_input_embeddingsn  s       r7   c                     || _         y r   rq  r[  s     r5   r\  z EvollaModel.set_input_embeddingsq  s
    !r7   r1   rg   r?   r  rh   r_  rM  protein_input_idsprotein_attention_maskstructure_feats	msa_featsr   r  r   c                    |du |duz  rt        d      || j                  |      }|r|t        | j                        }|F||j	                         nd}t        j                  |||j                  d   z   |j                        }||j                  d      }d}d}|S|	Q| j                  ||	      }|j                  }t        j                  dg|j                  d   z  |j                        }t        | j                  ||||	      }|}| j                  ||      }| j                  D ]  } ||f||||||||
|||||d
|} | j!                  |      }t#        ||      }|S )a;  
        protein_input_ids (torch.LongTensor):
            The input IDs for the protein sequence in structure-aware tokens. Should be of shape `(batch_size, protein_seq_length)` and type `torch.LongTensor`.
        protein_attention_mask (torch.Tensor):
            The attention mask for the protein sequence. Should be of shape `(batch_size, protein_seq_length)` and type `torch.Tensor`.
        structure_feats (torch.FloatTensor):
            The input IDs for purely structure-based features. Should be of shape `(batch_size, structure_seq_length, structure_feat_dim)` and type `torch.FloatTensor`. Dummy input for now.
        msa_feats (torch.FloatTensor):
            The input IDs for purely MSA-based features. Should be of shape `(batch_size, msa_seq_length, msa_feat_dim)` and type `torch.FloatTensor`. Dummy input for now.
        structure_batch_mask (torch.Tensor):
            The batch mask to decide which protein sequences are purely structure-based. Should be of shape `(batch_size)` and type `torch.Tensor`. Should be paired with `structure_feats`. Dummpy input for now.
        msa_batch_mask (torch.Tensor):
            The batch mask to decide which protein sequences are purely MSA-based. Should be of shape `(batch_size)` and type `torch.Tensor`. Should be paired with `msa_feats`. Dummpy input for now.
        Nz:You must specify exactly one of input_ids or inputs_embedsr  r   r%   r   rb  T)r[   input_embedsrg   rM  r  )rg   r?   r  r_  rM  rV   r  r  r  r  r   r  r  )r-  r  )r   rl  r   r[   get_seq_lengthr-   rS   rc   rp   ra   rm  r  r  r   rn  r  r  r   )rZ   r1   rg   r?   r  rh   r_  rM  rs  rt  ru  rv  r   r  r   past_seen_tokensprotein_featsr  protein_outputsr   r   rV   decoder_layerr   s                           r5   rm   zEvollaModel.forwardt  s   B -t";<YZZ  --i8M0*$++>O!CRC^==?de"\\ "2]5H5H5K"KTaThThN )33A6L!(-C-O"22+5 3 O ,FFM!&tf7H7N7Nq7Q.QZkZrZr!s(;;&))+
 & #oom\J![[ 	M)*) /#-$7"/$3'#5%9- . M	& 		-0(++
 r7   )NNNNNNNNNNNNN)ru   rv   rw   r&   rD   rY  r\  r    r$   r-   r  r   r   r   r   r  r   r   r   rm   ry   rz   s   @r5   rj  rj  X  sw   | *!"  '+1537+/59$(598<9=7;157;15b##b !.b u//0	b
 "%b   1 12b D>b !!1!12b $E$4$45b !) 6b "%"3"34b E--.b 'u||4b !.b  
u--	.!b  br7   rj  c                       e Zd Z fdZd Zd Zee	 	 	 	 	 	 	 ddej                  de
ej                     de
ej                     de
ej                     dej                  d	e
ej                     d
e
e   fd              Z xZS )EvollaForProteinText2Textc                     t         |   |       t        |      | _        |j                  | _        t        j                  |j                  | j                  d      | _        | j                          y r  )
rC   rD   rj  r  rF   r   r   rG   lm_headro  rY   s     r5   rD   z"EvollaForProteinText2Text.__init__  sQ      (
 ++yy!3!3T__5Qr7   c                 6    | j                   j                         S r   )r  rY  rX  s    r5   rY  z.EvollaForProteinText2Text.get_input_embeddings  s    zz..00r7   c                 8    | j                   j                  |      S r   )r  r\  r[  s     r5   r\  z.EvollaForProteinText2Text.set_input_embeddings  s    zz..u55r7   r1   rg   rh   labelsrs  rt  r_  c           
          | j                   d||||||d|}	|	d   }
| j                  |
      }d}|  | j                  d||| j                  d|}t	        |||	j
                  |	j                  |	j                        }|S )a,  
        protein_input_ids (torch.LongTensor):
            The input IDs for the protein sequence. Should be of shape `(batch_size, protein_seq_length)` and type `torch.LongTensor`.
        protein_attention_mask (torch.Tensor):
            The attention mask for the protein sequence. Should be of shape `(batch_size, protein_seq_length)` and type `torch.Tensor`.

        Example:

        ```python
        >>> from transformers import EvollaProcessor, EvollaForProteinText2Text
        >>> model = EvollaForProteinText2Text.from_pretrained("westlake/Evolla-10B-hf")
        >>> processor = EvollaProcessor.from_pretrained("westlake/Evolla-10B-hf")

        >>> protein_information = {
            "aa_seq": "your amino acid sequence",
            "foldseek": "your foldseek sequence",
        }
        >>> question = "What is the function of this protein?"
        >>> message = [
            {"role": "system", "content": "You are an AI expert that can answer any questions about protein."},
            {"role": "user", "content": question},
        ]

        >>> inputs = processor(proteins=[protein_information], messages_list=[message], return_tensors="pt", padding="longest")
        >>> outputs = model.generate(**inputs)

        >>> print(processor.batch_decode(outputs, skip_special_tokens=True))
        ```)r1   rg   rh   rs  rt  r_  r   N)logitsr  rF   )lossr  r  r   r=  rQ  )r  r  loss_functionrF   r   r  r   r=  )rZ   r1   rg   rh   r  rs  rt  r_  r   outputsr   r  r  
lm_outputss                 r5   rm   z!EvollaForProteinText2Text.forward  s    T $** 
)'/#9
 
  
m,%4%%iVFtibhiD+#33!//))

 r7   r  )ru   rv   rw   rD   rY  r\  r!   r    r-   r  r   r   r   r  rm   ry   rz   s   @r5   r  r    s    16  '+1559-1.29=$(?##? !.?   1 12	?
 ))*? !++? !) 6? D>?  ?r7   r  )r  rj  rb  )r^   N)Nr%   )]r  ri  dataclassesr   typingr   r   r   r-   r   r   activationsr
   cache_utilsr   r   
generationr   integrationsr   masking_utilsr   modeling_layersr   modeling_outputsr   r   r   r   r   modeling_rope_utilsr   r   modeling_utilsr   r   r   r   processing_utilsr   pytorch_utilsr   r   utilsr   r    r!   utils.deprecationr"   utils.genericr#   r$   configuration_evollar&   r'   r6   Moduler9   r   r   r   rd   r   r   r   r   r  r  r  r  r$  r3  r;  rS  rq  r  r  r  r  r  r  r  r+  r5  r:  r,   rA  rC  rT  rb  rj  r  __all__rQ  r7   r5   <module>r     s(  ,   ! , ,   ! . ) 7 / 9  L m m & Q I I 0 ? <4 ^=RYY ^=B(
2*
")) *
h (,/%II/%<</% 
/% <<	/%
 U\\*/% /% /% %/% '(/%dV)		 V)r
RYY 
-BII -`;ryy 
 
72 7t S"))  SF  */ * *>_'!< _'D7 		 7 tA		 A'.		 '.T ?k ?  ?
299 
$l")) l^ Y'JBII J (J(!<BII !<H		  (6	UU\\ 	U# 	U%,, 	UD)bii D)NF3 FR ;O ; ;@@' @FP 5 Pf Pr7   