
    h                        d Z ddlZddlmZmZmZ ddlZddlZddlmZ ddl	m
Z
mZmZ ddlmZ ddlmZmZmZmZmZ dd	lmZmZ dd
lmZ ddlmZmZ ddlmZmZm Z m!Z! ddl"m#Z#m$Z$ ddl%m&Z&  e!jN                  e(      Z)d Z*d Z+d Z,d Z-d Z. G d dej                  j^                        Z0 G d dej^                        Z1 G d dej^                        Z2	 	 dGdej^                  dejf                  dejf                  dejf                  deejf                     d e4d!e4d"eejf                     d#ee   fd$Z5 G d% d&ej^                        Z6 G d' d(ej^                        Z7 G d) d*ej^                        Z8 G d+ d,ej^                        Z9 G d- d.ej^                        Z: G d/ d0e      Z; G d1 d2ej^                        Z< G d3 d4ej^                        Z=e G d5 d6e             Z>e G d7 d8e>             Z?e G d9 d:e>             Z@ G d; d<ej^                        ZA ed=>       G d? d@e>             ZBe G dA dBe>             ZC G dC dDej^                        ZDdE ZEg dFZFy)HzPyTorch ESM model.    N)CallableOptionalUnion)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )GradientCheckpointingLayer)"BaseModelOutputWithCrossAttentions,BaseModelOutputWithPoolingAndCrossAttentionsMaskedLMOutputSequenceClassifierOutputTokenClassifierOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack) find_pruneable_heads_and_indicesprune_linear_layer)TransformersKwargsauto_docstringcan_return_tuplelogging)OutputRecordercheck_model_inputs   )	EsmConfigc                 b    | j                  dd      \  }}t        j                  | |fd      S )N   dim)chunktorchcat)xx1x2s      b/var/www/html/eduruby.in/venv/lib/python3.12/site-packages/transformers/models/esm/modeling_esm.pyrotate_halfr*   -   s/    WWQBWFB99rc2YB''    c                     |d d d d d | j                   d   d d f   }|d d d d d | j                   d   d d f   }| |z  t        |       |z  z   S )N)shaper*   )r&   cossins      r)   apply_rotary_pos_embr1   2   sX    
aMaggbkM1$
%C
aMaggbkM1$
%CGA,--r+   c                 j    | dz  dt        j                  | t        j                  d      z        z   z  S )zo
    This is the gelu implementation from the original ESM repo. Using F.gelu yields subtly wrong results.
    g      ?      ?g       @)r$   erfmathsqrtr&   s    r)   gelur8   9   s.     s7cEIIa$))C.&899::r+   c                 ,    | | j                  dd      z   S )zJMake layer symmetric in final two dimensions, used for contact prediction.r    r-   )	transposer7   s    r)   
symmetrizer;   @   s    q{{2r"""r+   c                     | j                  dd      }| j                  dd      }| j                  dd      }||z  }|j                  |       | |z
  }|S )z=Perform average product correct, used for contact prediction.r    T)keepdimsr-   )r    r-   )sumdiv_)r&   a1a2a12avg
normalizeds         r)   average_product_correctrE   E   s[    	
rD	!B	
rD	!B
%%4%
(C
r'CHHSMSJr+   c                        e Zd ZU dZej
                  ed<   def fdZd
dZ	dej
                  dej
                  de
ej
                  ej
                  f   fd	Z xZS )RotaryEmbeddingz
    Rotary position embeddings based on those in
    [RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer). Query and keys are transformed by rotation
    matrices which depend on their relative positions.
    inv_freqr"   c                     t         |           ddt        j                  d|dt        j                        j                         |z  z  z  }|}| j                  d|       d | _        d | _        d | _	        y )Nr3   i'  r   r   dtyperH   )
super__init__r$   arangeint64floatregister_buffer_seq_len_cached_cos_cached_sin_cached)selfr"   rH   	__class__s      r)   rM   zRotaryEmbedding.__init__Z   sl    %ELLC%++$N$T$T$VY\$\]^Z2#r+   c                 t   |j                   |   }|| j                  k7  s#| j                  j                  |j                  k7  r|| _        t	        j
                  |j                   |   |j                        j                  | j                        }t	        j                  || j                        }t	        j                  ||fd      j                  |j                        }|j                         d d d d d d f   | _        |j                         d d d d d d f   | _        | j                  | j                  fS )Ndevicer    r!   )r.   rR   rS   rY   r$   rN   type_asrH   outerr%   tor/   r0   rT   )rU   r&   seq_dimensionseq_lentfreqsembs          r)   _update_cos_sin_tablesz&RotaryEmbedding._update_cos_sin_tablese   s    ''-( d***d.>.>.E.E.Q#*D QWW]3AHHEMMdmm\AKK4==1E))UEN366qxx@C"wwytQ)9:D"wwytQ)9:D!1!111r+   qkreturnc                 .   | j                  |d      \  | _        | _        t        || j                  | j                        j	                  |j
                        t        || j                  | j                        j	                  |j
                        fS )Nr-   )r]   rJ   )rb   rS   rT   r1   r\   rK   )rU   rc   rd   s      r)   forwardzRotaryEmbedding.forwardu   s    -1-H-HZ\-H-]*$* !D$4$4d6F6FGJJQRQXQXJY D$4$4d6F6FGJJQRQXQXJY
 	
r+   )r   )__name__
__module____qualname____doc__r$   Tensor__annotations__intrM   rb   tuplerg   __classcell__rV   s   @r)   rG   rG   Q   sY     ll	 C 	 2 
 
%,, 
5u||A[;\ 
r+   rG   c                   8     e Zd ZdZ	 	 ddedef fdZd Z xZS )EsmContactPredictionHeadzWPerforms symmetrization, apc, and computes a logistic regression on the output featuresin_featureseos_idxc                     t         |           || _        || _        t	        j
                  |d|      | _        t	        j                         | _        y )Nr   )	rL   rM   rt   ru   r   Linear
regressionSigmoid
activation)rU   rt   biasru   rV   s       r)   rM   z!EsmContactPredictionHead.__init__   s@     	&))KD9**,r+   c                 X   |j                  | j                        j                  |      }|j                  d      |j                  d      z  }||d d d d d d d d f   z  }|dd dd df   }|ddd dd f   }|j	                         \  }}}}}|j                  |||z  ||      }|j                  | j                  j                  j                        }t        t        |            }|j                  dddd      }| j                  | j                  |      j                  d            S )Nr   r   .r    r   r
   )neru   r\   	unsqueezesizeviewrx   weightrY   rE   r;   permuterz   squeeze)	rU   tokens
attentionseos_mask
batch_sizelayersheadsseqlen_s	            r)   rg   z EsmContactPredictionHead.forward   s!   99T\\*--j9%%a(8+=+=a+@@(1dD!Q+>"??
SbS#2#.
QR,
/9/@,
FE61__Z%P
  ]]OO""))

 -Z
-CD
''1a3
tz:BB1EFFr+   )Tr   )rh   ri   rj   rk   rn   rM   rg   rp   rq   s   @r)   rs   rs   ~   s+    a
 	
'
' 	
'Gr+   rs   c                   8     e Zd ZdZ fdZ	 	 	 	 ddZd Z xZS )EsmEmbeddingszV
    Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
    c                    t         |           t        j                  |j                  |j
                  |j                        | _        |j                  r1t        j                  |j
                  |j                        | _        nd | _        t        j                  |j                        | _        t        |dd      | _        | j#                  dt%        j&                  |j(                        j+                  d      d       |j                  | _        | j                   dk(  r;t        j                  |j(                  |j
                  | j,                        | _        |j0                  | _        |j2                  | _        y )	N)padding_idxepsposition_embedding_typeabsoluteposition_ids)r   r    F)
persistent)rL   rM   r   	Embedding
vocab_sizehidden_sizepad_token_idword_embeddingsemb_layer_norm_before	LayerNormlayer_norm_eps
layer_normDropouthidden_dropout_probdropoutgetattrr   rQ   r$   rN   max_position_embeddingsexpandr   position_embeddingstoken_dropoutmask_token_idrU   configrV   s     r)   rM   zEsmEmbeddings.__init__   s*   !||F,=,=v?Q?Q_e_r_rs'' ll6+=+=6CXCXYDO"DOzz&"<"<='.v7PR\']$ELL)G)GHOOPWXej 	 	
 "..'':5')||..0B0BPTP`P`(D$ $11#11r+   c                    |*|t        || j                        }n| j                  |      }|| j                  |      }|}| j                  r||j                  || j                  k(  j                  d      d      }d}||j                  d      n|j                  d   }|| j                  k(  j                  d      j                         |z  }|d|z
  z  d|z
  d d d d f   z  j                  |j                        }| j                  dk(  r| j                  |      }	||	z   }| j                  | j                  |      }|-||j                  d      z  j                  |j                        }|S )Nr            gQ?r   r   )"create_position_ids_from_input_idsr   &create_position_ids_from_inputs_embedsr   r   masked_fillr   r~   r>   r.   rP   r\   rK   r   r   r   )
rU   	input_idsattention_maskr   inputs_embeds
embeddingsmask_ratio_trainsrc_lengthsmask_ratio_observedr   s
             r)   rg   zEsmEmbeddings.forward   s    $A)TM]M]^#JJ=Y  00;M #
 )"7#//d>P>P1P0[0[\^0_adeJ)4B4N.,,R0T]TcTcdeTfK#,0B0B#B"G"G"K"Q"Q"SVa"a$,<(<=EXAXZ[]acgZg@hhll  J '':5"&":":<"H#&99J??&4J%$~'?'?'CCGG
HXHXYJ r+   c                    |j                         dd }|d   }t        j                  | j                  dz   || j                  z   dz   t        j                  |j
                        }|j                  d      j                  |      S )z
        We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.

        Args:
            inputs_embeds: torch.Tensor

        Returns: torch.Tensor
        Nr    r   rK   rY   r   )r   r$   rN   r   longrY   r~   r   )rU   r   input_shapesequence_lengthr   s        r)   r   z4EsmEmbeddings.create_position_ids_from_inputs_embeds   s     $((*3B/%a.||q /D4D4D"Dq"HPUPZPZcpcwcw
 %%a(//<<r+   NNNN)rh   ri   rj   rk   rM   rg   r   rp   rq   s   @r)   r   r      s&    22 /b=r+   r   modulequerykeyvaluer   scalingr   	head_maskkwargsc                    t        j                  ||j                  dd            |z  }	t        | d      rN| j                  dv r?|j
                  d   }
t        j                  |
t         j                  |	j                        j                  dd      }t        j                  |
t         j                  |	j                        j                  dd      }||z
  }| j                  || j                  z   dz
        }|j                  |j                        }| j                  d	k(  rt        j                  d
||      }nB| j                  dk(  r3t        j                  d
||      }t        j                  d||      }||z   }|	z   }	|#|d d d d d d d |j
                  d   f   }|	|z   }	t        j                   j#                  |	dt         j$                        j                  |j                        }	t        j                   j'                  |	|| j(                        }	||	|z  }	t        j                  |	|      }|j                  dd      j+                         }||	fS )Nr   r
   r   relative_keyrelative_key_queryr   r    r   rJ   r   zbhld,lrd->bhlrr   zbhrd,lrd->bhlrr-   )r"   rK   )ptraining)r$   matmulr:   hasattrr   r.   rN   r   rY   r   distance_embeddingr   r\   rK   einsumr   
functionalsoftmaxfloat32r   r   
contiguous)r   r   r   r   r   r   r   r   r   attn_weights
seq_lengthposition_ids_lposition_ids_rdistancepositional_embeddingrelative_position_scoresrelative_position_scores_queryrelative_position_scores_keycausal_maskattn_outputs                       r)   eager_attention_forwardr      s#    <<s}}Q':;gELv01f6T6T Y 7 [[^
j

<K^K^_ddegijkj

<K^K^_ddefhjk!N2%88FDbDb9bef9fg366U[[6I))^;',||4DeMa'b$++/CC-2\\:JESg-h*+0<<8H#Oc+d('EHd'd$#&>>!$Q1o		"o%=>#k1==((2U]](SVVW\WbWbcL==((6??([L#i/,,|U3K''1-88:K$$r+   c                        e Zd Zd
 fd	Z	 	 	 	 ddej
                  deej                     deej                     deej                     deej                     dee	   de
ej
                     fd	Z xZS )EsmSelfAttentionc                 v   t         |           || _        |j                  |j                  z  dk7  r2t        |d      s&t        d|j                   d|j                   d      |j                  | _        t        |j                  |j                  z        | _        | j                  | j                  z  | _	        t        j                  |j                  | j                        | _        t        j                  |j                  | j                        | _        t        j                  |j                  | j                        | _        |j                  | _        |xs t#        |dd      | _        d | _        | j$                  dk(  s| j$                  d	k(  rG|j(                  | _        t        j*                  d
|j(                  z  dz
  | j                        | _        n*| j$                  dk(  rt/        | j                        | _        d| _        |j2                  | _        || _        | j2                  xr | | _        y )Nr   embedding_sizezThe hidden size (z6) is not a multiple of the number of attention heads ()r   r   r   r   r   r   rotaryr!   r3   )rL   rM   r   r   num_attention_headsr   
ValueErrorrn   attention_head_sizeall_head_sizer   rw   r   r   r   attention_probs_dropout_probr   r   r   rotary_embeddingsr   r   r   rG   r   
is_decoder	layer_idx	is_causal)rU   r   r   r   is_cross_attentionrV   s        r)   rM   zEsmSelfAttention.__init__3  s    : ::a?PVXhHi#F$6$6#7 8 445Q8 
 $*#=#= #&v'9'9F<V<V'V#W !558P8PPYYv1143E3EF
99V//1C1CDYYv1143E3EF
::'> (
'-zC
$ "&''>9T=Y=Y]q=q+1+I+ID(&(ll1v7U7U3UXY3Y[_[s[s&tD#))X5%49Q9Q%RD" ++"C1C-Cr+   hidden_statesr   r   encoder_hidden_statesencoder_attention_maskr   re   c                    |j                   d d \  }}||d| j                  f}	| j                  |      j                  |	      j	                  dd      }
|d u}|r|n|}|r|n|}| j                  |      j                  |	      j	                  dd      }| j                  |      j                  |	      j	                  dd      }|
| j                  dz  z  }
| j                  dk(  r| j                  |
|      \  }
}t        }| j                  j                  dk7  r[| j                  dv r0t        d| j                  j                   d	| j                   d
      t        | j                  j                     } || |
|||f| j                  sdn| j                  | j                   |d|\  }}|j#                  ||d      j%                         }||fS )Nr    r   r   g      r   eagerr   zESM z attention does not support z^ embeddings. Set attention explicitly to 'eager' with `model.set_attn_implementation('eager')`r   )r   r   r   )r.   r   r   r   r:   r   r   r   r   r   r   _attn_implementationr   r   r   r   r   reshaper   )rU   r   r   r   r   r   r   r   r   hidden_shapequery_layerr   current_states	key_layervalue_layerattention_interfacer   r   s                     r)   rg   zEsmSelfAttention.forwardU  s    "/!4!4Sb!9
J"JD4L4LMjj/44\BLLQPQR2$>2D.-3E/>HH^,11,?II!QO	jj055lCMMaQRS "D$<$<d$BB''83%)%;%;K%S"K(?;;++w6++/UU 4;;;;<<XY]YuYuXv wh h  #:$++:Z:Z"[$7
%
  $}}C$,,LL
%
 
%
!\ "))*j"EPPRL((r+   )NNFr   )rh   ri   rj   rM   r$   rl   r   FloatTensorr   r   ro   rg   rp   rq   s   @r)   r   r   2  s     DJ 7;15=A>B3)||3) !!2!233) E--.	3)
  ((9(9:3) !)):): ;3) +,3) 
u||	3)r+   r   c                   $     e Zd Z fdZd Z xZS )EsmSelfOutputc                     t         |           t        j                  |j                  |j                        | _        t        j                  |j                        | _        y N)	rL   rM   r   rw   r   denser   r   r   r   s     r)   rM   zEsmSelfOutput.__init__  sB    YYv1163E3EF
zz&"<"<=r+   c                 T    | j                  |      }| j                  |      }||z   }|S r   r   r   rU   r   input_tensors      r)   rg   zEsmSelfOutput.forward  .    

=1]3%4r+   rh   ri   rj   rM   rg   rp   rq   s   @r)   r   r         >
r+   r   c                   B     e Zd Zd fd	Zd Z	 	 	 	 ddee   fdZ xZS )EsmAttentionc                     t         |           t        |||      | _        t	        |      | _        t               | _        t        j                  |j                  |j                        | _	        y )N)r   r   r   )rL   rM   r   rU   r   outputsetpruned_headsr   r   r   r   )rU   r   r   r   rV   s       r)   rM   zEsmAttention.__init__  sS    $VyUgh	#F+Ef&8&8f>S>STr+   c                 >   t        |      dk(  ry t        || j                  j                  | j                  j                  | j
                        \  }}t        | j                  j                  |      | j                  _        t        | j                  j                  |      | j                  _        t        | j                  j                  |      | j                  _	        t        | j                  j                  |d      | j                  _        | j                  j                  t        |      z
  | j                  _        | j                  j                  | j                  j                  z  | j                  _        | j
                  j                  |      | _        y )Nr   r   r!   )lenr   rU   r   r   r  r   r   r   r   r
  r   r   union)rU   r   indexs      r)   prune_headszEsmAttention.prune_heads  s   u:?749900$))2O2OQUQbQb
u
 -TYY__eD		*499==%@		,TYY__eD		.t{{/@/@%QO )-		(E(EE
(R		%"&))"?"?$))B_B_"_		 --33E:r+   r   c                     | j                  |      } | j                  |f||||d|\  }}	| j                  ||      }|S )Nr   r   r   r   )r   rU   r
  )
rU   r   r   r   r   r   r   hidden_states_lnr   r   s
             r)   rg   zEsmAttention.forward  s]      >>-8"
)"7#9
 
Q kk+}=r+   )NFr   )	rh   ri   rj   rM   r  r   r   rg   rp   rq   s   @r)   r  r    s1    U;* "# +,r+   r  c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )EsmIntermediatec                     t         |           t        j                  |j                  |j
                        | _        y r   )rL   rM   r   rw   r   intermediate_sizer   r   s     r)   rM   zEsmIntermediate.__init__  s,    YYv1163K3KL
r+   r   re   c                 >    | j                  |      }t        |      }|S r   )r   r8   )rU   r   s     r)   rg   zEsmIntermediate.forward  s     

=1]+r+   rh   ri   rj   rM   r$   rl   rg   rp   rq   s   @r)   r  r    s$    MU\\ ell r+   r  c                   $     e Zd Z fdZd Z xZS )	EsmOutputc                     t         |           t        j                  |j                  |j
                        | _        t        j                  |j                        | _	        y r   )
rL   rM   r   rw   r  r   r   r   r   r   r   s     r)   rM   zEsmOutput.__init__  sB    YYv779K9KL
zz&"<"<=r+   c                 T    | j                  |      }| j                  |      }||z   }|S r   r  r  s      r)   rg   zEsmOutput.forward  r  r+   r  rq   s   @r)   r  r    r  r+   r  c                   @     e Zd Z fdZ	 	 	 	 ddee   fdZd Z xZS )EsmLayerc                    t         |           |j                  | _        d| _        t	        |      | _        |j                  | _        |j                  | _        | j                  r,| j                  st        |  d      t	        |d      | _	        t        |      | _        t        |      | _        t        j                  |j                   |j"                        | _        y )Nr   z> should be used as a decoder model if cross attention is addedT)r   r   )rL   rM   chunk_size_feed_forwardseq_len_dimr  	attentionr   add_cross_attentionRuntimeErrorcrossattentionr  intermediater  r
  r   r   r   r   r   s     r)   rM   zEsmLayer.__init__  s    '-'E'E$%f- ++#)#=#= ##??"dV+i#jkk".v$"OD+F3'f&8&8f>S>STr+   r   c                      | j                   |f||d|}| j                  r5|3t        | d      st        d|  d       | j                  |f||||d|}| j                  |      }|S )N)r   r   r'  z'If `encoder_hidden_states` are passed, z` has to be instantiated with cross-attention layers by setting `config.add_cross_attention=True`r  )r$  r   r   AttributeErrorr'  feed_forward_chunk)	rU   r   r   r   r   r   r   attention_outputlayer_outputs	            r)   rg   zEsmLayer.forward  s     *4>>
)
 	
 ??4@4!12$=dV D` ` 
  3t22  -#&;'=    ../?@r+   c                 n    | j                  |      }| j                  |      }| j                  ||      }|S r   )r   r(  r
  )rU   r,  attention_output_lnintermediate_outputr-  s        r)   r+  zEsmLayer.feed_forward_chunk  s<    "nn-=>"//0CD{{#68HIr+   r   )	rh   ri   rj   rM   r   r   rg   r+  rp   rq   s   @r)   r   r     s2    U$ "#! +,!Fr+   r   c                   D     e Zd Z fdZe	 	 	 	 ddee   fd       Z xZS )
EsmEncoderc                 0   t         |           || _        t        j                  t        |j                        D cg c]  }t        |       c}      | _        t        j                  |j                  |j                        | _        d| _        y c c}w )Nr   F)rL   rM   r   r   
ModuleListrangenum_hidden_layersr   layerr   r   r   emb_layer_norm_aftergradient_checkpointing)rU   r   r   rV   s      r)   rM   zEsmEncoder.__init__  sm    ]]eFD\D\>]#^HV$4#^_
$&LL1C1CI^I^$_!&+# $_s   Br   c           	          t        | j                        D ]  \  }}|||   nd }	 ||f||	||d|} | j                  r| j                  |      }t        |      S )Nr  )last_hidden_state)	enumerater7  r8  r   )
rU   r   r   r   r   r   r   ilayer_modulelayer_head_masks
             r)   rg   zEsmEncoder.forward"  s~      )4 		OA|.7.CilO(-)&;'= M		 $$ 55mDM1MRRr+   r   )	rh   ri   rj   rM   r   r   r   rg   rp   rq   s   @r)   r2  r2    s=    ,  "#S +,S Sr+   r2  c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )	EsmPoolerc                     t         |           t        j                  |j                  |j                        | _        t        j                         | _        y r   )rL   rM   r   rw   r   r   Tanhrz   r   s     r)   rM   zEsmPooler.__init__?  s9    YYv1163E3EF
'')r+   r   re   c                 \    |d d df   }| j                  |      }| j                  |      }|S Nr   )r   rz   )rU   r   first_token_tensorpooled_outputs       r)   rg   zEsmPooler.forwardD  s6     +1a40

#566r+   r  rq   s   @r)   rA  rA  >  s#    $
U\\ ell r+   rA  c                   ~    e Zd ZU eed<   dZdZg dZdgZdZ	dZ
dZdZe eedd      g eedd	      gd
Zd Zd Zy)EsmPreTrainedModelr   esmT)r   #EsmFoldTriangularSelfAttentionBlockr   zposition_embeddings.weightr   r$  )r  
layer_namer'  )r   r   cross_attentionsc                 l   t        |t        j                        rm|j                  j                  j                  d| j                  j                         |j                  %|j                  j                  j                          yyt        |t        j                        rz|j                  j                  j                  d| j                  j                         |j                  2|j                  j                  |j                     j                          yyt        |t        j                        rJ|j                  j                  j                          |j                  j                  j                  d       yt        |t              r%|j                  j                  j                          yy)zInitialize the weightsr   )meanstdNr3   )
isinstancer   rw   r   datanormal_r   initializer_ranger{   zero_r   r   r   fill_	EsmLMHead)rU   r   s     r)   _init_weightsz EsmPreTrainedModel._init_weightsb  s&   fbii( MM&&CT[[5R5R&S{{&  &&( '-MM&&CT[[5R5R&S!!-""6#5#56<<> .-KK""$MM$$S)	*KK""$ +r+   c                      y r    rU   s    r)   get_output_embeddingsz(EsmPreTrainedModel.get_output_embeddingst  s     r+   N)rh   ri   rj   r   rm   base_model_prefixsupports_gradient_checkpointing_no_split_modules"_keys_to_ignore_on_load_unexpected_supports_flash_attn_supports_sdpa_supports_flex_attn_supports_attention_backendr   r   r   _can_record_outputsrX  r\  rZ  r+   r)   rI  rI  M  st    &*#\*F)G&N"& "%&6aKXY+1AQR
%$r+   rI  c                   l    e Zd ZdZd fd	Zd Zd Zd Zee		 	 	 	 	 	 	 dde
ej                     de
ej                     de
ej                     d	e
ej                     d
e
ej                     de
ej                     de
ej                     dee   deeej                     ef   fd              Zd Z xZS )EsmModela  

    The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
    cross-attention is added between the self-attention layers, following the architecture described in [Attention is
    all you need](https://huggingface.co/papers/1706.03762) by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit,
    Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin.

    To behave as an decoder the model needs to be initialized with the `is_decoder` argument of the configuration set
    to `True`. To be used in a Seq2Seq model, the model needs to initialized with both `is_decoder` argument and
    `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass.
    c                    t         |   |       || _        t        |      | _        t        |      | _        |rt        |      nd| _        t        |j                  |j                  z  d      | _        | j                          y)zv
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        NT)rt   r{   )rL   rM   r   r   r   r2  encoderrA  poolerrs   r6  r   contact_head	post_init)rU   r   add_pooling_layerrV   s      r)   rM   zEsmModel.__init__  sq    
 	 '/!&)+<i'$40063M3MMTX

 	r+   c                 .    | j                   j                  S r   r   r   r[  s    r)   get_input_embeddingszEsmModel.get_input_embeddings  s    ...r+   c                 &    || j                   _        y r   ro  )rU   r   s     r)   set_input_embeddingszEsmModel.set_input_embeddings  s    */'r+   c                     |j                         D ]7  \  }}| j                  j                  |   j                  j	                  |       9 y)z
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        N)itemsri  r7  r$  r  )rU   heads_to_pruner7  r   s       r)   _prune_headszEsmModel._prune_heads  sE    
 +002 	CLE5LLu%//;;EB	Cr+   r   r   r   r   r   r   r   r   re   c                    |du |duz  rt        d      || j                  ||      }| j                  j                  dk7  rL|j                  dd \  }	}
|#t        j                  |	|
f|j                        }| j                  ||	|
f      }| j                  j                  rO|M|j                         \  }}}||f}|!t        j                  ||j                        }| j                  |      }nd}| j                  || j                  j                        } | j                  |f||||d|}|d	   }| j                  | j                  |      nd}t!        ||
      S )aV  
        input_ids (`torch.LongTensor` of shape `((batch_size, sequence_length))`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        position_ids (`torch.LongTensor` of shape `((batch_size, sequence_length))`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        inputs_embeds (`torch.FloatTensor` of shape `((batch_size, sequence_length), hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
        Nz:You must specify exactly one of input_ids or inputs_embeds)r   r   flash_attention_2r    rX   )r   r  r   )r;  pooler_output)r   r   r   r   r.   r$   onesrY   get_extended_attention_maskr   r   invert_attention_maskget_head_maskr6  ri  rj  r   )rU   r   r   r   r   r   r   r   r   r   r   encoder_batch_sizeencoder_sequence_lengthr   encoder_hidden_shapeencoder_extended_attention_maskencoder_outputssequence_outputrG  s                      r)   rg   zEsmModel.forward  s   > -t";<YZZ  OO#) , M
 ;;++/BB%2%8%8"%="J
%!&j*-E}OcOc!d+/+K+KZ,D ,L ,N ;;!!&;&G=R=W=W=Y: 7$68O#P %-).4HQ^QeQe)f&.2.H.HI_.`+.2+ &&y$++2O2OP	&$,,
)"7#B
 
 *!,8<8OO4UY;-'
 	
r+   c                 H    | ||dd      j                   }t        j                  |d      }||j                  d      j                  d      j                  d      z  }||j                  d      j                  d      j                  d      z  }| j	                  ||      S )NT)r   return_dictoutput_attentionsr   r!   r   r
      )r   r$   stackr~   rk  )rU   r   r   attnss       r)   predict_contactszEsmModel.predict_contacts  s    VN`deppEq)
 	))!,66q9CCAFF))!,66q9CCAFF  //r+   )T)NNNNNNN)rh   ri   rj   rk   rM   rp  rr  rv  r   r   r   r$   rl   r   r   r   ro   r   rg   r  rp   rq   s   @r)   rg  rg  z  s   
(/0C  -115/3,0048<9=O
ELL)O
 !.O
 u||,	O

 ELL)O
  -O
  (5O
 !) 6O
 +,O
 
uU\\"$PP	QO
  O
b	0r+   rg  c                   l    e Zd ZdgZ fdZd Zd Zee	 	 	 	 	 	 	 	 dde	e
j                     de	e
j                     de	e
j                     de	e
j                     d	e	e
j                     d
e	e
j                     de	e
j                     de	e
j                     dee   deeef   fd              Zd Z xZS )EsmForMaskedLMzlm_head.decoder.weightc                     t         |   |       |j                  rt        j	                  d       t        |d      | _        t        |      | _        | j                          | j                          y )NzjIf you want to use `EsmForMaskedLM` make sure `config.is_decoder=False` for bi-directional self-attention.Frm  )rL   rM   r   loggerwarningrg  rJ  rW  lm_headinit_weightsrl  r   s     r)   rM   zEsmForMaskedLM.__init__  s\     NN1
 Fe< (r+   c                 .    | j                   j                  S r   r  decoderr[  s    r)   r\  z$EsmForMaskedLM.get_output_embeddings  s    ||###r+   c                 &    || j                   _        y r   r  )rU   new_embeddingss     r)   set_output_embeddingsz$EsmForMaskedLM.set_output_embeddings   s    -r+   r   r   r   r   r   r   r   labelsr   re   c	           
      r    | j                   |f||||||d|	}
|
d   }| j                  |      }d}|at               }|j                  |j                        } ||j                  d| j                  j                        |j                  d            }t        |||
j                  |
j                        S )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        )r   r   r   r   r   r   r   Nr    losslogitsr   r   )rJ  r  r   r\   rY   r   r   r   r   r   r   )rU   r   r   r   r   r   r   r   r  r   outputsr  prediction_scoresmasked_lm_lossloss_fcts                  r)   rg   zEsmForMaskedLM.forward#  s    * $((	
)%'"7#9	
 	
 "!* LL9')HYY0778F%&7&<&<RAWAW&XZ`ZeZefhZijN$!//))	
 	
r+   c                 <    | j                   j                  ||      S )N)r   )rJ  r  )rU   r   r   s      r)   r  zEsmForMaskedLM.predict_contactsS  s    xx(((OOr+   )NNNNNNNN)rh   ri   rj   _tied_weights_keysrM   r\  r  r   r   r   r$   
LongTensorrl   r   r   r   r   ro   r   rg   r  rp   rq   s   @r)   r  r  	  s   23 $.  151537,059=A9=-1,
E,,-,
 !.,
 u//0	,

 ELL),
   1 12,
  ((9(9:,
 !) 6,
 ))*,
 +,,
 
un$	%,
  ,
\Pr+   r  c                   (     e Zd ZdZ fdZd Z xZS )rW  z&ESM Head for masked language modeling.c                    t         |           t        j                  |j                  |j                        | _        t        j                  |j                  |j                        | _        t        j                  |j                  |j                  d      | _
        t        j                  t        j                  |j                              | _        y )Nr   F)r{   )rL   rM   r   rw   r   r   r   r   r   r   r  	Parameterr$   zerosr{   r   s     r)   rM   zEsmLMHead.__init__Z  s    YYv1163E3EF
,,v'9'9v?T?TUyy!3!3V5F5FUSLLV->->!?@	r+   c                     | j                  |      }t        |      }| j                  |      }| j                  |      | j                  z   }|S r   )r   r8   r   r  r{   rU   featuresr   r&   s       r)   rg   zEsmLMHead.forwardb  sD    JJx GOOA LLOdii'r+   rh   ri   rj   rk   rM   rg   rp   rq   s   @r)   rW  rW  W  s    0Ar+   rW  z
    ESM Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled
    output) e.g. for GLUE tasks.
    )custom_introc                       e Zd Z fdZee	 	 	 	 	 	 ddeej                     deej                     deej                     deej                     deej                     deej                     dee   d	eeef   fd
              Z xZS )EsmForSequenceClassificationc                     t         |   |       |j                  | _        || _        t	        |d      | _        t        |      | _        | j                          | j                          y NFr  )
rL   rM   
num_labelsr   rg  rJ  EsmClassificationHead
classifierr  rl  r   s     r)   rM   z%EsmForSequenceClassification.__init__s  sT      ++Fe</7r+   r   r   r   r   r   r  r   re   c                     | j                   |f||||d|}|d   }	| j                  |	      }
d}||j                  |
j                        }| j                  j
                  | j                  dk(  rd| j                  _        nl| j                  dkD  rL|j                  t        j                  k(  s|j                  t        j                  k(  rd| j                  _        nd| j                  _        | j                  j
                  dk(  rIt               }| j                  dk(  r& ||
j                         |j                               }n ||
|      }n| j                  j
                  dk(  r=t               } ||
j                  d| j                        |j                  d            }n,| j                  j
                  dk(  rt               } ||
|      }t!        ||
|j"                  |j$                  	      S )
a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        r   r   r   r   r   Nr   rx   single_label_classificationmulti_label_classificationr    r  )rJ  r  r\   rY   r   problem_typer  rK   r$   r   rn   r	   r   r   r   r   r   r   r   rU   r   r   r   r   r   r  r   r  r  r  r  r  s                r)   rg   z$EsmForSequenceClassification.forward  s   & $((
)%'
 
 "!*1YYv}}-F{{''/??a'/;DKK,__q(fllejj.HFLL\a\e\eLe/LDKK,/KDKK,{{''<7"9??a'#FNN$4fnn6FGD#FF3D))-JJ+-B @&++b/R))-II,./'!//))	
 	
r+   NNNNNN)rh   ri   rj   rM   r   r   r   r$   r  rl   r   r   r   r   ro   r   rg   rp   rq   s   @r)   r  r  l  s    
  151537,059-1:
E,,-:
 !.:
 u//0	:

 ELL):
   1 12:
 ))*:
 +,:
 
u..	/:
  :
r+   r  c                       e Zd Z fdZee	 	 	 	 	 	 ddeej                     deej                     deej                     deej                     deej                     deej                     dee   d	eeef   fd
              Z xZS )EsmForTokenClassificationc                 P   t         |   |       |j                  | _        t        |d      | _        t        j                  |j                        | _        t        j                  |j                  |j                        | _        | j                          | j                          y r  )rL   rM   r  rg  rJ  r   r   r   r   rw   r   r  r  rl  r   s     r)   rM   z"EsmForTokenClassification.__init__  su      ++Fe<zz&"<"<=))F$6$68I8IJr+   r   r   r   r   r   r  r   re   c                 |    | j                   |f||||d|}|d   }	| j                  |	      }	| j                  |	      }
d}|Wt               }|j	                  |
j
                        } ||
j                  d| j                        |j                  d            }t        ||
|j                  |j                        S )z
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        r  r   Nr    r  )rJ  r   r  r   r\   rY   r   r  r   r   r   r  s                r)   rg   z!EsmForTokenClassification.forward  s    " $((
)%'
 
 "!*,,71')HYYv}}-FFKKDOO<fkk"oND$!//))	
 	
r+   r  )rh   ri   rj   rM   r   r   r   r$   r  rl   r   r   r   r   ro   r   rg   rp   rq   s   @r)   r  r    s    
  151537,059-1)
E,,-)
 !.)
 u//0	)

 ELL))
   1 12)
 ))*)
 +,)
 
u++	,)
  )
r+   r  c                   (     e Zd ZdZ fdZd Z xZS )r  z-Head for sentence-level classification tasks.c                 &   t         |           t        j                  |j                  |j                        | _        t        j                  |j                        | _        t        j                  |j                  |j                        | _
        y r   )rL   rM   r   rw   r   r   r   r   r   r  out_projr   s     r)   rM   zEsmClassificationHead.__init__  s`    YYv1163E3EF
zz&"<"<=		&"4"4f6G6GHr+   c                     |d d dd d f   }| j                  |      }| j                  |      }t        j                  |      }| j                  |      }| j	                  |      }|S rE  )r   r   r$   tanhr  r  s       r)   rg   zEsmClassificationHead.forward  sY    Q1WLLOJJqMJJqMLLOMM!r+   r  rq   s   @r)   r  r    s    7Ir+   r  c                     | j                  |      j                         }t        j                  |d      j	                  |      |z  }|j                         |z   S )a  
    Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
    are ignored. This is modified from fairseq's `utils.make_positions`.

    Args:
        x: torch.Tensor x:

    Returns: torch.Tensor
    r   r!   )r}   rn   r$   cumsumrZ   r   )r   r   maskincremental_indicess       r)   r   r     sP     <<$((*D,,t3;;DADH##%33r+   )r  r  r  rg  rI  )r   N)Grk   r5   typingr   r   r   r$   torch.utils.checkpointr   torch.nnr   r   r	   modeling_layersr   modeling_outputsr   r   r   r   r   modeling_utilsr   r   processing_utilsr   pytorch_utilsr   r   utilsr   r   r   r   utils.genericr   r   configuration_esmr   
get_loggerrh   r  r*   r1   r8   r;   rE   ModulerG   rs   r   rl   rP   r   r   r   r  r  r  r   r2  rA  rI  rg  r  rW  r  r  r  r   __all__rZ  r+   r)   <module>r     s      , ,    A A 9  G & Q R R ? ( 
		H	%(
.;#
	*
ehhoo *
Z Gryy  GF\=BII \=L (,/%II/%<</% 
/% <<	/%
 U\\*/% /% /% %/% '(/%dV)ryy V)r
BII 
-299 -`bii 
		 
7) 7t S  SH		  ) ) )X K0! K0 K0\ JP' JP JPZ		 * I
#5 I
I
X 8
 2 8
 8
vBII &4 r+   