
    ]h10                        d dl mZ d dlZd dlZd dlmZ d dlmZ d dlZd dlm	Z	m
Z
 d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ  G d d      Z G d d      Z G d de
j4                        Zy)    )annotationsN)Iterable)Any)Tensornn)
functional)SentenceTransformer)CachedGISTEmbedLoss)"CachedMultipleNegativesRankingLoss)+CachedMultipleNegativesSymmetricRankingLoss)Transformerc                  @    e Zd ZdZd	dZd
dZddZddZddZddZ	y)TransformerDecoratorz
    Decorator that caches the embeddings of all layers of the transformer.
    When `layer_idx` is set, it returns the cached embeddings of that layer instead.

    This is meant to override the forward function of the Transformer.
    c                f    || _         || _        g | _        g | _        g | _        d | _        d| _        y Nr   )transformeroriginal_forward
embeddingslast_embeddingsfeatures	layer_idxcall_idx)selfr   r   s      l/var/www/html/eduruby.in/venv/lib/python3.12/site-packages/sentence_transformers/losses/AdaptiveLayerLoss.py__init__zTransformerDecorator.__init__   s6    & 0/1-/13    c                     || _         d| _        y r   )r   r   )r   r   s     r   set_layer_idxz"TransformerDecorator.set_layer_idx&   s    "r   c                    t        j                  | j                  D cg c]  }|| j                      c}d      S c c}w )N   dim)torchconcatr   r   )r   	embeddings     r   get_layer_embeddingsz)TransformerDecorator.get_layer_embeddings*   s/    ||X9Yt~~6X^_``Xs   =c                    | j                   | j                  |      }|S | j                  |      }| xj                  dz  c_        |S )Nr    )r   call_grow_cachecall_use_cacher   r   r   outputs      r   __call__zTransformerDecorator.__call__-   sH    >>!))(3F  ((2FMMQMr   c                T   | j                   j                  j                  j                  }d| j                   j                  j                  _        | j	                  |      }t        |d         dz
  | _        | j                  j                  |d   dd        | j                  j                  |d          | j                  j                  |j                         D ci c]  \  }}|dvs|| c}}       || j                   j                  j                  _        |r|d= |S c c}}w )z
        Temporarily sets the output_hidden_states to True, runs the model, and then restores the original setting.
        Use the all_layer_embeddings to get the embeddings of all layers.
        Tall_layer_embeddingsr    token_embeddings)r.   r0   )r   
auto_modelconfigoutput_hidden_statesr   len
num_layersr   appendr   r   items)r   r   original_output_hidden_statesr+   keyvalues         r   r(   z$TransformerDecorator.call_grow_cache5   s	   
 )-(8(8(C(C(J(J(_(_%BF##**?&&x0 f%;<=Av&<=aCD##F+=$>?*0,,.tJCCGs<sS%Zt	

 C`##**?(-. us   D$
+D$
c                    i | j                   | j                     d| j                  | j                     | j                     iS )Nr0   )r   r   r   r   )r   r   s     r   r)   z#TransformerDecorator.call_use_cacheO   s>    s$--.s0BDOOTXTaTaDbcgcqcqDrssr   N)r   r   returnNoner<   r=   r<   r   )r<   dict[str, Tensor]r   r@   r<   r@   )
__name__
__module____qualname____doc__r   r   r&   r,   r(   r)    r   r   r   r      s'    a4tr   r   c                  (    e Zd ZdZddZddZddZy)	ForwardDecoratorz
    Decorator that caches the embeddings after all modules (e.g. pooling) of the model.
    Required to get the embeddings after all modules for the KL-divergence loss.

    This is meant to override the forward function of the SentenceTransformer.
    c                     || _         g | _        y )N)fnr   )r   rJ   s     r   r   zForwardDecorator.__init__[   s    r   c                d    | j                  |      }| j                  j                  |d          |S )Nsentence_embedding)rJ   r   r6   r*   s      r   r,   zForwardDecorator.__call___   s-    "v&:;<r   c                V    t        j                  | j                  d      }g | _        |S )Nr   r!   )r#   r$   r   )r   r   s     r   get_embeddingszForwardDecorator.get_embeddingsd   s#    \\$//q9
r   Nr>   rA   r?   )rB   rC   rD   rE   r   r,   rN   rF   r   r   rH   rH   S   s    
r   rH   c                  l     e Zd Z	 	 	 	 	 d	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d fdZddZddZed	d       Z xZS )
AdaptiveLayerLossc                X   t         |           || _        || _        || _        || _        || _        || _        || _        t        | j                  d   t              sJ t        |t        t        t        f      r0t        j                  d|j                   j"                   dd       yy)a  
        The AdaptiveLayerLoss can be seen as a loss *modifier* that allows you to use other loss functions at non-final
        layers of the Sentence Transformer model. This is useful for when you want to train a model where users have
        the option to lower the number of layers used to improve their inference speed and memory usage.

        Args:
            model: SentenceTransformer model
            loss: The loss function to be used, e.g.
                :class:`MultipleNegativesRankingLoss`,
                :class:`CoSENTLoss`, etc.
            n_layers_per_step: The number of layers to use per step. If
                -1, then all layers are used. If > 0, then a random
                sample of `n_layers_per_step` layers are used per step,
                separate from the final layer, which is always used. The
                2DMSE paper uses `n_layers_per_step=1`. The default
                value is 1.
            last_layer_weight: The weight to use for the loss of the
                final layer. Increase this to focus more on the
                performance when using all layers. The default value is
                1.0.
            prior_layers_weight: The weight to use for the loss of the
                prior layers. Increase this to focus more on the
                performance when using fewer layers. The default value
                is 1.0.
            kl_div_weight: The weight to use for the KL-divergence loss
                that is used to make the prior layers match that of the
                last layer. Increase this to focus more on the
                performance when using fewer layers. The default value
                is 1.0.
            kl_temperature: The temperature to use for the KL-divergence
                loss. If 0, then the KL-divergence loss is not used. The
                default value is 1.0.

        References:
            - The concept was inspired by the 2DMSE paper: https://arxiv.org/abs/2402.14776
            - `Adaptive Layers <../../../examples/sentence_transformer/training/adaptive_layer/README.html>`_

        Requirements:
            1. The base loss cannot be :class:`CachedMultipleNegativesRankingLoss`,
               :class:`CachedMultipleNegativesSymmetricRankingLoss`, or :class:`CachedGISTEmbedLoss`.

        Inputs:
            +---------------------------------------+--------+
            | Texts                                 | Labels |
            +=======================================+========+
            | any                                   | any    |
            +---------------------------------------+--------+

        Relations:
            - :class:`Matryoshka2dLoss` uses this loss in combination with :class:`MatryoshkaLoss` which allows for
                output dimensionality reduction for faster downstream tasks (e.g. retrieval).

        Example:
            ::

                from sentence_transformers import SentenceTransformer, SentenceTransformerTrainer, losses
                from datasets import Dataset

                model = SentenceTransformer("microsoft/mpnet-base")
                train_dataset = Dataset.from_dict({
                    "anchor": ["It's nice weather outside today.", "He drove to work."],
                    "positive": ["It's so sunny.", "He took the car to the office."],
                })
                loss = losses.MultipleNegativesRankingLoss(model=model)
                loss = losses.AdaptiveLayerLoss(model, loss)

                trainer = SentenceTransformerTrainer(
                    model=model,
                    train_dataset=train_dataset,
                    loss=loss,
                )
                trainer.train()
        r   z&MatryoshkaLoss is not compatible with .   )
stacklevelN)superr   modellossn_layers_per_steplast_layer_weightprior_layers_weightkl_div_weightkl_temperature
isinstancer   r   r   r
   warningswarn	__class__rB   )	r   rV   rW   rX   rY   rZ   r[   r\   r`   s	           r   r   zAdaptiveLayerLoss.__init__k   s    f 	
	!2!2#6 *,$**Q-555/1\^qr
 MMB4>>CZCZB[[\]jkl	
r   c                H   | j                   d   j                  }t        | j                   d   |      }|| j                   d   _        | j                   j                  }t        |      }|| j                   _        | j	                  ||      | j
                  z  }| j                  dkD  r4|j                         }t        j                  || j                  z  d      }|j                  }	t        |	dz
        }
| j                  dkD  r2| j                  |	dz
  k  r t        j                  |
| j                        }
|
D ]  }|j                  |       | j	                  ||      }||d|z   z  t!        |
      z  | j"                  z  z   }| j                  dkD  sZ|j                         }t        j$                  t        j&                  || j                  z  d      d      }||| j                  z  | j(                  z  z   } || j                   d   _        || j                   _        |S )Nr   r/   r!   r    	batchmean)	reduction)rV   forwardr   rH   rW   rY   r\   rN   Fsoftmaxr5   rangerX   randomsampler   r4   rZ   kl_divlog_softmaxr[   )r   sentence_featureslabelsoriginal_transformer_forwardtransformer_decoratorr   forward_decoratorrW   final_embeddingsr5   layer_indicesr   
layer_lossr   kl_div_losss                  r   rd   zAdaptiveLayerLoss.forward   s   '+zz!}'<'<$ 4TZZ]D` a 5

1  ::--,-=>.


 yy*F3d6L6LL"0??A yy)9D<O<O)OUWX*55
j1n-!!A%$*@*@:PQ>*Q"MM-9O9OPM ' 	UI!//	:#4f=J*I6]9KKdNfNfffD ""Q&.==?
hhMM*t/B/B"BK$)
 kD,?,??$BTBTTT	U" !=

1-

r   c                    | j                   j                  j                  | j                  | j                  | j
                  | j                  | j                  dS )N)rW   rX   rY   rZ   r[   r\   )rW   r`   rB   rX   rY   rZ   r[   r\   r   s    r   get_config_dictz!AdaptiveLayerLoss.get_config_dict   sL    II''00!%!7!7!%!7!7#'#;#;!//"11
 	
r   c                     y)Nz
@misc{li20242d,
    title={2D Matryoshka Sentence Embeddings},
    author={Xianming Li and Zongxi Li and Jing Li and Haoran Xie and Qing Li},
    year={2024},
    eprint={2402.14776},
    archivePrefix={arXiv},
    primaryClass={cs.CL}
}
rF   rv   s    r   citationzAdaptiveLayerLoss.citation  s    	r   )r          ?rz   rz   g333333?)rV   r	   rW   z	nn.ModulerX   intrY   floatrZ   r|   r[   r|   r\   r|   r<   r=   )rl   zIterable[dict[str, Tensor]]rm   r   r<   r   )r<   zdict[str, Any])r<   str)	rB   rC   rD   r   rd   rw   propertyry   __classcell__)r`   s   @r   rP   rP   j   s    
 "##&%(" #`m"`m `m 	`m
 !`m #`m `m `m 
`mD-^
 
 
r   rP   )
__future__r   rh   r^   collections.abcr   typingr   r#   r   r   torch.nnr   re   sentence_transformersr	   0sentence_transformers.losses.CachedGISTEmbedLossr
   ?sentence_transformers.losses.CachedMultipleNegativesRankingLossr   Hsentence_transformers.losses.CachedMultipleNegativesSymmetricRankingLossr   sentence_transformers.modelsr   r   rH   ModulerP   rF   r   r   <module>r      sY    "   $    $ 5 P n 5;t ;t| .g		 gr   