
    h                    N    d dl mZ d dlmZmZmZ d dlmZmZm	Z	  G d de      Z
y)    )annotations)AnyOptionalcast)TextSplitter	Tokenizersplit_text_on_tokensc                  v     e Zd ZU dZ	 	 	 d
	 	 	 	 	 	 	 	 	 d fdZ	 	 	 	 ddZddZddZdZde	d<   dd	Z
 xZS )%SentenceTransformersTokenTextSplitterz8Splitting text to tokens using sentence model tokenizer.intc                   t        |   di |d|i 	 ddlm} || _         || j
                        | _        | j                  j                  | _        | j                  |       y# t        $ r}d}t	        |      |d}~ww xY w)zCreate a new TextSplitter.chunk_overlapr   )SentenceTransformerzCould not import sentence_transformers python package. This is needed in order to for SentenceTransformersTokenTextSplitter. Please install it with `pip install sentence-transformers`.N)tokens_per_chunk )	super__init__sentence_transformersr   ImportError
model_name_model	tokenizer_initialize_chunk_configuration)	selfr   r   r   kwargsr   errmsg	__class__s	           l/var/www/html/eduruby.in/venv/lib/python3.12/site-packages/langchain_text_splitters/sentence_transformers.pyr   z.SentenceTransformersTokenTextSplitter.__init__   s     	?6??	,A %)$//:..,,>N,O  	,N 
 c"+	,s   A( (	B1A??Bc                  | j                   j                  | _        || j                  | _        n|| _        | j                  | j                  kD  r5d| j                   d| j                   d| j                   d}t        |      y )NzThe token limit of the models 'z' is: z. Argument tokens_per_chunk=z > maximum token limit.)r   max_seq_lengthmaximum_tokens_per_chunkr   r   
ValueError)r   r   r   s      r   r   zESentenceTransformersTokenTextSplitter._initialize_chunk_configuration$   s     )-(B(B%#$($A$AD!$4D!  4#@#@@1$//1B C556 7..2.C.C-D)+  S/! A    c                     d fd}t         j                   j                   j                  j                  |      }t        ||      S )a  Splits the input text into smaller components by splitting text on tokens.

        This method encodes the input text using a private `_encode` method, then
        strips the start and stop token IDs from the encoded result. It returns the
        processed segments as a list of strings.

        Args:
            text (str): The input text to be split.

        Returns:
            List[str]: A list of string components derived from the input text after
            encoding and processing.
        c                ,    j                  |       dd S )N   )_encode)textr   s    r   %encode_strip_start_and_stop_token_idsz_SentenceTransformersTokenTextSplitter.split_text.<locals>.encode_strip_start_and_stop_token_idsF   s    <<%a++r$   )r   r   decodeencode)r*   r   r*   strreturn	list[int])r   _chunk_overlapr   r   r,   r	   )r   r*   r+   r   s   `   r   
split_textz0SentenceTransformersTokenTextSplitter.split_text7   sE    	, --!22>>((8	
	 $CCr$   c               6    t        | j                  |            S )ay  Counts the number of tokens in the given text.

        This method encodes the input text using a private `_encode` method and
        calculates the total number of tokens in the encoded result.

        Args:
            text (str): The input text for which the token count is calculated.

        Returns:
            int: The number of tokens in the encoded text.
        )lenr)   )r   r*   s     r   count_tokensz2SentenceTransformersTokenTextSplitter.count_tokensR   s     4<<%&&r$   l         _max_length_equal_32_bit_integerc                j    | j                   j                  || j                  d      }t        d|      S )Ndo_not_truncate)
max_length
truncationr1   )r   r-   r7   r   )r   r*   &token_ids_with_start_and_end_token_idss      r   r)   z-SentenceTransformersTokenTextSplitter._encodeb   s<    151F1F<<( 2G 2
.
 K!GHHr$   )2   z'sentence-transformers/all-mpnet-base-v2N)
r   r   r   r/   r   Optional[int]r   r   r0   None)r   r>   r0   r?   )r*   r/   r0   z	list[str])r*   r/   r0   r   r.   )__name__
__module____qualname____doc__r   r   r3   r6   r7   __annotations__r)   __classcell__)r   s   @r   r   r      s    B  C*.	PP P (	P
 P 
P2"#0"	"&D6' -2$c1Ir$   r   N)
__future__r   typingr   r   r   langchain_text_splitters.baser   r   r	   r   r   r$   r   <module>rI      s$    " & & W W`IL `Ir$   