
    h2                    ^   d dl mZ d dlZd dlZd dlmZmZ d dlmZ d dl	m
Z
 d dlmZmZmZmZmZmZmZ d dlmZmZ d dlmZ erd d	lmZmZmZ d d
lmZ  ej<                  e      Z  edd      Z! G d dee      Z" G d de"      Z# G d de$e
      Z% ed       G d d             Z&ddZ'y)    )annotationsN)ABCabstractmethod)	dataclass)Enum)TYPE_CHECKINGAnyCallableLiteralOptionalTypeVarUnion)BaseDocumentTransformerDocument)Self)
CollectionIterableSequence)SetTSTextSplitter)boundc                      e Zd ZdZddedddf	 	 	 	 	 	 	 	 	 	 	 	 	 ddZedd       Z	 d	 	 	 	 	 dd	Zdd
Z	ddZ
ddZedd       Zedd e       df	 	 	 	 	 	 	 	 	 	 	 dd       Z	 	 	 	 	 	 ddZy)r   z)Interface for splitting text into chunks.i     FTc                    |dk  rd| }t        |      |dk  rd| }t        |      ||kD  rd| d| d}t        |      || _        || _        || _        || _        || _        || _        y)ad  Create a new TextSplitter.

        Args:
            chunk_size: Maximum size of chunks to return
            chunk_overlap: Overlap in characters between chunks
            length_function: Function that measures the length of given chunks
            keep_separator: Whether to keep the separator and where to place it
                            in each corresponding chunk (True='start')
            add_start_index: If `True`, includes chunk's start index in metadata
            strip_whitespace: If `True`, strips whitespace from the start and end of
                              every document
        r   zchunk_size must be > 0, got z chunk_overlap must be >= 0, got zGot a larger chunk overlap (z) than chunk size (z), should be smaller.N)
ValueError_chunk_size_chunk_overlap_length_function_keep_separator_add_start_index_strip_whitespace)self
chunk_sizechunk_overlaplength_functionkeep_separatoradd_start_indexstrip_whitespacemsgs           [/var/www/html/eduruby.in/venv/lib/python3.12/site-packages/langchain_text_splitters/base.py__init__zTextSplitter.__init__!   s    * ?0=CS/!14]ODCS/!:%.}o ><46  S/!%+ /- /!1    c                     y)z$Split text into multiple components.N )r#   texts     r+   
split_textzTextSplitter.split_textI   s    r-   Nc           	        |xs i gt        |      z  }g }t        |      D ]  \  }}d}d}| j                  |      D ]  }	t        j                  ||         }
| j
                  r>||z   | j                  z
  }|j                  |	t        d|            }||
d<   t        |	      }t        |	|
      }|j                  |         |S )z&Create documents from a list of texts.r   start_index)page_contentmetadata)len	enumerater1   copydeepcopyr!   r   findmaxr   append)r#   texts	metadatas
_metadatas	documentsir0   indexprevious_chunk_lenchunkr5   offsetnew_docs                r+   create_documentszTextSplitter.create_documentsM   s     32$U"3
	 ' 	*GAtE!". *==A7(("%77$:M:MMF IIeSF^<E.3H]+),U&"I  )*	* r-   c                    g g }}|D ]8  }|j                  |j                         |j                  |j                         : | j                  ||      S )zSplit documents.)r>   )r<   r4   r5   rG   )r#   r@   r=   r>   docs        r+   split_documentszTextSplitter.split_documentsa   sV    ry 	+CLL))*S\\*	+ $$Ui$@@r-   c                l    |j                  |      }| j                  r|j                         }|dk(  ry |S )N )joinr"   strip)r#   docs	separatorr0   s       r+   
_join_docszTextSplitter._join_docsi   s3    ~~d#!!::<D2:r-   c                d   | j                  |      }g }g }d}|D ]m  }| j                  |      }||z   t        |      dkD  r|ndz   | j                  kD  r
|| j                  kD  r%t        j	                  d| d| j                          t        |      dkD  r| j                  ||      }	|	|j                  |	       || j                  kD  s*||z   t        |      dkD  r|ndz   | j                  kD  ro|dkD  rj|| j                  |d         t        |      dkD  r|ndz   z  }|dd  }|| j                  kD  r?||z   t        |      dkD  r|ndz   | j                  kD  r|dkD  rj|j                  |       ||t        |      dkD  r|ndz   z  }p | j                  ||      }	|	|j                  |	       |S )Nr   zCreated a chunk of size z%, which is longer than the specified    )r   r6   r   loggerwarningrQ   r<   r   )
r#   splitsrP   separator_lenrO   current_doctotald_lenrI   s
             r+   _merge_splitszTextSplitter._merge_splitsq   s    --i8!# 	KA((+D[1AA1E1M""# 4+++NN25' :>>B>N>N=OQ {#a'//+yACC(  $"5"55[9IA9MSTU**+!AI!6!6{1~!F-0-=-AMq"  '2!"o  $"5"55[9IA9MSTU**+!AI q!Tc+.>.B]JJE9	K: ook95?KKr-   c                    	 ddl m} t        |      sd}t        |      dfd} | d	d|i|S # t        $ r}d}t        |      |d}~ww xY w)
z>Text splitter that uses HuggingFace tokenizer to count length.r   )PreTrainedTokenizerBasezATokenizer received was not an instance of PreTrainedTokenizerBasec                8    t        j                  |             S N)r6   tokenizer0   	tokenizers    r+   _huggingface_tokenizer_lengthzNTextSplitter.from_huggingface_tokenizer.<locals>._huggingface_tokenizer_length   s    9--d344r-   z`Could not import transformers python package. Please install it with `pip install transformers`.Nr&   r0   strreturnintr/   )$transformers.tokenization_utils_baser^   
isinstancer   ImportError)clsrc   kwargsr^   r*   rd   errs    `     r+   from_huggingface_tokenizerz'TextSplitter.from_huggingface_tokenizer   sp    	+Ti)@AW  !o%5 K#@KFKK  	+E  S/s*	+s   %2 	AA		Agpt2allc                    	 ddl }||j                  |      n|j                  |      dfd}	t	        | t
              r||d}
i ||
} | dd|	i|S # t        $ r}d}t        |      |d}~ww xY w)	z9Text splitter that uses tiktoken encoder to count length.r   NzCould not import tiktoken python package. This is needed in order to calculate max_tokens_for_prompt. Please install it with `pip install tiktoken`.c                >    t        j                  |             S N)allowed_specialdisallowed_special)r6   encode)r0   ru   rv   encs    r+   _tiktoken_encoderz=TextSplitter.from_tiktoken_encoder.<locals>._tiktoken_encoder   s*    

$3'9   r-   )encoding_name
model_nameru   rv   r&   re   r/   )tiktokenrk   encoding_for_modelget_encoding
issubclassTokenTextSplitter)rl   rz   r{   ru   rv   rm   r|   rn   r*   ry   extra_kwargsrx   s      ``      @r+   from_tiktoken_encoderz"TextSplitter.from_tiktoken_encoder   s    	, !--j9C''6C	 c,-!.(#2&8	L 0/,/F?#4????  	,A 
 c"+	,s   A 	A9&A44A9c                6    | j                  t        |            S )z2Transform sequence of documents by splitting them.)rJ   list)r#   r@   rm   s      r+   transform_documentsz TextSplitter.transform_documents   s     ##DO44r-   )r$   rh   r%   rh   r&   zCallable[[str], int]r'   z$Union[bool, Literal['start', 'end']]r(   boolr)   r   rg   Noner0   rf   rg   	list[str]r`   )r=   r   r>   zOptional[list[dict[Any, Any]]]rg   list[Document])r@   zIterable[Document]rg   r   )rO   r   rP   rf   rg   Optional[str])rV   zIterable[str]rP   rf   rg   r   )rc   r	   rm   r	   rg   r   )rz   rf   r{   r   ru   'Union[Literal['all'], AbstractSet[str]]rv   &Union[Literal['all'], Collection[str]]rm   r	   rg   r   )r@   Sequence[Document]rm   r	   rg   r   )__name__
__module____qualname____doc__r6   r,   r   r1   rG   rJ   rQ   r\   classmethodro   setr   r   r/   r-   r+   r   r      sH   3  03?D %!%&2&2 &2 .	&2
 =&2 &2 &2 
&2P 3 3 MQ+I	(A(T L L,  $$(CF5EJ*@*@ "*@ A	*@
 C*@ *@ 
*@ *@X5+57:5	5r-   c                  V     e Zd ZdZdd e       df	 	 	 	 	 	 	 	 	 	 	 d fdZddZ xZS )	r   z/Splitting text to tokens using model tokenizer.rp   Nrq   c                    t        
|   di | 	 ddl}||j	                  |      }	n|j                  |      }	|	| _        || _        || _        y# t        $ r}d}t        |      |d}~ww xY w)zCreate a new TextSplitter.r   NzCould not import tiktoken python package. This is needed in order to for TokenTextSplitter. Please install it with `pip install tiktoken`.r/   )	superr,   r|   rk   r}   r~   
_tokenizer_allowed_special_disallowed_special)r#   rz   r{   ru   rv   rm   r|   rn   r*   rx   	__class__s             r+   r,   zTokenTextSplitter.__init__   s     	"6"	, !--j9C''6C /#5   	,A 
 c"+	,s   A 	A-A((A-c                     d fd}t         j                   j                   j                  j                  |      }t        ||      S )a  Splits the input text into smaller chunks based on tokenization.

        This method uses a custom tokenizer configuration to encode the input text
        into tokens, processes the tokens in chunks of a specified size with overlap,
        and decodes them back into text chunks. The splitting is performed using the
        `split_text_on_tokens` function.

        Args:
            text (str): The input text to be split into smaller chunks.

        Returns:
            List[str]: A list of text chunks, where each chunk is derived from a portion
            of the input text based on the tokenization and chunking rules.
        c                h    j                   j                  | j                  j                        S rt   )r   rw   r   r   )_textr#   s    r+   _encodez-TokenTextSplitter.split_text.<locals>._encode  s4    ??)) $ 5 5#'#;#; *  r-   )r%   tokens_per_chunkdecoderw   rb   )r   rf   rg   z	list[int])	Tokenizerr   r   r   r   split_text_on_tokens)r#   r0   r   rc   s   `   r+   r1   zTokenTextSplitter.split_text  sE     	 --!--??))	
	 $CCr-   )rz   rf   r{   r   ru   r   rv   r   rm   r	   rg   r   r   )r   r   r   r   r   r,   r1   __classcell__)r   s   @r+   r   r      s]    9 $$(CF5EJ66 "6 A	6
 C6 6 
68Dr-   r   c                  |    e Zd ZdZdZdZdZdZdZdZ	dZ
d	Zd
ZdZdZdZdZdZdZdZdZdZdZdZdZdZdZdZdZdZdZy)Languagez"Enum of the programming languages.cppgojavakotlinjstsphpprotopythonrstrubyrustscalaswiftmarkdownlatexhtmlsolcsharpcobolcluaperlhaskellelixir
powershellvisualbasic6N)r   r   r   r   CPPGOJAVAKOTLINJSr   PHPPROTOPYTHONRSTRUBYRUSTSCALASWIFTMARKDOWNLATEXHTMLSOLCSHARPCOBOLCLUAPERLHASKELLELIXIR
POWERSHELLVISUALBASIC6r/   r-   r+   r   r   &  s    ,
C	BDF	B	B
CEF
CDDEEHED
CFEA
CDGFJ!Lr-   r   T)frozenc                  @    e Zd ZU dZded<   	 ded<   	 ded<   	 ded<   y	)
r   zTokenizer data class.rh   r%   r   zCallable[[list[int]], str]r   zCallable[[str], list[int]]rw   N)r   r   r   r   __annotations__r/   r-   r+   r   r   F  s)    *,&&=&&=r-   r   c                   g }|j                  |       }d}t        ||j                  z   t        |            }||| }|t        |      k  r|j	                  |j                  |             |t        |      k(  r	 |S ||j                  |j                  z
  z  }t        ||j                  z   t        |            }||| }|t        |      k  r|S )z6Split incoming text and return chunks using tokenizer.r   )rw   minr   r6   r<   r   r%   )r0   rc   rV   	input_ids	start_idxcur_idx	chunk_idss          r+   r   r   T  s    F  &II)i888#i.IG)G,I
c)n
$i&&y12c)n$ M 	Y//)2I2III	i)"<"<<c)nMi0	 c)n
$ Mr-   )r0   rf   rc   r   rg   r   )(
__future__r   r8   loggingabcr   r   dataclassesr   enumr   typingr   r	   r
   r   r   r   r   langchain_core.documentsr   r   typing_extensionsr   collections.abcr   r   r   r   AbstractSet	getLoggerr   rT   r   r   r   rf   r   r   r   r/   r-   r+   <module>r      s    "   # !    G ">>2			8	$T(E5*C E5P=D =D@"sD "@ $
> 
> 
>r-   