"""Module contains code for a cache backed embedder.

The cache backed embedder is a wrapper around an embedder that caches
embeddings in a key-value store. The cache is used to avoid recomputing
embeddings for the same text.

The text is hashed and the hash is used as the key in the cache.
"""

from __future__ import annotations

import hashlib
import json
import uuid
import warnings
from collections.abc import Sequence
from typing import Callable, Literal, Optional, Union, cast

from langchain_core.embeddings import Embeddings
from langchain_core.stores import BaseStore, ByteStore
from langchain_core.utils.iter import batch_iterate

from langchain.storage.encoder_backed import EncoderBackedStore

NAMESPACE_UUID = uuid.UUID(int=1985)


def _sha1_hash_to_uuid(text: str) -> uuid.UUID:
    """Return a UUID derived from *text* using SHA-1 (deterministic).

    Deterministic and fast, **but not collision-resistant**.

    A malicious attacker could try to create two different texts that hash to the same
    UUID. This may not necessarily be an issue in the context of caching embeddings,
    but new applications should swap this out for a stronger hash function like
    xxHash, BLAKE2 or SHA-256, which are collision-resistant.
    """
    # `usedforsecurity=False` marks SHA-1 as used only for cache-key derivation.
    sha1_hex = hashlib.sha1(text.encode("utf-8"), usedforsecurity=False).hexdigest()
    # Embed the hex digest in a UUID-5 under a fixed namespace to get a stable key.
    return uuid.uuid5(NAMESPACE_UUID, sha1_hex)


def _make_default_key_encoder(namespace: str, algorithm: str) -> Callable[[str], str]:
    """Create a default key encoder function.

    Args:
        namespace: Prefix that segregates keys from different embedding models.
        algorithm:
           * ``'sha1'`` - fast but not collision-resistant
           * ``'blake2b'`` - cryptographically strong, faster than SHA-1
           * ``'sha256'`` - cryptographically strong, slower than SHA-1
           * ``'sha512'`` - cryptographically strong, slower than SHA-1

    Returns:
        A function that encodes a key using the specified algorithm.
    """
    if algorithm == "sha1":
        _warn_about_sha1_encoder()

    def _key_encoder(key: str) -> str:
        """Encode a key using the specified algorithm."""
        if algorithm == "sha1":
            return f"{namespace}{_sha1_hash_to_uuid(key)}"
        if algorithm == "blake2b":
            return f"{namespace}{hashlib.blake2b(key.encode('utf-8')).hexdigest()}"
        if algorithm == "sha256":
            return f"{namespace}{hashlib.sha256(key.encode('utf-8')).hexdigest()}"
        if algorithm == "sha512":
            return f"{namespace}{hashlib.sha512(key.encode('utf-8')).hexdigest()}"
        msg = f"Unsupported algorithm: {algorithm}"
        raise ValueError(msg)

    return _key_encoder


def _value_serializer(value: Sequence[float]) -> bytes:
    """Serialize a value."""
    return json.dumps(value).encode()


def _value_deserializer(serialized_value: bytes) -> list[float]:
    """Deserialize a value."""
    return cast(list[float], json.loads(serialized_value.decode()))


_warned_about_sha1: bool = False


def _warn_about_sha1_encoder() -> None:
    """Emit a one-time warning about SHA-1 collision weaknesses."""
    global _warned_about_sha1
    if not _warned_about_sha1:
        warnings.warn(
            "Using default key encoder: SHA-1 is *not* collision-resistant. "
            "While acceptable for most cache scenarios, a motivated attacker "
            "can craft two different payloads that map to the same cache key. "
            "If that risk matters in your environment, supply a stronger "
            "encoder (e.g. SHA-256 or BLAKE2) via the `key_encoder` argument. "
            "If you change the key encoder, consider also creating a new cache, "
            "to avoid (the potential for) collisions with existing keys.",
            category=UserWarning,
            stacklevel=2,
        )
        _warned_about_sha1 = True


class CacheBackedEmbeddings(Embeddings):
    """Interface for caching results from embedding models.

    The interface works with any store that implements the abstract store
    interface accepting keys of type str and values of list of floats.

    If need be, the interface can be extended to accept other implementations
    of the value serializer and deserializer, as well as the key encoder.

    Note that by default only document embeddings are cached. To cache query
    embeddings too, pass in a query_embedding_store to the constructor.

    Examples:
        .. code-block:: python

            from langchain.embeddings import CacheBackedEmbeddings
            from langchain.storage import LocalFileStore
            from langchain_community.embeddings import OpenAIEmbeddings

            store = LocalFileStore('./my_cache')

            underlying_embedder = OpenAIEmbeddings()
            embedder = CacheBackedEmbeddings.from_bytes_store(
                underlying_embedder, store, namespace=underlying_embedder.model
            )

            # Embedding is computed and cached
            embeddings = embedder.embed_documents(["hello", "goodbye"])

            # Embeddings are retrieved from the cache, no computation is done
            embeddings = embedder.embed_documents(["hello", "goodbye"])
    """

    def __init__(
        self,
        underlying_embeddings: Embeddings,
        document_embedding_store: BaseStore[str, list[float]],
        *,
        batch_size: Optional[int] = None,
        query_embedding_store: Optional[BaseStore[str, list[float]]] = None,
    ) -> None:
        """Initialize the embedder.

        Args:
            underlying_embeddings: the embedder to use for computing embeddings.
            document_embedding_store: The store to use for caching document embeddings.
            batch_size: The number of documents to embed between store updates.
            query_embedding_store: The store to use for caching query embeddings.
                If ``None``, query embeddings are not cached.
        """
        super().__init__()
        self.document_embedding_store = document_embedding_store
        self.query_embedding_store = query_embedding_store
        self.underlying_embeddings = underlying_embeddings
        self.batch_size = batch_size

    def embed_documents(self, texts: list[str]) -> list[list[float]]:
        """Embed a list of texts.

        The method first checks the cache for the embeddings.
        If the embeddings are not found, the method uses the underlying embedder
        to embed the documents and stores the results in the cache.

        Args:
            texts: A list of texts to embed.

        Returns:
            A list of embeddings for the given texts.
        """
        vectors: list[Union[list[float], None]] = self.document_embedding_store.mget(
            texts
        )
        all_missing_indices: list[int] = [
            i for i, vector in enumerate(vectors) if vector is None
        ]

        # Embed the cache misses in batches and write them back to the store.
        for missing_indices in batch_iterate(self.batch_size, all_missing_indices):
            missing_texts = [texts[i] for i in missing_indices]
            missing_vectors = self.underlying_embeddings.embed_documents(missing_texts)
            self.document_embedding_store.mset(
                list(zip(missing_texts, missing_vectors))
            )
            for index, updated_vector in zip(missing_indices, missing_vectors):
                vectors[index] = updated_vector

        # All Nones have been filled in by now.
        return cast(list[list[float]], vectors)

    async def aembed_documents(self, texts: list[str]) -> list[list[float]]:
        """Embed a list of texts.

        The method first checks the cache for the embeddings.
        If the embeddings are not found, the method uses the underlying embedder
        to embed the documents and stores the results in the cache.

        Args:
            texts: A list of texts to embed.

        Returns:
            A list of embeddings for the given texts.
        """
        vectors: list[Union[list[float], None]] = (
            await self.document_embedding_store.amget(texts)
        )
        all_missing_indices: list[int] = [
            i for i, vector in enumerate(vectors) if vector is None
        ]

        # Embed the cache misses in batches and write them back to the store.
        for missing_indices in batch_iterate(self.batch_size, all_missing_indices):
            missing_texts = [texts[i] for i in missing_indices]
            missing_vectors = await self.underlying_embeddings.aembed_documents(
                missing_texts
            )
            await self.document_embedding_store.amset(
                list(zip(missing_texts, missing_vectors))
            )
            for index, updated_vector in zip(missing_indices, missing_vectors):
                vectors[index] = updated_vector

        # All Nones have been filled in by now.
        return cast(list[list[float]], vectors)

    def embed_query(self, text: str) -> list[float]:
        """Embed query text.

        By default, this method does not cache queries. To enable caching, pass a
        ``query_embedding_store`` to the constructor, or set
        ``query_embedding_cache=True`` when creating the embedder via
        ``from_bytes_store``.

        Args:
            text: The text to embed.

        Returns:
            The embedding for the given text.
        """
        if not self.query_embedding_store:
            return self.underlying_embeddings.embed_query(text)

        (cached,) = self.query_embedding_store.mget([text])
        if cached is not None:
            return cached

        vector = self.underlying_embeddings.embed_query(text)
        self.query_embedding_store.mset([(text, vector)])
        return vector

    async def aembed_query(self, text: str) -> list[float]:
        """Embed query text.

        By default, this method does not cache queries. To enable caching, pass a
        ``query_embedding_store`` to the constructor, or set
        ``query_embedding_cache=True`` when creating the embedder via
        ``from_bytes_store``.

        Args:
            text: The text to embed.

        Returns:
            The embedding for the given text.
        """
        if not self.query_embedding_store:
            return await self.underlying_embeddings.aembed_query(text)

        (cached,) = await self.query_embedding_store.amget([text])
        if cached is not None:
            return cached

        vector = await self.underlying_embeddings.aembed_query(text)
        await self.query_embedding_store.amset([(text, vector)])
        return vector

    @classmethod
    def from_bytes_store(
        cls,
        underlying_embeddings: Embeddings,
        document_embedding_cache: ByteStore,
        *,
        namespace: str = "",
        batch_size: Optional[int] = None,
        query_embedding_cache: Union[bool, ByteStore] = False,
        key_encoder: Union[
            Callable[[str], str], Literal["sha1", "blake2b", "sha256", "sha512"]
        ] = "sha1",
    ) -> CacheBackedEmbeddings:
        """On-ramp that adds the necessary serialization and encoding to the store.

        Args:
            underlying_embeddings: The embedder to use for embedding.
            document_embedding_cache: The cache to use for storing document embeddings.
            namespace: The namespace to use for the document cache.
                This namespace is used to avoid collisions with other caches.
                For example, set it to the name of the embedding model used.
            batch_size: The number of documents to embed between store updates.
            query_embedding_cache: The cache to use for storing query embeddings.
                True to use the same cache as document embeddings.
                False to not cache query embeddings.
            key_encoder: Optional callable to encode keys. If not provided,
                a default encoder using SHA-1 will be used. SHA-1 is not
                collision-resistant, and a motivated attacker could craft two
                different texts that hash to the same cache key.

                New applications should use one of the alternative encoders
                or provide a custom and strong key encoder function to avoid this risk.

                If you change a key encoder in an existing cache, consider
                just creating a new cache, to avoid (the potential for)
                collisions with existing keys or having duplicate keys
                for the same text in the cache.

        Returns:
            An instance of CacheBackedEmbeddings that uses the provided cache.
        """
        if isinstance(key_encoder, str):
            key_encoder = _make_default_key_encoder(namespace, key_encoder)
        elif callable(key_encoder):
            if namespace:
                msg = (
                    "Do not supply `namespace` when using a custom key_encoder; "
                    "add any prefixing inside the encoder itself."
                )
                raise ValueError(msg)
        else:
            msg = (
                "key_encoder must be either 'blake2b', 'sha1', 'sha256', 'sha512' "
                "or a callable that encodes keys."
            )
            raise ValueError(msg)

        document_embedding_store = EncoderBackedStore[str, list[float]](
            document_embedding_cache,
            key_encoder,
            _value_serializer,
            _value_deserializer,
        )
        if query_embedding_cache is True:
            query_embedding_store = document_embedding_store
        elif query_embedding_cache is False:
            query_embedding_store = None
        else:
            query_embedding_store = EncoderBackedStore[str, list[float]](
                query_embedding_cache,
                key_encoder,
                _value_serializer,
                _value_deserializer,
            )
        return cls(
            underlying_embeddings,
            document_embedding_store,
            batch_size=batch_size,
            query_embedding_store=query_embedding_store,
        )
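

if __name__ == "__main__":
    # Illustrative usage sketch, not part of the original module. It assumes
    # `InMemoryByteStore` from langchain_core is available and uses a hypothetical
    # toy embedder (`_ToyEmbeddings`), so it runs without network access or API keys.
    # It demonstrates document caching, opt-in query caching, and selecting a
    # SHA-256 key encoder instead of the default SHA-1 one.
    from langchain_core.stores import InMemoryByteStore

    class _ToyEmbeddings(Embeddings):
        """Toy embedder: a text maps to [its length, its vowel count]."""

        def embed_documents(self, texts: list[str]) -> list[list[float]]:
            return [self.embed_query(text) for text in texts]

        def embed_query(self, text: str) -> list[float]:
            return [float(len(text)), float(sum(ch in "aeiou" for ch in text))]

    embedder = CacheBackedEmbeddings.from_bytes_store(
        _ToyEmbeddings(),
        InMemoryByteStore(),
        namespace="toy-model:",
        key_encoder="sha256",  # stronger than the default SHA-1 encoder
        query_embedding_cache=True,  # also cache embed_query results
    )

    first = embedder.embed_documents(["hello", "goodbye"])  # computed and cached
    second = embedder.embed_documents(["hello", "goodbye"])  # served from the cache
    assert first == second
    print(embedder.embed_query("hello"))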