
    h                         d dl Zd dlZd dlmZ d dlmZmZmZ d dl	m
Z
 d dlmZ  ej                  e      Z G d de      Zy)    N)Path)DictIteratorUnion)Document)
BaseLoaderc                   d    e Zd ZdZ	 	 	 d
deeef   deedf   deedf   deddf
dZde	e
   fd	Zy)BSHTMLLoaderaS  
    __ModuleName__ document loader integration

    Setup:
        Install ``langchain-community`` and ``bs4``.

        .. code-block:: bash

            pip install -U langchain-community bs4

    Instantiate:
        .. code-block:: python

            from langchain_community.document_loaders import BSHTMLLoader

            loader = BSHTMLLoader(
                file_path="./example_data/fake-content.html",
            )

    Lazy load:
        .. code-block:: python

            docs = []
            docs_lazy = loader.lazy_load()

            # async variant:
            # docs_lazy = await loader.alazy_load()

            for doc in docs_lazy:
                docs.append(doc)
            print(docs[0].page_content[:100])
            print(docs[0].metadata)

        .. code-block:: python


            Test Title


            My First Heading
            My first paragraph.



            {'source': './example_data/fake-content.html', 'title': 'Test Title'}

    Async load:
        .. code-block:: python

            docs = await loader.aload()
            print(docs[0].page_content[:100])
            print(docs[0].metadata)

        .. code-block:: python



            Test Title


            My First Heading
            My first paragraph.



            {'source': './example_data/fake-content.html', 'title': 'Test Title'}

    N	file_pathopen_encoding	bs_kwargsget_text_separatorreturnc                     	 ddl }|| _        || _        |.t        j
                  j                  d      st        d      ddi}|| _        || _        y# t        $ r t        d      w xY w)a  initialize with path, and optionally, file encoding to use, and any kwargs
        to pass to the BeautifulSoup object.

        Args:
            file_path: The path to the file to load.
            open_encoding: The encoding to use when opening the file.
            bs_kwargs: Any kwargs to pass to the BeautifulSoup object.
            get_text_separator: The separator to use when calling get_text on the soup.
        r   NzUbeautifulsoup4 package not found, please install it with `pip install beautifulsoup4`lxmlzBy default BSHTMLLoader uses the 'lxml' package. Please either install it with `pip install -U lxml` or pass in init arg `bs_kwargs={'features': '...'}` to overwrite the default BeautifulSoup kwargs.features)	bs4ImportErrorr   r   	importlibutil	find_specr   r   )selfr   r   r   r   r   s         j/var/www/html/eduruby.in/venv/lib/python3.12/site-packages/langchain_community/document_loaders/html_bs.py__init__zBSHTMLLoader.__init__S   s     	 #*>>++F3!,  $V,I""4%  	/ 	s   A A(c              #     K   ddl m} t        | j                  d| j                        5 } ||fi | j
                  }ddd       j                  | j                        }|j                  r t        |j                  j                        }nd}t        | j                        |d}t        ||       y# 1 sw Y   zxY ww)	z)Load HTML document into document objects.r   )BeautifulSoupr)encodingN )sourcetitle)page_contentmetadata)r   r   openr   r   r   get_textr   r!   strstringr   )r   r   fsouptextr!   r#   s          r   	lazy_loadzBSHTMLLoader.lazy_loady   s     %$..#0B0BC 	6q 5dnn5D	6 }}T445::

))*EE $..)1
 D8<<	6 	6s   )CB8A9C8C=C)NNr   )__name__
__module____qualname____doc__r   r&   r   dictr   r   r   r+        r   r
   r
      sv    CP +/'+"$$5d#$5 S$Y'$5 t$	$5
  $5 
$5L=8H- =r2   r
   )importlib.utilr   loggingpathlibr   typingr   r   r   langchain_core.documentsr   )langchain_community.document_loaders.baser   	getLoggerr,   loggerr
   r1   r2   r   <module>r;      s:       ( ( - @			8	$~=: ~=r2   