
    h;                         d Z ddlZddlZddlZddlmZmZmZmZm	Z	m
Z
mZmZ ddlZddlZddlmZ ddlmZ ddlmZ ddlmZ  ej.                  e      Z e       dd	d
ddddZdededefdZ G d de      Zy)zWeb base loader class.    N)AnyAsyncIteratorDictIteratorListOptionalSequenceUnion)
deprecated)Document)
BaseLoader)get_user_agentzJtext/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8zen-US,en;q=0.5zhttps://www.google.com/1z
keep-alive)
User-AgentAcceptzAccept-LanguageRefererDNT
ConnectionzUpgrade-Insecure-Requestssoupurlreturnc                    d|i}| j                  d      x}r|j                         |d<   | j                  dddi      x}r|j                  dd      |d<   | j                  d	      x}r|j                  d
d      |d<   |S )z)Build metadata from BeautifulSoup output.sourcetitlemetanamedescription)attrscontentzNo description found.htmllangzNo language found.language)findget_textget)r   r   metadatar   r   r    s         k/var/www/html/eduruby.in/venv/lib/python3.12/site-packages/langchain_community/document_loaders/web_base.py_build_metadatar(      s    #H		'""u"!NN,iiv}.EiFF{F"-//)=T"Uyy  t #xx0DEO    c            &          e Zd ZdZ	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d2ddddeeee   f   dee   de	d	ee   d
e	de	dee   dee   de
dedeeeef      de	deeeef      deeeef      dede	de	ddf$dZedefd       Z	 d3dede
de
dedef
dZdedej&                  defd Zd!ee   defd"Zed#eddfd$       Z	 d4d%ed!ee   d#eedf   dee   fd&Zd4d!ee   d#eedf   dee   fd'Z	 d4d!ee   d#eedf   dee   fd(Z	 	 d5ded#eedf   dee   defd)Zd4d#eedf   defd*Zdee   fd+Z de!e   fd,Z" e#d-d.d/0      dee   fd1       Z$y)6WebBaseLoaderaQ  
    WebBaseLoader document loader integration

    Setup:
        Install ``langchain_community``.

        .. code-block:: bash

            pip install -U langchain_community

    Instantiate:
        .. code-block:: python

            from langchain_community.document_loaders import WebBaseLoader

            loader = WebBaseLoader(
                web_path = "https://www.espn.com/"
                # header_template = None,
                # verify_ssl = True,
                # proxies = None,
                # continue_on_failure = False,
                # autoset_encoding = True,
                # encoding = None,
                # web_paths = (),
                # requests_per_second = 2,
                # default_parser = "html.parser",
                # requests_kwargs = None,
                # raise_for_status = False,
                # bs_get_text_kwargs = None,
                # bs_kwargs = None,
                # session = None,
                # show_progress = True,
                # trust_env = False,
            )

    Lazy load:
        .. code-block:: python

            docs = []
            for doc in loader.lazy_load():
                docs.append(doc)
            print(docs[0].page_content[:100])
            print(docs[0].metadata)

        .. code-block:: python

            ESPN - Serving Sports Fans. Anytime. Anywhere.

            {'source': 'https://www.espn.com/', 'title': 'ESPN - Serving Sports Fans. Anytime. Anywhere.', 'description': 'Visit ESPN for live scores, highlights and sports news. Stream exclusive games on ESPN+ and play fantasy sports.', 'language': 'en'}


    Async load:
        .. code-block:: python

            docs = []
            async for doc in loader.alazy_load():
                docs.append(doc)
            print(docs[0].page_content[:100])
            print(docs[0].metadata)

        .. code-block:: python

            ESPN - Serving Sports Fans. Anytime. Anywhere.

            {'source': 'https://www.espn.com/', 'title': 'ESPN - Serving Sports Fans. Anytime. Anywhere.', 'description': 'Visit ESPN for live scores, highlights and sports news. Stream exclusive games on ESPN+ and play fantasy sports.', 'language': 'en'}

    .. versionchanged:: 0.3.14

        Deprecated ``aload`` (which was not async) and implemented a native async
        ``alazy_load``. Expand below for more details.

        .. dropdown:: How to update ``aload``

            Instead of using ``aload``, you can use ``load`` for synchronous loading or
            ``alazy_load`` for asynchronous lazy loading.

            Example using ``load`` (synchronous):

            .. code-block:: python

                docs: List[Document] = loader.load()

            Example using ``alazy_load`` (asynchronous):

            .. code-block:: python

                docs: List[Document] = []
                async for doc in loader.alazy_load():
                    docs.append(doc)

            This is in preparation for accommodating an asynchronous ``aload`` in the
            future:

            .. code-block:: python

                docs: List[Document] = await loader.aload()

    NTF)show_progress	trust_envweb_pathheader_template
verify_sslproxiescontinue_on_failureautoset_encodingencoding	web_pathsrequests_per_seconddefault_parserrequests_kwargsraise_for_statusbs_get_text_kwargs	bs_kwargssessionr,   r-   r   c                8   |r|rt        d      |rt        |      | _        n^t        |t              r	|g| _        nEt        |t
              rt        |      | _        n$t        dt        |       dt        |       d      |	| _        |
| _	        |xs i | _
        || _        || _        |xs i | _        |xs i | _        |r|| _        nt!        j"                         }|xs t$        j'                         }|j)                  d      s	 ddlm}  |       j.                  |d<   t7        |      |_        ||_        |r|j<                  j?                  |       || _        || _         || _!        || _"        || _#        y	# t0        $ r t2        j5                  d       Y xw xY w)
a  Initialize loader.

        Args:
            web_paths: Web paths to load from.
            requests_per_second: Max number of concurrent requests to make.
            default_parser: Default parser to use for BeautifulSoup.
            requests_kwargs: kwargs for requests
            raise_for_status: Raise an exception if http status code denotes an error.
            bs_get_text_kwargs: kwargs for beatifulsoup4 get_text
            bs_kwargs: kwargs for beatifulsoup4 web page parsing
            show_progress: Show progress bar when loading pages.
            trust_env: set to True if using proxy to make web requests, for example
                using http(s)_proxy environment variables. Defaults to False.
        zmReceived web_path and web_paths. Only one can be specified. web_path is deprecated, web_paths should be used.z+web_path must be str or Sequence[str] got (z*) or web_paths must be Sequence[str] got ()r   r   )	UserAgentzxfake_useragent not found, using default user agent.To get a realistic header for requests, `pip install fake_useragent`.N)$
ValueErrorlistr5   
isinstancestrr	   	TypeErrortyper6   r7   r8   r9   r,   r:   r;   r<   requestsSessiondefault_header_templatecopyr%   fake_useragentr?   randomImportErrorloggerinfodictheadersverifyr1   updater2   r3   r4   r-   )selfr.   r/   r0   r1   r2   r3   r4   r5   r6   r7   r8   r9   r:   r;   r<   r,   r-   r?   s                      r'   __init__zWebBaseLoader.__init__   s   H 	D  !)_DN#&&ZDN(+!(^DN=d8n=M N99=i8IL  $7 ,.4" 0*"4":"b"DL&&(G-O1H1M1M1OO"&&|4	84=K4F4FOL1 #?3GO'GN&&w/"DL#6  0 " # KK8s   E8 8FFc                 f    t        | j                        dkD  rt        d      | j                  d   S )N   zMultiple webpaths found.r   )lenr5   r@   )rS   s    r'   r.   zWebBaseLoader.web_path   s.    t~~"788~~a  r)   r   retriescooldownbackoffc                   K   t        j                  | j                        4 d {   }t        |      D ]  }	 t	        | j
                  j                  | j
                  j                  j                               }| j
                  j                  sd|d<    |j                  |fi | j                  |z  4 d {   }| j                  r|j                          |j                          d {   cd d d       d {    c cd d d       d {    S  d d d       d {    t'        d      7 7 {7 I7 ;7 *# 1 d {  7  sw Y   nxY w!# t         j                  $ r]}	||dz
  k(  r t        j!                  d| d|dz    d| d	|	 d
	       t#        j$                  |||z  z         d {  7   Y d }	~	d }	~	ww xY w7 # 1 d {  7  sw Y   t'        d      xY ww)N)r-   )rP   cookiesFsslrV   Error fetching z with attempt /z: z. Retrying...zretry count exceeded)aiohttpClientSessionr-   rangerO   r<   rP   r\   get_dictrQ   r%   r8   r9   textClientConnectionErrorrM   warningasynciosleepr@   )
rS   r   rX   rY   rZ   r<   ikwargsresponsees
             r'   _fetchzWebBaseLoader._fetch   s     ((4>>B 	C 	Cg7^ CC#' $ 4 4 $ 4 4 = = ?$F  <<..(-u*w{{  $ 4 4v =  5 5!00$557%-]]_45 5 5	C 	C 	CC	C 	C2 /003	C5
  55	C5 5 5 5 44 CGaK'-cU. 1ugQwir!MC &mmHwz,ABBBC!	C 	C 	C 	C2 /00s   %G,D6G,GBE=D9
>E0E1D;
2E5ED=
EGG,D?G,GG,&G'G,9E;E=E?G,EE
EEGG+AG7F:
8G=GGGG,G)GG)G,	semaphorec                 z  K   |4 d {    	 | j                  |       d {   cd d d       d {    S 7 /7 7 	# t        $ r[}| j                  r/t        j	                  d| d       Y d }~d d d       d {  7   yt        j                  d| d       |d }~ww xY w# 1 d {  7  sw Y   y xY ww)Nr^   z*, skipping due to continue_on_failure=True za and aborting, use continue_on_failure=True to continue loading urls after encountering an error.)rm   	Exceptionr2   rM   rf   	exception)rS   r   rn   rl   s       r'   _fetch_with_rate_limitz$WebBaseLoader._fetch_with_rate_limit  s       	 	![[--	 	 	-	  ++NN)# /4 5 	 	 	   %cU +L L 	 	 	s   B;9B;B&?;?B;=B;?B;	B#%B-B&1B;<A?=B;BB##B&&B8,B/-B84B;urlsc                   K   t        j                  | j                        }g }|D ]8  }t        j                  | j	                  ||            }|j                  |       : 	 | j                  r"ddlm}  |j                  |dddd d{   S t        j                  |  d{   S 7 7 # t        $ r3 t        j                  d       t        j                  |  d{  7  cY S w xY ww)	z/Fetch all urls concurrently with rate limiting.r   )tqdm_asynciozFetching pagesTrV   )descasciiminintervalNz2For better logging of progress, `pip install tqdm`)rg   	Semaphorer6   ensure_futurers   appendr,   tqdm.asynciorv   gatherrL   warningswarn)rS   rt   rn   tasksr   taskrv   s          r'   	fetch_allzWebBaseLoader.fetch_all  s     %%d&>&>?	 	C(()D)DS))TUDLL		0!!50\00!11   %^^U333	 4 	0MMNO ////	0sf   AC-")B. B*B. C-B. %B,&B. )C-*B. ,B. .3C*!C$"C*'C-)C**C-parserc                 T    g d}| |vr t        ddj                  |      z   dz         y)z#Check that parser is valid for bs4.)html.parserlxmlxmlzlxml-xmlhtml5libz`parser` must be one of z, .N)r@   join)r   valid_parserss     r'   _check_parserzWebBaseLoader._check_parser.  s:     O&*TYY}-EEK  'r)   resultsc                     ddl m} g }t        |      D ]`  \  }}||   }|1|j                  d      rd}n| j                  }| j                  |       |j                   |||fi | j                         b |S )z0Unpack fetch results into BeautifulSoup objects.r   BeautifulSoup.xmlr   )bs4r   	enumerateendswithr7   r   r|   r;   )	rS   r   rt   r   r   final_resultsri   resultr   s	            r'   _unpack_fetch_resultsz#WebBaseLoader._unpack_fetch_results7  s     	&"7+ 	RIAvq'C~<<'"F!00F""6*  vv!P!PQ	R r)   c                 r    t        j                  | j                  |            }| j                  |||      S )z2Fetch all urls, then return soups for all results.r   )rg   runr   r   rS   rt   r   r   s       r'   
scrape_allzWebBaseLoader.scrape_allI  s1    ++dnnT23))'4)GGr)   c                 h   K   | j                  |       d{   }| j                  |||      S 7 w)z8Async fetch all urls, then return soups for all results.Nr   )r   r   r   s       r'   ascrape_allzWebBaseLoader.ascrape_allN  s6      t,,))'4)GG -s   202c                    ddl m} | |j                  d      rd}n| j                  }| j	                  |        | j
                  j                  |fi | j                  }| j                  r|j                          | j                  | j                  |_	        n| j                  r|j                  |_	         ||j                  |fi |xs i S )Nr   r   r   r   )r   r   r   r7   r   r<   r%   r8   r9   r4   r3   apparent_encodingrd   )rS   r   r   r;   r   html_docs         r'   _scrapezWebBaseLoader._scrapeU  s     	&>||F#,,6"#4<<##C@4+?+?@  %%'==$ $H"" ( : :HX]]FHyBHHr)   c                 R    | j                  | j                  || j                        S )z?Scrape data from webpage and return it in BeautifulSoup format.)r   r;   )r   r.   r;   )rS   r   s     r'   scrapezWebBaseLoader.scrapeo  s!     ||DMM&DNN|SSr)   c              #      K   | j                   D ]V  }| j                  || j                        } |j                  di | j                  }t        ||      }t        ||       X yw)z+Lazy load text from the url(s) in web_path.)r;   page_contentr&   N )r5   r   r;   r$   r:   r(   r   )rS   pathr   rd   r&   s        r'   	lazy_loadzWebBaseLoader.lazy_loadt  sc     NN 	AD<<<?D 4==;4#:#:;D&tT2Hx@@		As   A'A)c                  K   | j                  | j                         d{   }t        | j                  |      D ]=  \  }} |j                  di | j                  }t        ||      }t        ||       ? y7 [w)z1Async lazy load text from the url(s) in web_path.Nr   r   )r   r5   zipr$   r:   r(   r   )rS   r   r   r   rd   r&   s         r'   
alazy_loadzWebBaseLoader.alazy_load|  st     ((88dnng6 	AJD$ 4==;4#:#:;D&tT2Hx@@	A 9s   A?A=AA?z0.3.14z1.0zSee API reference for updated usage: https://python.langchain.com/api_reference/community/document_loaders/langchain_community.document_loaders.web_base.WebBaseLoader.html)sinceremovalmessagec                    | j                  | j                        }g }t        | j                  |      D ]I  \  }} |j                  di | j                  }t        ||      }|j                  t        ||             K |S )z9Load text from the urls in web_path async into Documents.r   r   )r   r5   r   r$   r:   r(   r|   r   )rS   r   docsr   r   rd   r&   s          r'   aloadzWebBaseLoader.aload  sz     //$..1dnng6 	HJD$ 4==;4#:#:;D&tT2HKKdXFG	H
 r)   )rp   NTNFTNr      r   NFNNN)   r   g      ?)N)NN)%__name__
__module____qualname____doc__r
   rC   r	   r   rO   boolintr   r   rT   propertyr.   floatrm   rg   rz   rs   r   r   staticmethodr   r   r   r   r   r   r   r   r   r   r   r   r   r   r)   r'   r+   r+   *   s_   aJ /1*."&$)!%"&#%#$+48!&7;.2!S#$ #'S#Xc]*+S# "$S# 	S#
 $S# "S# S# 3-S# C=S# !S# S# "$sCx.1S# S# %T#s(^4S# DcN+S#  !S#$ %S#& 'S#( 
)S#j !# ! ! OR11!$1471FK1	1<#*#4#4	&0DI 0# 0( c d   IM"&s)5:395E	c$HtCy H%T	2B HdSVi H ;?HIH',S$Y'7H	cH $($(	II c4i I D>	I
 
I4TU39- T T
A8H- AA-"9 A U	
tH~ 

r)   r+   )r   rg   loggingr   typingr   r   r   r   r   r   r	   r
   r`   rF   langchain_core._apir   langchain_core.documentsr   )langchain_community.document_loaders.baser   $langchain_community.utils.user_agentr   	getLoggerr   rM   rH   rC   rO   r(   r+   r   r)   r'   <module>r      s        V V V   * - @ ?			8	$ !"'(!$	 	# 	C 	D 	lJ lr)   