
    h #                         d dl Z d dlZd dlZd dlmZmZ d dlmZmZm	Z	m
Z
mZmZmZmZmZ d dlZd dlZd dlmZ d dlmZ d dlmZ  ej0                  e      Z e       ddd	d
dd
dZdededefdZ G d de      Zy)    N)FutureThreadPoolExecutor)	AnyAsyncIteratorDictIteratorListOptionalTupleUnioncast)Document)
BaseLoader)get_user_agentzJtext/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8zen-US,en;q=0.5zhttps://www.google.com/1z
keep-alive)
User-AgentAcceptzAccept-LanguageRefererDNT
ConnectionzUpgrade-Insecure-Requestssoupurlreturnc                    d|i}| j                  d      x}r|j                         |d<   | j                  dddi      x}r|j                  dd      |d<   | j                  d	      x}r|j                  d
d      |d<   |S )z)Build metadata from BeautifulSoup output.sourcetitlemetanamedescription)attrscontentzNo description found.htmllangzNo language found.language)findget_textget)r   r   metadatar   r   r"   s         m/var/www/html/eduruby.in/venv/lib/python3.12/site-packages/langchain_community/document_loaders/async_html.py_build_metadatar*   &   s    #H		'""u"!NN,iiv}.EiFF{F"-//)=T"Uyy  t #xx0DEO    c                      e Zd ZdZ	 	 	 	 	 	 	 	 	 	 d&ddddeeee   f   dee   dee	   d	ee   d
e	dee   dede
deeeef      de	de	de	de	fdZdedefdZededdfd       Z	 d'dede
de
dedef
dZdedej(                  deeef   fdZdee   de	deeeef      fd Zdee   dee   fd!Zded"edefd#Zdee   fd$Zdee   fd%Zy)(AsyncHtmlLoaderzLoad `HTML` asynchronously.NTF)preserve_order	trust_envweb_pathheader_template
verify_sslproxiesautoset_encodingencodingdefault_parserrequests_per_secondrequests_kwargsraise_for_statusignore_load_errorsr.   r/   c                t   t        |t              r	|g| _        nt        |t              r|| _        |xs t        }|j                  d      s	 ddlm}  |       j                  |d<   t        j                         | _        t        |      | j                  _        || j                  _        |r%| j                  j$                  j'                  |       || _        || _        |	xs i | _        |
| _        || _        || _        || _        || _        || _        y# t        $ r t        j                  d       Y w xY w)zInitialize with a webpage path.r   r   )	UserAgentzxfake_useragent not found, using default user agent.To get a realistic header for requests, `pip install fake_useragent`.N)
isinstancestr	web_pathsr	   default_header_templater'   fake_useragentr<   randomImportErrorloggerinforequestsSessionsessiondictheadersverifyr3   updater7   r6   r8   r9   r4   r5   r:   r.   r/   )selfr0   r1   r2   r3   r4   r5   r6   r7   r8   r9   r:   r.   r/   rJ   r<   s                   r)   __init__zAsyncHtmlLoader.__init__5   s   , h$&ZDN$'%DN!<%<{{<(	4(1(:(:%  '')#G}(LL  ''0#6 ,.4" 0 0 "4,"/  4s   D D76D7r   r   c                     | j                   r(	  | j                  j                  |fi | j                  S  | j                  j                  |fi | j                  S # t        $ r(}t        j                  t        |             Y d }~y d }~ww xY wN)r:   rH   r'   r8   	Exceptionwarningswarnr>   )rM   r   es      r)   _fetch_valid_connection_docsz,AsyncHtmlLoader._fetch_valid_connection_docso   s{    ""'t||''Dt/C/CDD
  t||<t';';<<	  c!f%s   &A 	B%BBparserc                 T    g d}| |vr t        ddj                  |      z   dz         y)z#Check that parser is valid for bs4.)html.parserlxmlxmlzlxml-xmlhtml5libz`parser` must be one of z, .N)
ValueErrorjoin)rV   valid_parserss     r)   _check_parserzAsyncHtmlLoader._check_parsery   s:     O&*TYY}-EEK  'r+   retriescooldownbackoffc                   K   t        j                  | j                        4 d {   }t        |      D ]  }	 t	        d| j
                  j                  | j
                  j                  j                         d| j                  }| j
                  j                  sd|d<    |j                  |fi |4 d {   }	 |j                          d {   }	|	cd d d       d {    c cd d d       d {    S  d d d       d {    t-        d      7 7 a7 J# t        $ r t        j                  d|        d}	Y mw xY w7 c7 R# 1 d {  7  sw Y   nxY w.# t         j                   t"        f$ r}
||dz
  k(  r?| j$                  r3t        j'                  d| d	| d
       Y d }
~
 d d d       d {  7   y||dz
  k(  r t        j'                  d| d|dz    d| d|
 d	       t)        j*                  |||z  z         d {  7   Y d }
~
d }
~
ww xY w7 # 1 d {  7  sw Y   t-        d      xY ww)N)r/   )rJ   cookiesFsslzFailed to decode content from     zError fetching z after z	 retries.z with attempt /z: z. Retrying...zretry count exceeded )aiohttpClientSessionr/   rangerI   rH   rJ   re   get_dictr8   rK   r'   textUnicodeDecodeErrorrD   errorClientConnectionErrorTimeoutErrorr:   warningasynciosleepr]   )rM   r   ra   rb   rc   rH   ikwargsresponsero   rT   s              r)   _fetchzAsyncHtmlLoader._fetch   sa     ((4>>B 	C 	Cg7^ CC#' $ $ 4 4 $ 4 4 = = ?$ ..$F
  <<..(-u*w{{    	$ 	$ "&)1#8D  $	$ 	$ 	$	C 	C 	CC	C 	C@ /00A	C	$
 $91 &"LL+I#)OP#%D&	$	C	$ 	$ 	$ 	$  55|D CGaK'D,C,CWWIY'WX!/	C 	C 	C0 gk)-cU. 1ugQwir!MC &mmHwz,ABBBC)	C 	C 	C 	C@ /00s;  %IDIH(B E%;D
<E%?ED!D
D!EE%&E

'E%+H(-I9E:I?H( IH%IE%D!!#E	EE	E
E%IE EE E%#H(%H">0H.H(3I>G?IAHH
HH(H""H(%I(I.H1/I6I	semaphorec                    K   |4 d {    || j                  |       d {   fcd d d       d {    S 7 07 7 	# 1 d {  7  sw Y   y xY wwrP   )rz   )rM   r   r{   s      r)   _fetch_with_rate_limitz&AsyncHtmlLoader._fetch_with_rate_limit   sR       	/ 	/dkk#...	/ 	/ 	/.	/ 	/ 	/ 	/sL   A:AA <A A>AA A AA	AAurlsc           	     H  K   t        j                  | j                        }|D cg c]'  }t        j                  | j	                  ||            ) }}	 ddlm} |r ||ddd      D ]  }| d {     y |j                  |ddd      D ]  }| d {     y c c}w 7 77 # t        $ ri t        j                  d       |r't        j                  |  d {  7  D ]  }| 	 Y y t        j                  |      D ]  }| d {  7    Y y w xY ww)Nr   )tqdm_asynciozFetching pagesTrh   )descasciiminintervalz2For better logging of progress, `pip install tqdm`)ru   	Semaphorer7   create_taskr}   tqdm.asyncior   as_completedrC   rR   rS   gather)	rM   r~   r.   r{   r   tasksr   taskresults	            r)   _lazy_fetch_allzAsyncHtmlLoader._lazy_fetch_all   s4     %%d&>&>?	 
  ; ;C KL
 
	%1( 0! %D !%*$%
 )55 0! 6  %D !%*$%
 %
 % 	%MMNO$+NNE$::: !F L! $007 %D $**$%	%s   $D",B$D"B- 1B)2	B- ;D"<B- B+	B- #D")B- +B- -5D"C%#D2D"4DD
DD"DD"c                 p   K   | j                  |d      2 cg c3 d{   \  }}|7 
6 c}}S c c}}w w)z/Fetch all urls concurrently with rate limiting.TN)r   )rM   r~   _docs       r)   	fetch_allzAsyncHtmlLoader.fetch_all   s2     (,(<(<T4(HIIfaIIIIs$   60+)+0+0
6ro   c                     ddl m} |j                  d      rd}n| j                  }| j	                  |        |||      }t        ||      }t        ||      S )Nr   )BeautifulSoupz.xmlrZ   )page_contentr(   )bs4r   endswithr6   r`   r*   r   )rM   r   ro   r   rV   r   r(   s          r)   _to_documentzAsyncHtmlLoader._to_document   sT    %<<F((F6"T6*"4-TH==r+   c              #     K   	 t        j                          t        d      5 }|j                  t         j                  | j                  | j                              }|j                         }ddd       t        t        t        t                       D ]&  \  }}| j                  | j                  |   |       ( y# 1 sw Y   SxY w# t        $ r1 t        j                  | j                  | j                              }Y w xY ww)+Lazy load text from the url(s) in web_path.rh   )max_workersN)ru   get_running_loopr   submitrunr   r?   r   RuntimeError	enumerater   r	   r>   r   )rM   executorfutureresultsrw   ro   s         r)   	lazy_loadzAsyncHtmlLoader.lazy_load   s     	B$$& $2 *h,4OOKKNN4>>2- !--/* !d3i!9: 	=GAt##DNN1$5t<<	=* *  	Bkk$.."@AG	BsA   D	 C A
C .C 6A
D	 C	C 7DD	DD	c                   K   | j                  | j                  | j                        2 3 d{   \  }}| j                  ||       "7 6 yw)r   N)r   r?   r.   r   )rM   r   ro   s      r)   
alazy_loadzAsyncHtmlLoader.alazy_load   sS     #33NND// 
 	/ 	/)#t ##C..	/  
s%   'AAAAAAA)
NTNTNrX      NFF)   r   g      ?)__name__
__module____qualname____doc__r   r>   r	   r
   rI   boolintr   r   rN   rU   staticmethodr`   floatrz   ru   r   r   r}   r   r   r   r   r   r   r   r   rj   r+   r)   r-   r-   2   s   %
 +/%)"&!%"&+#$48!&#(8#  $8#T#Y'8# "$8# TN	8#
 $8# 8# 3-8# 8# !8# "$sCx.18# 8# !8# 8# 8#t= = = c d   OR#1#1!$#147#1FK#1	#1J//#*#4#4/	sCx/%I%/3%	uS#X	'%<JDI J$s) J
> 
>3 
>8 
>=8H- =(/-"9 /r+   r-   ) ru   loggingrR   concurrent.futuresr   r   typingr   r   r   r   r	   r
   r   r   r   rk   rF   langchain_core.documentsr   )langchain_community.document_loaders.baser   $langchain_community.utils.user_agentr   	getLoggerr   rD   r@   r>   rI   r*   r-   rj   r+   r)   <module>r      s       9
 
 
   - @ ?			8	$ !"'(!$	 	# 	C 	D 	B/j B/r+   