
    h&                        d Z ddlZddlZddlmZmZ ddlmZmZm	Z	m
Z
mZmZmZ ddlmZ ddlmZ erddlmZ ddlmZ dd	lmZ dd
lmZmZmZ  ej4                  e      Z G d de      Z G d de      Z G d de      Zy)zQLoader that uses Playwright to load a page, then uses unstructured to parse html.    N)ABCabstractmethod)TYPE_CHECKINGAsyncIteratorDictIteratorListOptionalUnion)Document)
BaseLoader)Browser)Page)Response)r   r   r   c            	       T    e Zd ZdZedddddddefd	       Zedd
dddddefd       Zy)PlaywrightEvaluatorzAbstract base class for all evaluators.

    Each evaluator should take a page, a browser instance, and a response
    object, process the page as necessary, and return the resulting text.
    pager   browserr   responser   returnc                      y)a  Synchronously process the page and return the resulting text.

        Args:
            page: The page to process.
            browser: The browser instance.
            response: The response from page.goto().

        Returns:
            text: The text content of the page.
        N selfr   r   r   s       q/var/www/html/eduruby.in/venv/lib/python3.12/site-packages/langchain_community/document_loaders/url_playwright.pyevaluatezPlaywrightEvaluator.evaluate   s     	    	AsyncPageAsyncBrowserAsyncResponsec                    K   yw)a  Asynchronously process the page and return the resulting text.

        Args:
            page: The page to process.
            browser: The browser instance.
            response: The response from page.goto().

        Returns:
            text: The text content of the page.
        Nr   r   s       r   evaluate_asyncz"PlaywrightEvaluator.evaluate_async+   s      	s   N)__name__
__module____qualname____doc__r   strr   r"   r   r   r   r   r      sl     V i : RU   *8DS	 r   r   c                   Z    e Zd ZdZddeee      fdZddddd	d
defdZddddd	ddefdZ	y)UnstructuredHtmlEvaluatorz@Evaluate the page HTML content using the `unstructured` library.Nremove_selectorsc                 L    	 ddl }|| _        y# t        $ r t        d      w xY w)z%Initialize UnstructuredHtmlEvaluator.r   NzQunstructured package not found, please install it with `pip install unstructured`)unstructuredImportErrorr*   )r   r*   r,   s      r   __init__z"UnstructuredHtmlEvaluator.__init__?   s7    	 !1  	- 	s    #r   r   r   r   r   r   r   c                 T   ddl m} | j                  xs g D ]J  }|j                  |      j	                         }|D ]$  }|j                         s|j                  d       & L |j                         } ||      }dj                  |D 	cg c]  }	t        |	       c}	      S c c}	w )z3Synchronously process the HTML content of the page.r   partition_htmlelement => element.remove()text


unstructured.partition.htmlr1   r*   locatorall
is_visibler   contentjoinr'   
r   r   r   r   r1   selectorelementselementpage_sourceels
             r   r   z"UnstructuredHtmlEvaluator.evaluateK   s    >--3 	DH||H-113H# D%%'$$%BCD	D lln!{3{{h7CG7887s   B%r   r   r    c                   K   ddl m} | j                  xs g D ]b  }|j                  |      j	                          d{   }|D ]4  }|j                          d{   s|j                  d       d{    6 d |j                          d{   } ||      }dj                  |D 	cg c]  }	t        |	       c}	      S 7 7 m7 U7 ;c c}	w w)z4Asynchronously process the HTML content of the page.r   r0   Nr2   r3   r5   r6   r=   s
             r   r"   z(UnstructuredHtmlEvaluator.evaluate_asyncY   s      	?--3 	JH!\\(37799H# J ++---!**+HIIIJ	J !LLN*!{3{{h7CG788 :-I*7s]   <CCCC	C!C5C6CCC.C C	CCCC)N)
r#   r$   r%   r&   r
   r	   r'   r.   r   r"   r   r   r   r)   r)   <   sa    J
1$s))< 
19V 9i 9: 9RU 999*89DS9	9r   r)   c                       e Zd ZdZ	 	 	 	 	 	 ddee   dededeee      dee   dee	eef      d	ee
eej                  e   f      fd
Zdee   fdZdee   fdZdee   fdZy)PlaywrightURLLoaderad  Load `HTML` pages with `Playwright` and parse with `Unstructured`.

    This is useful for loading pages that require javascript to render.

    Attributes:
        urls (List[str]): List of URLs to load.
        continue_on_failure (bool): If True, continue loading other URLs on failure.
        headless (bool): If True, the browser will run in headless mode.
        proxy (Optional[Dict[str, str]]): If set, the browser will access URLs
            through the specified proxy.
        browser_session (Optional[Union[str, os.PathLike[str]]]): Path to a file with
            browser session data that can be used to restore the browser session.

    Example:
        .. code-block:: python

            from langchain_community.document_loaders import PlaywrightURLLoader

            urls = ["https://api.ipify.org/?format=json",]
            proxy={
                "server": "https://xx.xx.xx:15818", # https://<host>:<port>
                "username": "username",
                "password": "password"
            }
            loader = PlaywrightURLLoader(urls, proxy=proxy)
            data = loader.load()
    Nurlscontinue_on_failureheadlessr*   	evaluatorproxybrowser_sessionc                     	 ddl }|| _        || _        || _        || _        || _        |r|rt        d      |xs t        |      | _	        y# t        $ r t        d      w xY w)z%Load a list of URLs using Playwright.r   NzMplaywright package not found, please install it with `pip install playwright`z:`remove_selectors` and `evaluator` cannot be both not None)

playwrightr-   rF   rG   rH   rJ   rK   
ValueErrorr)   rI   )	r   rF   rG   rH   r*   rI   rJ   rK   rM   s	            r   r.   zPlaywrightURLLoader.__init__   s~    	 	#6  
.	L 
 #Q&?@P&Q%  	+ 	s   A A"r   c           	   #     K   ddl m}  |       5 }|j                  j                  | j                  | j
                        }d}| j                  rht        j                  j                  | j                        r|j                  | j                        }n"t        j                  d| j                          ||j                         }| j                  D ]  }	 |j                         }|j                  |      }|t!        d|       |j#                  d       | j$                  j'                  |||      }|j)                          d	|i}	t+        ||	
        |j)                          ddd       y# t,        $ r4}
| j.                  rt        j1                  d| d|
        n|
Y d}
~
d}
~
ww xY w# 1 sw Y   yxY ww)zLoad the specified URLs using Playwright and create Document instances.

        Returns:
            A list of Document instances with loaded content.
        r   )sync_playwrightrH   rJ   Nstorage_stateSession file not found: "page.goto() returned None for url loadsourcepage_contentmetadataError fetching or processing , exception: )playwright.sync_apirP   chromiumlaunchrH   rJ   rK   ospathexistsnew_contextloggerwarningrF   new_pagegotorN   wait_for_load_staterI   r   closer   	ExceptionrG   error)r   rP   pr   contexturlr   r   r4   rZ   es              r   	lazy_loadzPlaywrightURLLoader.lazy_load   s     	8 !	!jj''djj'QGG##77>>$"6"67%11@T@T1UGNN%=d>R>R=S#TU!--/yy   "++-D#yy~H'(+McU)STT,,V4>>224(KDJJL (#H"xHH ( MMOC!	 !	4 !  //;C5aSQ  	 5!	 !	sH   GC	F7BE7F7.	G7	F4 *F/*F7/F44F77G <Gc                 `   K   | j                         2 cg c3 d{   }|7 6 c}S c c}w w)Load the specified URLs with Playwright and create Documents asynchronously.
        Use this function when in a jupyter notebook environment.

        Returns:
            A list of Document instances with loaded content.
        N)
alazy_load)r   docs     r   aloadzPlaywrightURLLoader.aload   s*      &*__%677c7777s$   .)%#
%)%).c           	       K   ddl m}  |       4 d{   }|j                  j                  | j                  | j
                         d{   }d}| j                  rpt        j                  j                  | j                        r%|j                  | j                         d{   }n"t        j                  d| j                          ||j                          d{   }| j                  D ]  }	 |j                          d{   }|j                  |       d{   }|t!        d|       |j#                  d       d{    | j$                  j'                  |||       d{   }|j)                          d{    d	|i}	t+        ||	
        |j)                          d{    ddd      d{    y7 7 7 .7 7 7 7 7 l7 V# t,        $ r5}
| j.                  rt        j1                  d| d|
        n|
Y d}
~
(d}
~
ww xY w7 k7 ]# 1 d{  7  sw Y   yxY ww)rr   r   )async_playwrightNrQ   rR   rT   rU   rV   rW   rX   r[   r\   )playwright.async_apirw   r^   r_   rH   rJ   rK   r`   ra   rb   rc   rd   re   rF   rf   rg   rN   rh   rI   r"   ri   r   rj   rG   rk   )r   rw   rl   r   rm   rn   r   r   r4   rZ   ro   s              r   rs   zPlaywrightURLLoader.alazy_load   s	     	:#% #	" #	"JJ--t}}DJJ-WWGG##77>>$"6"67$+$7$7&*&:&: %8 % G NN%=d>R>R=S#TU ' 3 3 55yy   !(!1!1!33D%)YYs^3H'(+McU)STT226:::!%!>!>tWh!WWD**,&& (#H"xHH ( --/!!G#	" #	" #	"W
 6 43 ;W& !  //;C5aSQ  	  "G#	" #	" #	" #	"s  H?GH?5H*GAH*(G)<H*%G&H*:G%GG%&G'(G%G$G%4G!5G%G#G%%H*:H&;H*?H?
H(H?H*H*H*G%G%G%!G%#G%%	H#.*HH*H##H*(H?*H<0H31H<8H?)TTNNNN)r#   r$   r%   r&   r	   r'   boolr
   r   r   r   r`   PathLiker.   r   r   rp   ru   r   rs   r   r   r   rE   rE   j   s    > %)0437*.BFR3iR "R 	R
 #49-R /0R S#X'R "%R[[-=(=">?RB)8H- )V8T(^ 8,"-"9 ,"r   rE   ) r&   loggingr`   abcr   r   typingr   r   r   r   r	   r
   r   langchain_core.documentsr   )langchain_community.document_loaders.baser   rx   r   r   r   r   r   r    r]   	getLoggerr#   rd   r   r)   rE   r   r   r   <module>r      so    W  	 # V V V - @<6>;; 
		8	$## #L+9 3 +9\^"* ^"r   