
    h=                         d dl Z d dlmZmZmZmZmZmZmZ d dl	m
Z
 d dlmZ d dlmZ d dlmZ d dlmZ  G d d	e      Zy)
    N)AnyAsyncIteratorIteratorListOptionalSetUnion)urlparse)BeautifulSoup)Document)
BaseLoader)WebBaseLoaderc                      e Zd ZdZ	 	 	 	 	 d!ddddededee   deded	ed
ee   deee      fdZdedefdZ		 d"de
e   dededefdZdeee
e   f   defdZdedefdZdede
e   fdZ	 d#dedee   dee   de
e   fdZ	 d#dededee   dee   de
e   f
dZdee   fdZdee   fdZ	 d#dedee   dee   fdZdede
e   fd Zy)$GitbookLoadera   Load `GitBook` data.

    1. load from either a single page, or
    2. load all (relative) paths in the sitemap, handling nested sitemap indexes.

    When `load_all_paths=True`, the loader parses XML sitemaps and requires the
    `lxml` package to be installed (`pip install lxml`).
    N)sitemap_urlallowed_domainsweb_pageload_all_pathsbase_urlcontent_selectorcontinue_on_failureshow_progressr   r   c                   |xs || _         | j                   j                  d      r| j                   dd | _         || _        || _        || _        || _        || _        || _        | j                  t        |      j                  }	|	r|	h| _        |r|xs | j                    d| _
        n|| _
        | j                  | j                        s%t        d| j                   d| j                         y)aT  Initialize with web page and whether to load all paths.

        Args:
            web_page: The web page to load or the starting point from where
                relative paths are discovered.
            load_all_paths: If set to True, all relative paths in the navbar
                are loaded instead of only `web_page`. Requires `lxml` package.
            base_url: If `load_all_paths` is True, the relative paths are
                appended to this base url. Defaults to `web_page`.
            content_selector: The CSS selector for the content to load.
                Defaults to "main".
            continue_on_failure: whether to continue loading the sitemap if an error
                occurs loading a url, emitting a warning instead of raising an
                exception. Setting this to True makes the loader more robust, but also
                may result in missing data. Default: False
            show_progress: whether to show a progress bar while loading. Default: True
            sitemap_url: Custom sitemap URL to use when load_all_paths is True.
                Defaults to "{base_url}/sitemap.xml".
            allowed_domains: Optional set of allowed domains to fetch from.
                If None (default), the loader will restrict crawling to the domain
                of the `web_page` URL to prevent potential SSRF vulnerabilities.
                Provide an explicit set (e.g., {"example.com", "docs.example.com"})
                to allow crawling across multiple domains. Use with caution in
                server environments where users might control the input URLs.
        /Nz/sitemap.xmlz
Domain in z% is not in the allowed domains list: )r   endswithr   r   r   r   r   r   r
   netloc	start_url_is_url_allowed
ValueError)
selfr   r   r   r   r   r   r   r   initial_domains
             j/var/www/html/eduruby.in/venv/lib/python3.12/site-packages/langchain_community/document_loaders/gitbook.py__init__zGitbookLoader.__init__   s    J !,H==!!#& MM#2.DM , 0#6 *. '%h/66N(6'7$ (Jt}}o\,JDN%DN ##DNN3T^^,,Q''(*  4    urlreturnc                     | j                   y	 t        |      }|j                  dvry|j                  sy|j                  | j                   v S # t        $ r Y yw xY w)z0Check if a URL has an allowed scheme and domain.F)httphttps)r   r
   schemer   	Exception)r!   r&   parseds      r#   r   zGitbookLoader._is_url_allowedY   sf    
 '	c]F }}$55 ====D$8$888 		s   A A A 	AAurl_listurl_typec                     | j                  |      r|j                  |       yt        j                  d| d|        y)aB  Safely add a URL to a list if it's from an allowed domain.

        Args:
            url_list: The list to add the URL to
            url: The URL to add
            url_type: Type of URL for warning message (e.g., "sitemap", "content")

        Returns:
            bool: True if URL was added, False if skipped
        TzSkipping disallowed z URL: F)r   appendwarningswarn)r!   r.   r&   r/   s       r#   _safe_add_urlzGitbookLoader._safe_add_urlp   s>     $OOC MM0
&FGr%   url_or_urlsc                 F    t        || j                  | j                        S )zCreate a new WebBaseLoader instance for the given URL(s).

        This ensures each operation gets its own isolated WebBaseLoader.
        )web_pathr   r   )r   r   r   )r!   r5   s     r#   _create_web_loaderz GitbookLoader._create_web_loader   s&    
   $ 8 8,,
 	
r%   soupc                 (    |j                  d      duS )z+Check if the soup contains a sitemap index.sitemapindexN)find)r!   r9   s     r#   _is_sitemap_indexzGitbookLoader._is_sitemap_index   s    yy(44r%   c                     |j                  d      }g }|D ]@  }|j                  d      }|s|j                  s$| j                  ||j                  d       B |S )z*Extract sitemap URLs from a sitemap index.sitemaploc)find_allr<   textr4   )r!   r9   sitemap_tagsurlsr?   r@   s         r#   _extract_sitemap_urlsz#GitbookLoader._extract_sitemap_urls   sX    }}Y/# 	>G,,u%Csxx""49=	> r%   processed_urls
web_loaderc                 6   || j                  | j                        }| j                  |      r| j                  |      }g }|D ]  }||v rt	        j
                  d|         |j                  |       	 |j                  }|g|_        |j                  d      }||_        | j                  |||      }	|j                  |	        |S | j                  |      S # t        $ r3}
| j                  rt	        j
                  d| d|
        n Y d}
~
d}
~
ww xY w)aO  Process a sitemap, handling both direct content URLs and sitemap indexes.

        Args:
            soup: The BeautifulSoup object of the sitemap
            processed_urls: Set of already processed URLs to avoid cycles
            web_loader: WebBaseLoader instance to reuse for all requests,
                created if None
        Nz(Skipping already processed sitemap URL: lxml-xmlparserError processing sitemap : )r8   r   r=   rE   r2   r3   add	web_pathsscrape_process_sitemapextendr,   r   
_get_paths)r!   r9   rF   rG   sitemap_urlsall_content_urlsr   original_web_pathssitemap_soupcontent_urlses              r#   rQ   zGitbookLoader._process_sitemap   s@    00@J !!$'55d;L!+ .0MMB;-P "";/)3)=)=&,7=J( $.#4#4J#4#GL ,>J( $(#8#8$nj$L %++L91> $# ??4(( ! // (A+bQRPS&TU Vs   6AC	D%)DDc                   K   || j                  | j                        }| j                  |      r| j                  |      }g }|D cg c]	  }||vs| }}|sg S |j                  }	||_        |j                  |d       d{   }
|	|_        t        ||
      D ]D  \  }}|j                  |       	 | j                  ||||       d{   }|j                  |       F |S | j                  |      S c c}w 7 v7 1# t        $ r3}| j                  rt        j                  d| d|        n Y d}~d}~ww xY ww)a^  Async version of _process_sitemap.

        Args:
            soup: The BeautifulSoup object of the sitemap
            base_url: The base URL for relative paths
            processed_urls: Set of already processed URLs to avoid cycles
            web_loader: WebBaseLoader instance to reuse for all requests,
                created if None
        NrI   rJ   rL   rM   )r8   r   r=   rE   rO   ascrape_allziprN   _aprocess_sitemaprR   r,   r   r2   r3   rS   )r!   r9   r   rF   rG   rT   rU   r&   new_urlsrV   soupsr   rW   rX   rY   s                  r#   r]   zGitbookLoader._aprocess_sitemap   sm    " 00@J !!$'55d;L! (4Qs.7PQHQ	 ",!5!5#+J  %00*0MME $6J -05-A )\"";/
)-)?)?$h
* $L %++L9 $# ??4((C R N$ ! // (A+bQRPS&TU Vs`   AD?	C7C7/D?C<.D?5D C>D "D?>D  	D<	)D72D?7D<<D?c              #     K   | j                   sO| j                  | j                        }|j                         }| j	                  || j                        }|r| yy| j                  | j
                        }|j                  d      }t               }| j                  ||      }|s.| j                  r"t        j                  d| j
                          g }|D ]  }| j                  ||d        |sy| j                  |      }	|	j                  |      }
t        |
|      D ]  \  }}| j	                  ||      }|s|   yw)zDFetch text from one single GitBook page or recursively from sitemap.rI   rJ   $No content URLs found in sitemap at contentN)r   r8   r   rP   _get_documentr   setrQ   r   r2   r3   r4   
scrape_allr\   )r!   temp_loaderr9   doc	soup_inforF   relative_pathsrD   r&   content_loader
soup_infoss              r#   	lazy_loadzGitbookLoader.lazy_load  sD    ""11$--@K%%'D$$T4==9C	  11$..AK#****=I (+uN!229nMN!d&8&8 DT^^DTUV !D% 9""4i89  "44T:N (2248J"%j$"7 	3((C8Is   D<E?Ec                @  K   | j                   si| j                  | j                        }|j                  | j                  g       d{   }|d   }| j	                  || j                        }|r| yy| j                  | j
                        }|j                  | j
                  gd       d{   }|d   }t               }| j                  || j                  |       d{   }|s.| j                  r"t        j                  d| j
                          g }|D ]  }| j                  ||d        |sy| j                  |      }	|	j                  |       d{   }
t        |
|      D ]  \  }}| j	                  ||      }|| ! y7 L7 7 7 :w)z/Asynchronously fetch text from GitBook page(s).Nr   rI   rJ   ra   rb   )r   r8   r   r[   rc   r   rd   r]   r   r   r2   r3   r4   r\   )r!   rf   r_   rh   rg   rF   ri   rD   r&   rj   rk   	maybe_docs               r#   
alazy_loadzGitbookLoader.alazy_load?  s    ""11$--@K%114==/BBEaI$$Y>C	  11$..AK%114>>2B:1VVEaI (+uN#'#9#94==.$ N "d&8&8 DT^^DTUV !D% 9""4i89  "44T:N  .99$??J"%j$"7 $	3 ..y#>	(#O$I C W
( @sP   AF	F
A*F4F53F(F)A8F!F"*F	FFFF
custom_urlc                     |j                  | j                        }|sy|j                  d      j                         }|j                  d      }|r|j                  nd}|xs | j
                  |d}t        ||      S )z,Fetch content from page and return Document.N
)	separatorh1 )sourcetitle)page_contentmetadata)r<   r   get_textstriprB   r   r   )r!   r9   rp   page_content_rawrb   title_if_existsrw   ry   s           r#   rc   zGitbookLoader._get_documentm  sz      99T%:%:;"++d+;AAC*//5(7$$R(9DMMEJWx@@r%   c                     g }|j                  d      D ]*  }|j                  s|j                  |j                         , |S )zFetch all URLs in the sitemap.r@   )rA   rB   r1   )r!   r9   rD   r@   s       r#   rS   zGitbookLoader._get_pathsz  s?    ==' 	&Cxx CHH%		&
 r%   )FNmainFT)URL)N)__name__
__module____qualname____doc__strboolr   r   r$   r   r   r4   r	   r   r8   r   r=   rE   rQ   r]   r   r   rl   r   ro   r   rc   rS    r%   r#   r   r      s     %"& &$)"A &*.2AA A 3-	A
 A "A A c]A "#c(+AF3 4 0 >CS	(+7:	(	
eCcN.C 	
 	
5m 5 5- DI  /3	9)9) C9) ]+	9)
 
c9)@ /3;);) ;) C	;)
 ]+;) 
c;)z(8H- (T,$-"9 ,$^ 6:AA%-c]A	(	As tCy r%   r   )r2   typingr   r   r   r   r   r   r	   urllib.parser
   bs4r   langchain_core.documentsr   )langchain_community.document_loaders.baser   -langchain_community.document_loaders.web_baser   r   r   r%   r#   <module>r      s.     K K K !  - @ GvJ vr%   