
    h1                         d dl Z d dlmZ d dlmZmZmZmZmZ d dl	m
Z
 d dlmZ  e j                  e      Zerd dlZ G d de      Zy)    N)Path)TYPE_CHECKINGIteratorOptionalSequenceUnion)Document)
BaseLoaderc                       e Zd ZdZ	 	 	 	 ddeeef   dee   deee	      dee
   dee
   f
dZdd
Zddd	efdZd	ee   fdZy)MWDumpLoadera  Load `MediaWiki` dump from an `XML` file.

    Example:
        .. code-block:: python

            from langchain_text_splitters import RecursiveCharacterTextSplitter
            from langchain_community.document_loaders import MWDumpLoader

            loader = MWDumpLoader(
                file_path="myWiki.xml",
                encoding="utf8"
            )
            docs = loader.load()
            text_splitter = RecursiveCharacterTextSplitter(
                chunk_size=1000, chunk_overlap=0
            )
            texts = text_splitter.split_documents(docs)


    :param file_path: XML local file path
    :type file_path: str
    :param encoding: Charset encoding, defaults to "utf8"
    :type encoding: str, optional
    :param namespaces: The namespace of pages you want to parse.
        See https://www.mediawiki.org/wiki/Help:Namespaces#Localisation
        for a list of all common namespaces
    :type namespaces: List[int],optional
    :param skip_redirects: TR=rue to skip pages that redirect to other pages,
        False to keep them. False by default
    :type skip_redirects: bool, optional
    :param stop_on_error: False to skip over pages that cause parsing errors,
        True to stop. True by default
    :type stop_on_error: bool, optional
    N	file_pathencoding
namespacesskip_redirectsstop_on_errorc                     t        |t              r|n
t        |      | _        || _        || _        || _        || _        y )N)
isinstancestrr   r   r   r   r   )selfr   r   r   r   r   s         p/var/www/html/eduruby.in/venv/lib/python3.12/site-packages/langchain_community/document_loaders/mediawikidump.py__init__zMWDumpLoader.__init__3   s8     '1C&@c)n $,*    returnc                     	 dd l }|j                  j                  t	        | j
                  | j                              S # t        $ r}t        d      |d }~ww xY w)Nr   zBUnable to import 'mwxml'. Please install with `pip install mwxml`.)r   )mwxmlImportErrorDump	from_fileopenr   r   )r   r   es      r   _load_dump_filezMWDumpLoader._load_dump_fileB   sU    	 zz##D$--$PQQ  	T	s   A   	A	AApagez
mwxml.Pagec                     	 ddl }|D ]M  }|j                  |j                        }|j	                  ddd      }d|j
                  i}t        ||      c S  y# t        $ r}t        d      |d}~ww xY w)	zParse a single page.r   NzXUnable to import 'mwparserfromhell'. Please install with `pip install mwparserfromhell`.TF)	normalizecollapsekeep_template_paramssource)page_contentmetadata)mwparserfromhellr   parsetext
strip_codetitler	   )r   r"   r*   r    revisioncoder,   r)   s           r   _load_single_page_from_dumpz(MWDumpLoader._load_single_page_from_dumpL   s    	#  	BH#))(--8D??E # D !$**-HAA	B  	3 	s   A 	A3"A..A3c              #     K   | j                         }|j                  D ]T  }| j                  r|j                  r| j                  r|j
                  | j                  vrA	 | j                  |       V y# t        $ r<}t        j                  dj                  |             | j                  r|Y d}~d}~ww xY ww)zLazy load from a file path.zParsing error: {}N)r!   pagesr   redirectr   	namespacer1   	Exceptionloggererrorformatr   )r   dumpr"   r    s       r   	lazy_loadzMWDumpLoader.lazy_load]   s     
 ##%JJ 	D""t}}4>>#H66t<<	  077:;%%Gs0   AB?!A74B?7	B< 2B72B?7B<<B?)utf8NFT)r   z
mwxml.Dump)__name__
__module____qualname____doc__r   r   r   r   r   intboolr   r!   r	   r1   r   r;    r   r   r   r      s    !L #).2).(,+d#+ 3-+ Xc]+	+
 !+  ~+RB B B"	(	r   r   )loggingpathlibr   typingr   r   r   r   r   langchain_core.documentsr	   )langchain_community.document_loaders.baser
   	getLoggerr=   r7   r   r   rC   r   r   <module>rJ      s=      E E - @			8	$a: ar   