a
    bg                     @   sb   d dl Z d dlmZ d dlmZmZmZmZ d dlm	Z	 d dl
mZ e eZG dd deZdS )    N)Path)IteratorOptionalSequenceUnion)Document)
BaseLoaderc                   @   sl   e Zd ZdZdeeef ee eee	  ee
 ee
 dddZd	d
 ZedddZee dddZdS )MWDumpLoadera  Load `MediaWiki` dump from an `XML` file.

    Example:
        .. code-block:: python

            from langchain_text_splitters import RecursiveCharacterTextSplitter
            from langchain_community.document_loaders import MWDumpLoader

            loader = MWDumpLoader(
                file_path="myWiki.xml",
                encoding="utf8"
            )
            docs = loader.load()
            text_splitter = RecursiveCharacterTextSplitter(
                chunk_size=1000, chunk_overlap=0
            )
            texts = text_splitter.split_documents(docs)


    :param file_path: XML local file path
    :type file_path: str
    :param encoding: Charset encoding, defaults to "utf8"
    :type encoding: str, optional
    :param namespaces: The namespace of pages you want to parse.
        See https://www.mediawiki.org/wiki/Help:Namespaces#Localisation
        for a list of all common namespaces
    :type namespaces: List[int],optional
    :param skip_redirects: TR=rue to skip pages that redirect to other pages,
        False to keep them. False by default
    :type skip_redirects: bool, optional
    :param stop_on_error: False to skip over pages that cause parsing errors,
        True to stop. True by default
    :type stop_on_error: bool, optional
    utf8NFT)	file_pathencoding
namespacesskip_redirectsstop_on_errorc                 C   s4   t |tr|nt|| _|| _|| _|| _|| _d S )N)
isinstancestrr   r   r   r   r   )selfr   r   r   r   r    r   /var/www/html/cobodadashboardai.evdpl.com/venv/lib/python3.9/site-packages/langchain_community/document_loaders/mediawikidump.py__init__0   s
    zMWDumpLoader.__init__c              
   C   sT   zdd l }W n. ty: } ztd|W Y d }~n
d }~0 0 |jt| j| jdS )Nr   zBUnable to import 'mwxml'. Please install with `pip install mwxml`.)r   )mwxmlImportErrorZDump	from_fileopenr   r   )r   r   er   r   r   _load_dump_file?   s    zMWDumpLoader._load_dump_file)returnc              
   C   s~   zddl }W n. ty: } ztd|W Y d}~n
d}~0 0 |D ]8}||j}|jdddd}d|ji}t||d  S dS )	zParse a single page.r   NzXUnable to import 'mwparserfromhell'. Please install with `pip install mwparserfromhell`.TF)	normalizeZcollapseZkeep_template_paramssource)Zpage_contentmetadata)mwparserfromhellr   parsetextZ
strip_codetitler   )r   pager    r   revisioncoder"   r   r   r   r   _load_single_page_from_dumpI   s    
z(MWDumpLoader._load_single_page_from_dumpc                 c   s   |   }|jD ]}| jr |jr q| jr4|j| jvr4qz| |V  W q ty } z4t	d
| | jrp|nW Y d}~qW Y d}~qd}~0 0 qdS )zLazy load from a file path.zParsing error: {}N)r   Zpagesr   redirectr   	namespacer'   	Exceptionloggererrorformatr   )r   dumpr$   r   r   r   r   	lazy_loadZ   s    
zMWDumpLoader.lazy_load)r
   NFT)__name__
__module____qualname____doc__r   r   r   r   r   intboolr   r   r   r'   r   r/   r   r   r   r   r	      s    &    


r	   )loggingpathlibr   typingr   r   r   r   Zlangchain_core.documentsr   Z)langchain_community.document_loaders.baser   	getLoggerr0   r+   r	   r   r   r   r   <module>   s   
