a
    bgB                     @   sT   d dl mZmZmZmZ d dlmZmZ d dlm	Z	 d dl
mZ G dd deZdS )    )AnyIteratorListOptional)urljoinurlparse)Document)WebBaseLoaderc                       sx   e Zd ZdZdeeee eeed fddZee	 d	d
dZ
deee ee	 dddZeee dddZ  ZS )GitbookLoaderztLoad `GitBook` data.

    1. load from either a single page, or
    2. load all (relative) paths in the navbar.
    FNmainT)web_pageload_all_pathsbase_urlcontent_selectorcontinue_on_failureshow_progressc                    sZ   |p|| _ | j dr&| j dd | _ |r6| j  d}t j|f||d || _|| _dS )a  Initialize with web page and whether to load all paths.

        Args:
            web_page: The web page to load or the starting point from where
                relative paths are discovered.
            load_all_paths: If set to True, all relative paths in the navbar
                are loaded instead of only `web_page`.
            base_url: If `load_all_paths` is True, the relative paths are
                appended to this base url. Defaults to `web_page`.
            content_selector: The CSS selector for the content to load.
                Defaults to "main".
            continue_on_failure: whether to continue loading the sitemap if an error
                occurs loading a url, emitting a warning instead of raising an
                exception. Setting this to True makes the loader more robust, but also
                may result in missing data. Default: False
            show_progress: whether to show a progress bar while loading. Default: True
        /Nz/sitemap.xml)Z	web_pathsr   r   )r   endswithsuper__init__r   r   )selfr   r   r   r   r   r   	__class__ z/var/www/html/cobodadashboardai.evdpl.com/venv/lib/python3.9/site-packages/langchain_community/document_loaders/gitbook.pyr      s    
zGitbookLoader.__init__)returnc                 #   s    j r`  } |} fdd|D } |}t||D ]\}} ||}|r>|V  q>n   } | j}|r|V  dS )z(Fetch text from one single GitBook page.c                    s   g | ]}t  j|qS r   )r   r   ).0pathr   r   r   
<listcomp>=       z+GitbookLoader.lazy_load.<locals>.<listcomp>N)r   Zscrape
_get_pathsZ
scrape_allzip_get_documentweb_path)r   Z	soup_infoZrelative_pathsurlsZ
soup_infosurldocr   r   r   	lazy_load8   s    


zGitbookLoader.lazy_load)soup
custom_urlr   c                 C   sX   | | j}|sdS |jdd }| d}|r8|jnd}|pD| j|d}t||dS )z,Fetch content from page and return Document.N
)	separatorZh1 )sourcetitle)Zpage_contentmetadata)findr   Zget_textstriptextr%   r   )r   r*   r+   Zpage_content_rawcontentZtitle_if_existsr0   r1   r   r   r   r$   J   s    
zGitbookLoader._get_document)r*   r   c                 C   s   dd | dD S )z'Fetch all relative paths in the navbar.c                 S   s   g | ]}t |jjqS r   )r   r4   r   )r   locr   r   r   r    Y   r!   z,GitbookLoader._get_paths.<locals>.<listcomp>r6   )Zfind_all)r   r*   r   r   r   r"   W   s    zGitbookLoader._get_paths)FNr   FT)N)__name__
__module____qualname____doc__strboolr   r   r   r   r)   r   r$   r   r"   __classcell__r   r   r   r   r
   	   s*   	     ( r
   N)typingr   r   r   r   urllib.parser   r   Zlangchain_core.documentsr   Z-langchain_community.document_loaders.web_baser	   r
   r   r   r   r   <module>   s   