a
    bg)<                     @   s   d Z ddlZddlZddlZddlmZmZmZmZm	Z	m
Z
mZmZ ddlZddlZddlmZ ddlmZ ddlmZ ddlmZ eeZe dd	d
ddddZeeedddZG dd deZdS )zWeb base loader class.    N)AnyAsyncIteratorDictIteratorListOptionalSequenceUnion)
deprecated)Document)
BaseLoader)get_user_agentzJtext/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8zen-US,en;q=0.5zhttps://www.google.com/1z
keep-alive)
User-AgentAcceptzAccept-LanguageZRefererZDNT
ConnectionzUpgrade-Insecure-Requests)soupurlreturnc                 C   sj   d|i}|  d }r"| |d< | j dddid }rH|dd|d< |  d	 }rf|d
d|d< |S )z)Build metadata from BeautifulSoup output.sourcetitlemetanamedescription)attrscontentzNo description found.htmllangzNo language found.language)findget_textget)r   r   metadatar   r   r    r#   {/var/www/html/cobodadashboardai.evdpl.com/venv/lib/python3.9/site-packages/langchain_community/document_loaders/web_base.py_build_metadata   s    r%   c                   @   s  e Zd ZdZd5ddd	eeee f ee e	ee e	e	ee ee e
eeeeef  e	eeeef  eeeef  ee	e	dd
ddZeedddZd6ee
e
eedddZeejedddZee edddZeeddddZd7eee eedf ee ddd Zd8ee eedf ee d!d"d#Zd9ee eedf ee d!d$d%Zd:eeedf ee ed&d'd(Zd;eedf edd)d*Zee dd+d,Z e!e dd-d.Z"e#d/d0d1d2ee dd3d4Z$dS )<WebBaseLoaderaQ  
    WebBaseLoader document loader integration

    Setup:
        Install ``langchain_community``.

        .. code-block:: bash

            pip install -U langchain_community

    Instantiate:
        .. code-block:: python

            from langchain_community.document_loaders import WebBaseLoader

            loader = WebBaseLoader(
                web_path = "https://www.espn.com/"
                # header_template = None,
                # verify_ssl = True,
                # proxies = None,
                # continue_on_failure = False,
                # autoset_encoding = True,
                # encoding = None,
                # web_paths = (),
                # requests_per_second = 2,
                # default_parser = "html.parser",
                # requests_kwargs = None,
                # raise_for_status = False,
                # bs_get_text_kwargs = None,
                # bs_kwargs = None,
                # session = None,
                # show_progress = True,
                # trust_env = False,
            )

    Lazy load:
        .. code-block:: python

            docs = []
            for doc in loader.lazy_load():
                docs.append(doc)
            print(docs[0].page_content[:100])
            print(docs[0].metadata)

        .. code-block:: python

            ESPN - Serving Sports Fans. Anytime. Anywhere.

            {'source': 'https://www.espn.com/', 'title': 'ESPN - Serving Sports Fans. Anytime. Anywhere.', 'description': 'Visit ESPN for live scores, highlights and sports news. Stream exclusive games on ESPN+ and play fantasy sports.', 'language': 'en'}


    Async load:
        .. code-block:: python

            docs = []
            async for doc in loader.alazy_load():
                docs.append(doc)
            print(docs[0].page_content[:100])
            print(docs[0].metadata)

        .. code-block:: python

            ESPN - Serving Sports Fans. Anytime. Anywhere.

            {'source': 'https://www.espn.com/', 'title': 'ESPN - Serving Sports Fans. Anytime. Anywhere.', 'description': 'Visit ESPN for live scores, highlights and sports news. Stream exclusive games on ESPN+ and play fantasy sports.', 'language': 'en'}

    .. versionchanged:: 0.3.14

        Deprecated ``aload`` (which was not async) and implemented a native async
        ``alazy_load``. Expand below for more details.

        .. dropdown:: How to update ``aload``

            Instead of using ``aload``, you can use ``load`` for synchronous loading or
            ``alazy_load`` for asynchronous lazy loading.

            Example using ``load`` (synchronous):

            .. code-block:: python

                docs: List[Document] = loader.load()

            Example using ``alazy_load`` (asynchronous):

            .. code-block:: python

                docs: List[Document] = []
                async for doc in loader.alazy_load():
                    docs.append(doc)

            This is in preparation for accommodating an asynchronous ``aload`` in the
            future:

            .. code-block:: python

                docs: List[Document] = await loader.aload()

     NTFr#      html.parser)show_progress	trust_env)web_pathheader_template
verify_sslproxiescontinue_on_failureautoset_encodingencoding	web_pathsrequests_per_seconddefault_parserrequests_kwargsraise_for_statusbs_get_text_kwargs	bs_kwargssessionr*   r+   r   c                C   sJ  |r|rt d|r t|| _nHt|tr4|g| _n4t|trJt|| _ntdt| dt| d|	| _|
| _	|pzi | _
|| _|| _|pi | _|pi | _|r|| _nt }|pt }|dszddlm} | j|d< W n ty   td Y n0 t||_||_|r(|j| || _|| _ || _!|| _"|| _#d	S )
a  Initialize loader.

        Args:
            web_paths: Web paths to load from.
            requests_per_second: Max number of concurrent requests to make.
            default_parser: Default parser to use for BeautifulSoup.
            requests_kwargs: kwargs for requests
            raise_for_status: Raise an exception if http status code denotes an error.
            bs_get_text_kwargs: kwargs for beatifulsoup4 get_text
            bs_kwargs: kwargs for beatifulsoup4 web page parsing
            show_progress: Show progress bar when loading pages.
            trust_env: set to True if using proxy to make web requests, for example
                using http(s)_proxy environment variables. Defaults to False.
        zmReceived web_path and web_paths. Only one can be specified. web_path is deprecated, web_paths should be used.z+web_path must be str or Sequence[str] got (z*) or web_paths must be Sequence[str] got ()r   r   )	UserAgentzxfake_useragent not found, using default user agent.To get a realistic header for requests, `pip install fake_useragent`.N)$
ValueErrorlistr3   
isinstancestrr   	TypeErrortyper4   r5   r6   r7   r*   r8   r9   r:   requestsSessiondefault_header_templatecopyr!   Zfake_useragentr<   randomImportErrorloggerinfodictheadersverifyr/   updater0   r1   r2   r+   )selfr,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r6   r7   r8   r9   r:   r*   r+   r<   r#   r#   r$   __init__   sV    $







zWebBaseLoader.__init__)r   c                 C   s    t | jdkrtd| jd S )N   zMultiple webpaths found.r   )lenr3   r=   )rO   r#   r#   r$   r,      s    zWebBaseLoader.web_path         ?)r   retriescooldownbackoffr   c           
         s  t j| jd4 I d H R}t|D ].}zt| jj| jj d}| jj	sRd|d< |j
|fi | j|B 4 I d H L}| jr|  | I d H W  d   I d H  W   W  d   I d H  S 1 I d H s0    Y  W q" t jyP }	 z^||d k r n@td| d|d  d| d	|	 d
	 t|||  I d H  W Y d }	~	q"d }	~	0 0 q"W d   I d H  q1 I d H sx0    Y  tdd S )N)r+   )rL   cookiesFsslrQ   Error fetching z with attempt /z: z. Retrying...zretry count exceeded)aiohttpZClientSessionr+   rangerK   r:   rL   rX   get_dictrM   r!   r6   r7   textZClientConnectionErrorrI   warningasynciosleepr=   )
rO   r   rU   rV   rW   r:   ikwargsresponseer#   r#   r$   _fetch   s@    
V^zWebBaseLoader._fetch)r   	semaphorer   c                    s   |4 I d H  z&|  |I d H W W  d   I d H  S  ty } zZ| jr|td| d W Y d }~W d   I d H  dS td| d |W Y d }~n
d }~0 0 W d   I d H  q1 I d H s0    Y  d S )NrZ   z*, skipping due to continue_on_failure=Truer'   za and aborting, use continue_on_failure=True to continue loading urls after encountering an error.)rg   	Exceptionr0   rI   r`   	exception)rO   r   rh   rf   r#   r#   r$   _fetch_with_rate_limit  s    &
 
z$WebBaseLoader._fetch_with_rate_limit)urlsr   c                    s   t | j}g }|D ] }t | ||}|| qzD| jrfddlm} |j	|ddddI dH W S t j	| I dH W S W n, t
y   td t j	| I dH  Y S 0 dS )	z/Fetch all urls concurrently with rate limiting.r   )tqdm_asynciozFetching pagesTrQ   )descasciiZminintervalNz2For better logging of progress, `pip install tqdm`)ra   	Semaphorer4   ensure_futurerk   appendr*   Ztqdm.asynciorm   gatherrH   warningswarn)rO   rl   rh   tasksr   taskrm   r#   r#   r$   	fetch_all  s    
zWebBaseLoader.fetch_all)parserr   c                 C   s*   g d}| |vr&t dd| d dS )z#Check that parser is valid for bs4.)r)   Zlxmlxmlzlxml-xmlZhtml5libz`parser` must be one of z, .N)r=   join)ry   Zvalid_parsersr#   r#   r$   _check_parser.  s
    zWebBaseLoader._check_parser)resultsrl   ry   r   c           	      C   sp   ddl m} g }t|D ]R\}}|| }|du rP|dr@d}n| j}| | ||||fi | j q|S )z0Unpack fetch results into BeautifulSoup objects.r   BeautifulSoupN.xmlrz   )bs4r   	enumerateendswithr5   r}   rr   r9   )	rO   r~   rl   ry   r   Zfinal_resultsrc   resultr   r#   r#   r$   _unpack_fetch_results7  s    

z#WebBaseLoader._unpack_fetch_results)rl   ry   r   c                 C   s    t | |}| j|||dS )z2Fetch all urls, then return soups for all results.ry   )ra   runrx   r   rO   rl   ry   r~   r#   r#   r$   
scrape_allI  s    zWebBaseLoader.scrape_allc                    s    |  |I dH }| j|||dS )z8Async fetch all urls, then return soups for all results.Nr   )rx   r   r   r#   r#   r$   ascrape_allN  s    zWebBaseLoader.ascrape_all)r   ry   r9   r   c                 C   s   ddl m} |d u r*|dr$d}n| j}| | | jj|fi | j}| jrX|  | j	d url| j	|_	n| j
rz|j|_	||j|fi |pi S )Nr   r   r   rz   )r   r   r   r5   r}   r:   r!   r6   r7   r2   r1   apparent_encodingr_   )rO   r   ry   r9   r   Zhtml_docr#   r#   r$   _scrapeU  s    



zWebBaseLoader._scrapec                 C   s"   |du r| j }| j| j|| jdS )z?Scrape data from webpage and return it in BeautifulSoup format.N)ry   r9   )r5   r   r,   r9   )rO   ry   r#   r#   r$   scrapeo  s    zWebBaseLoader.scrapec                 c   sJ   | j D ]>}| j|| jd}|jf i | j}t||}t||dV  qdS )z+Lazy load text from the url(s) in web_path.)r9   Zpage_contentr"   N)r3   r   r9   r    r8   r%   r   )rO   pathr   r_   r"   r#   r#   r$   	lazy_loadw  s
    

zWebBaseLoader.lazy_loadc                 C  sV   |  | jI dH }t| j|D ]2\}}|jf i | j}t||}t||dV  qdS )z1Async lazy load text from the url(s) in web_path.Nr   )r   r3   zipr    r8   r%   r   )rO   r~   r   r   r_   r"   r#   r#   r$   
alazy_load  s
    
zWebBaseLoader.alazy_loadz0.3.14z1.0zSee API reference for updated usage: https://python.langchain.com/api_reference/community/document_loaders/langchain_community.document_loaders.web_base.WebBaseLoader.html)ZsinceZremovalmessagec                 C   sX   |  | j}g }t| j|D ]6\}}|jf i | j}t||}|t||d q|S )z9Load text from the urls in web_path async into Documents.r   )r   r3   r   r    r8   r%   rr   r   )rO   r~   docsr   r   r_   r"   r#   r#   r$   aload  s    
zWebBaseLoader.aload)r'   NTNFTNr#   r(   r)   NFNNN)rS   r(   rT   )N)N)N)NN)N)%__name__
__module____qualname____doc__r	   r@   r   r   rK   boolintr   r   rP   propertyr,   floatrg   ra   rp   rk   r   rx   staticmethodr}   r   r   r   r   r   r   r   r   r   r   r
   r   r#   r#   r#   r$   r&   *   s   e               U 	 $ 
  
r&   )r   ra   loggingrt   typingr   r   r   r   r   r   r   r	   r\   rC   Zlangchain_core._apir
   Zlangchain_core.documentsr   Z)langchain_community.document_loaders.baser   Z$langchain_community.utils.user_agentr   	getLoggerr   rI   rE   r@   rK   r%   r&   r#   r#   r#   r$   <module>   s*   (
