a
    bg4#                     @   s   d dl Z d dlZd dlZd dlmZmZ d dlmZmZm	Z	m
Z
mZmZmZmZmZ d dlZd dlZd dlmZ d dlmZ d dlmZ eeZe ddd	d
dd
dZeeedddZG dd deZdS )    N)FutureThreadPoolExecutor)	AnyAsyncIteratorDictIteratorListOptionalTupleUnioncast)Document)
BaseLoader)get_user_agentzJtext/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8zen-US,en;q=0.5zhttps://www.google.com/1z
keep-alive)
User-AgentAcceptzAccept-LanguageZRefererZDNT
ConnectionzUpgrade-Insecure-Requests)soupurlreturnc                 C   sj   d|i}|  d }r"| |d< | j dddid }rH|dd|d< |  d	 }rf|d
d|d< |S )z)Build metadata from BeautifulSoup output.sourcetitlemetanamedescription)attrscontentzNo description found.htmllangzNo language found.language)findZget_textget)r   r   metadatar   r   r    r$   }/var/www/html/cobodadashboardai.evdpl.com/venv/lib/python3.9/site-packages/langchain_community/document_loaders/async_html.py_build_metadata&   s    r&   c                   @   s0  e Zd ZdZd'dddeeee f ee ee	 ee e	ee ee
eeeef  e	e	e	e	dd	d
ZeedddZeeddddZd(ee
e
eedddZeejeeef dddZee e	eeeef  dddZee ee dddZeeedd d!Zee d"d#d$Zee d"d%d&ZdS ))AsyncHtmlLoaderzLoad `HTML` asynchronously.NThtml.parser   F)preserve_order	trust_env)web_pathheader_template
verify_sslproxiesautoset_encodingencodingdefault_parserrequests_per_secondrequests_kwargsraise_for_statusignore_load_errorsr*   r+   c                C   s   t |tr|g| _nt |tr$|| _|p*t}|dspzddlm} | j|d< W n t	yn   t
d Y n0 t | _t|| j_|| j_|r| jj| || _|| _|	pi | _|
| _|| _|| _|| _|| _|| _dS )zInitialize with a webpage path.r   r   )	UserAgentzxfake_useragent not found, using default user agent.To get a realistic header for requests, `pip install fake_useragent`.N)
isinstancestr	web_pathsr   default_header_templater"   Zfake_useragentr7   randomImportErrorloggerinforequestsSessionsessiondictheadersverifyr/   updater3   r2   r4   r5   r0   r1   r6   r*   r+   )selfr,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r6   r*   r+   rD   r7   r$   r$   r%   __init__5   s6    






zAsyncHtmlLoader.__init__)r   r   c              
   C   sj   | j rTz| jj|fi | jW S  tyR } ztt| W Y d }~d S d }~0 0 | jj|fi | jS N)r6   rB   r"   r4   	Exceptionwarningswarnr9   )rG   r   er$   r$   r%   _fetch_valid_connection_docso   s    z,AsyncHtmlLoader._fetch_valid_connection_docs)parserr   c                 C   s*   g d}| |vr&t dd| d dS )z#Check that parser is valid for bs4.)r(   Zlxmlxmlzlxml-xmlZhtml5libz`parser` must be one of z, .N)
ValueErrorjoin)rO   Zvalid_parsersr$   r$   r%   _check_parsery   s
    zAsyncHtmlLoader._check_parser         ?)r   retriescooldownbackoffr   c                    s
  t j| jd4 I d H }t|D ]}ztf | jj| jj d| j	}| jj
s\d|d< |j|fi |4 I d H n}z| I d H }	W n& ty   td|  d}	Y n0 |	W  d   I d H  W   W  d   I d H  S 1 I d H  s0    Y  W q" t jtfy }
 z||d krf| jrftd| d	| d
 W Y d }
~
 W d   I d H  dS ||d krx n@td| d|d  d| d|
 d	 t|||  I d H  W Y d }
~
q"d }
~
0 0 q"W d   I d H  q1 I d H s0    Y  tdd S )N)r+   )rD   cookiesFsslzFailed to decode content from     zError fetching z after z	 retries.z with attempt /z: z. Retrying...zretry count exceeded)aiohttpZClientSessionr+   rangerC   rB   rD   rZ   get_dictr4   rE   r"   textUnicodeDecodeErrorr>   errorZClientConnectionErrorTimeoutErrorr6   warningasynciosleeprR   )rG   r   rW   rX   rY   rB   ikwargsresponserb   rM   r$   r$   r%   _fetch   sP    

N"^zAsyncHtmlLoader._fetch)r   	semaphorer   c              	      sP   |4 I d H * ||  |I d H fW  d   I d H  S 1 I d H sB0    Y  d S rI   )rl   )rG   r   rm   r$   r$   r%   _fetch_with_rate_limit   s    z&AsyncHtmlLoader._fetch_with_rate_limit)urlsr*   r   c                   s   t  j fdd|D }z\ddlm} |rV||ddddD ]}|I d H V  qBn$|j|ddddD ]}|I d H V  qhW nZ ty   td	 |rt j	| I d H D ]
}|V  qnt |D ]}|I d H V  qY n0 d S )
Nc                    s   g | ]}t  |qS r$   )rg   create_taskrn   ).0r   rG   rm   r$   r%   
<listcomp>   s   z3AsyncHtmlLoader._lazy_fetch_all.<locals>.<listcomp>r   )tqdm_asynciozFetching pagesTr]   )descasciiZminintervalz2For better logging of progress, `pip install tqdm`)
rg   	Semaphorer3   Ztqdm.asynciort   as_completedr=   rK   rL   gather)rG   ro   r*   tasksrt   taskresultr$   rr   r%   _lazy_fetch_all   s,    



zAsyncHtmlLoader._lazy_fetch_all)ro   r   c                    s   dd |  |d2 I dH S )z/Fetch all urls concurrently with rate limiting.c                    s   g | z3 d H W \}}|q6 S rI   r$   )rq   _docr$   r$   r%   rs          z-AsyncHtmlLoader.fetch_all.<locals>.<listcomp>TN)r}   )rG   ro   r$   r$   r%   	fetch_all   s    zAsyncHtmlLoader.fetch_all)r   rb   r   c                 C   sL   ddl m} |drd}n| j}| | |||}t||}t||dS )Nr   )BeautifulSoupz.xmlrP   )Zpage_contentr#   )Zbs4r   endswithr2   rT   r&   r   )rG   r   rb   r   rO   r   r#   r$   r$   r%   _to_document   s    



zAsyncHtmlLoader._to_document)r   c                 c   s   zTt   tdd.}|t j| | j}| }W d   n1 sH0    Y  W n$ tyx   t | | j}Y n0 t	t
tt |D ]\}}| | j| |V  qdS )+Lazy load text from the url(s) in web_path.r]   )max_workersN)rg   get_running_loopr   submitrunr   r:   r|   RuntimeError	enumerater   r   r9   r   )rG   executorfutureresultsri   rb   r$   r$   r%   	lazy_load   s    
*zAsyncHtmlLoader.lazy_loadc                 C  s6   |  | j| j2 z3 dH W \}}| ||V  q6 dS )r   N)r}   r:   r*   r   )rG   r   rb   r$   r$   r%   
alazy_load   s    zAsyncHtmlLoader.alazy_load)
NTNTNr(   r)   NFF)rU   r)   rV   )__name__
__module____qualname____doc__r   r9   r   r	   rC   boolintr   r   rH   rN   staticmethodrT   floatrl   rg   rw   r
   rn   r   r}   r   r   r   r   r   r   r$   r$   r$   r%   r'   2   s^             :
	 &
r'   ) rg   loggingrK   concurrent.futuresr   r   typingr   r   r   r   r   r	   r
   r   r   r_   r@   Zlangchain_core.documentsr   Z)langchain_community.document_loaders.baser   Z$langchain_community.utils.user_agentr   	getLoggerr   r>   r;   r9   rC   r&   r'   r$   r$   r$   r%   <module>   s(   ,
