a
    bgL!                     @   s   d Z ddlZddlmZmZ ddlmZmZmZm	Z	m
Z
mZ ddlmZ ddlmZ erddlmZ ddlmZ dd	lmZ dd
lmZmZmZ eeZG dd deZG dd deZG dd deZdS )zQLoader that uses Playwright to load a page, then uses unstructured to parse html.    N)ABCabstractmethod)TYPE_CHECKINGAsyncIteratorDictIteratorListOptional)Document)
BaseLoader)Browser)Page)Response)r   r   r   c                   @   s@   e Zd ZdZedddedddZedd	d
edddZdS )PlaywrightEvaluatorzAbstract base class for all evaluators.

    Each evaluator should take a page, a browser instance, and a response
    object, process the page as necessary, and return the resulting text.
    r   r   r   pagebrowserresponsereturnc                 C   s   dS )a  Synchronously process the page and return the resulting text.

        Args:
            page: The page to process.
            browser: The browser instance.
            response: The response from page.goto().

        Returns:
            text: The text content of the page.
        N selfr   r   r   r   r   /var/www/html/cobodadashboardai.evdpl.com/venv/lib/python3.9/site-packages/langchain_community/document_loaders/url_playwright.pyevaluate   s    zPlaywrightEvaluator.evaluate	AsyncPageAsyncBrowserAsyncResponsec                    s   dS )a  Asynchronously process the page and return the resulting text.

        Args:
            page: The page to process.
            browser: The browser instance.
            response: The response from page.goto().

        Returns:
            text: The text content of the page.
        Nr   r   r   r   r   evaluate_async*   s    z"PlaywrightEvaluator.evaluate_asyncN)__name__
__module____qualname____doc__r   strr   r   r   r   r   r   r      s   r   c                   @   sP   e Zd ZdZdeee  dddZddded	d
dZddded	ddZ	dS )UnstructuredHtmlEvaluatorz@Evaluate the page HTML content using the `unstructured` library.N)remove_selectorsc                 C   s2   zddl }W n ty&   tdY n0 || _dS )z%Initialize UnstructuredHtmlEvaluator.r   NzQunstructured package not found, please install it with `pip install unstructured`)unstructuredImportErrorr$   )r   r$   r%   r   r   r   __init__>   s    
z"UnstructuredHtmlEvaluator.__init__r   r   r   r   c           	      C   sl   ddl m} | jpg D ].}|| }|D ]}| r,|d q,q| }||d}ddd |D S )z3Synchronously process the HTML content of the page.r   partition_htmlelement => element.remove()text

c                 S   s   g | ]}t |qS r   r"   .0elr   r   r   
<listcomp>V       z6UnstructuredHtmlEvaluator.evaluate.<locals>.<listcomp>	Zunstructured.partition.htmlr)   r$   locatorallZ
is_visibler   contentjoin	r   r   r   r   r)   selectorelementselementZpage_sourcer   r   r   r   J   s    
z"UnstructuredHtmlEvaluator.evaluater   r   r   c           	         s   ddl m} | jpg D ]@}|| I dH }|D ]"}| I dH r2|dI dH  q2q| I dH }||d}ddd |D S )	z4Asynchronously process the HTML content of the page.r   r(   Nr*   r+   r-   c                 S   s   g | ]}t |qS r   r.   r/   r   r   r   r2   f   r3   z<UnstructuredHtmlEvaluator.evaluate_async.<locals>.<listcomp>r4   r9   r   r   r   r   X   s    
z(UnstructuredHtmlEvaluator.evaluate_async)N)
r   r   r    r!   r	   r   r"   r'   r   r   r   r   r   r   r#   ;   s   r#   c                
   @   s|   e Zd ZdZdee eeeee  ee ee	eef  dddZ
ee ddd	Zee dd
dZee dddZdS )PlaywrightURLLoadera  Load `HTML` pages with `Playwright` and parse with `Unstructured`.

    This is useful for loading pages that require javascript to render.

    Attributes:
        urls (List[str]): List of URLs to load.
        continue_on_failure (bool): If True, continue loading other URLs on failure.
        headless (bool): If True, the browser will run in headless mode.
        proxy (Optional[Dict[str, str]]): If set, the browser will access URLs
            through the specified proxy.

    Example:
        .. code-block:: python

            from langchain_community.document_loaders import PlaywrightURLLoader

            urls = ["https://api.ipify.org/?format=json",]
            proxy={
                "server": "https://xx.xx.xx:15818", # https://<host>:<port>
                "username": "username",
                "password": "password"
            }
            loader = PlaywrightURLLoader(urls, proxy=proxy)
            data = loader.load()
    TN)urlscontinue_on_failureheadlessr$   	evaluatorproxyc                 C   sb   zddl }W n ty&   tdY n0 || _|| _|| _|| _|rP|rPtd|pZt|| _dS )z%Load a list of URLs using Playwright.r   NzMplaywright package not found, please install it with `pip install playwright`z:`remove_selectors` and `evaluator` cannot be both not None)	
playwrightr&   r>   r?   r@   rB   
ValueErrorr#   rA   )r   r>   r?   r@   r$   rA   rB   rC   r   r   r   r'      s    

zPlaywrightURLLoader.__init__)r   c           
      c   s   ddl m} | }|jj| j| jd}| jD ]}zR| }||}|du r\t	d| | j
|||}d|i}t||dV  W q. ty }	 z.| jrtd| d	|	  n|	W Y d}	~	q.d}	~	0 0 q.|  W d   n1 s0    Y  dS )
zLoad the specified URLs using Playwright and create Document instances.

        Returns:
            A list of Document instances with loaded content.
        r   )sync_playwrightr@   rB   N"page.goto() returned None for url sourceZpage_contentmetadataError fetching or processing , exception: )playwright.sync_apirE   chromiumlaunchr@   rB   r>   new_pagegotorD   rA   r   r
   	Exceptionr?   loggererrorclose)
r   rE   pr   urlr   r   r,   rJ   er   r   r   	lazy_load   s&    

zPlaywrightURLLoader.lazy_loadc                    s   dd |   2 I dH S )Load the specified URLs with Playwright and create Documents asynchronously.
        Use this function when in a jupyter notebook environment.

        Returns:
            A list of Document instances with loaded content.
        c                    s   g | z3 d H W }|q6 S )Nr   )r0   docr   r   r   r2      r3   z-PlaywrightURLLoader.aload.<locals>.<listcomp>N)
alazy_load)r   r   r   r   aload   s    zPlaywrightURLLoader.aloadc           
      C  s.  ddl m} | 4 I dH }|jj| j| jdI dH }| jD ]}zd| I dH }||I dH }|du rvt	d| | j
|||I dH }d|i}t||dV  W q< ty }	 z.| jrtd| d	|	  n|	W Y d}	~	q<d}	~	0 0 q<| I dH  W d  I dH  q*1 I dH s 0    Y  dS )
rZ   r   )async_playwrightNrF   rG   rH   rI   rK   rL   )playwright.async_apir^   rN   rO   r@   rB   r>   rP   rQ   rD   rA   r   r
   rR   r?   rS   rT   rU   )
r   r^   rV   r   rW   r   r   r,   rJ   rX   r   r   r   r\      s&    
zPlaywrightURLLoader.alazy_load)TTNNN)r   r   r    r!   r   r"   boolr	   r   r   r'   r   r
   rY   r]   r   r\   r   r   r   r   r=   i   s"        
	r=   )r!   loggingabcr   r   typingr   r   r   r   r   r	   Zlangchain_core.documentsr
   Z)langchain_community.document_loaders.baser   r_   r   r   r   r   r   r   rM   	getLoggerr   rS   r   r#   r=   r   r   r   r   <module>   s    
&.