a
    bgP0                     @   sT   d dl Z d dlmZmZmZ d dlmZ d dlmZ d dl	m
Z
 G dd deZdS )    N)IteratorLiteralOptional)
BaseLoader)Document)get_from_envc                   @   st   e Zd ZdZeedddZeedddZddddd	eee ee e	d
 ee dddZ
ee dddZdS )FireCrawlLoadera!
  
    FireCrawlLoader document loader integration

    Setup:
        Install ``firecrawl-py``,``langchain_community`` and set environment variable ``FIRECRAWL_API_KEY``.

        .. code-block:: bash

            pip install -U firecrawl-py langchain_community
            export FIRECRAWL_API_KEY="your-api-key"

    Instantiate:
        .. code-block:: python

            from langchain_community.document_loaders import FireCrawlLoader

            loader = FireCrawlLoader(
                url = "https://firecrawl.dev",
                mode = "crawl"
                # other params = ...
            )

    Lazy load:
        .. code-block:: python

            docs = []
            docs_lazy = loader.lazy_load()

            # async variant:
            # docs_lazy = await loader.alazy_load()

            for doc in docs_lazy:
                docs.append(doc)
            print(docs[0].page_content[:100])
            print(docs[0].metadata)

        .. code-block:: python

            Introducing [Smart Crawl!](https://www.firecrawl.dev/smart-crawl)
             Join the waitlist to turn any web
            {'ogUrl': 'https://www.firecrawl.dev/', 'title': 'Home - Firecrawl', 'robots': 'follow, index', 'ogImage': 'https://www.firecrawl.dev/og.png?123', 'ogTitle': 'Firecrawl', 'sitemap': {'lastmod': '2024-08-12T00:28:16.681Z', 'changefreq': 'weekly'}, 'keywords': 'Firecrawl,Markdown,Data,Mendable,Langchain', 'sourceURL': 'https://www.firecrawl.dev/', 'ogSiteName': 'Firecrawl', 'description': 'Firecrawl crawls and converts any website into clean markdown.', 'ogDescription': 'Turn any website into LLM-ready data.', 'pageStatusCode': 200, 'ogLocaleAlternate': []}

    Async load:
        .. code-block:: python

            docs = await loader.aload()
            print(docs[0].page_content[:100])
            print(docs[0].metadata)

        .. code-block:: python

            Introducing [Smart Crawl!](https://www.firecrawl.dev/smart-crawl)
             Join the waitlist to turn any web
            {'ogUrl': 'https://www.firecrawl.dev/', 'title': 'Home - Firecrawl', 'robots': 'follow, index', 'ogImage': 'https://www.firecrawl.dev/og.png?123', 'ogTitle': 'Firecrawl', 'sitemap': {'lastmod': '2024-08-12T00:28:16.681Z', 'changefreq': 'weekly'}, 'keywords': 'Firecrawl,Markdown,Data,Mendable,Langchain', 'sourceURL': 'https://www.firecrawl.dev/', 'ogSiteName': 'Firecrawl', 'description': 'Firecrawl crawls and converts any website into clean markdown.', 'ogDescription': 'Turn any website into LLM-ready data.', 'pageStatusCode': 200, 'ogLocaleAlternate': []}

    )paramsreturnc                 C   s  d}g d}|D ]}| |rd} q(q|rtdt d|v r^|d du rX|d |d< |d= d|v r|d du r~|d |d< |d= d	|v r|d	 du r|d	 |d
< |d	= d|v r|d du r|d |d< |d= d|v rt|d tr| |d |d< |d= |S )NF)includesexcludesallowBackwardCrawlingallowExternalContentLinkspageOptionsTBDeprecated parameters detected. See Firecrawl v1 docs for updates.r   ZincludePathsr   ZexcludePathsr   ZallowBackwardLinksr   ZallowExternalLinksr   ZscrapeOptions)getwarningswarnDeprecationWarning
isinstancedictlegacy_scrape_options_adapter)selfr	   use_legacy_optionsZlegacy_keyskey r   |/var/www/html/cobodadashboardai.evdpl.com/venv/lib/python3.9/site-packages/langchain_community/document_loaders/firecrawl.pylegacy_crawler_options_adapterC   sD    
z.FireCrawlLoader.legacy_crawler_options_adapterc                 C   s  d}dg}d|v rd|d v r|d d dksN|d d dksN|d d dkrd}d	|d v r|d d	 r||d d	 |d
< n|d  d	d|d
< d|d v r|d d r|d d |d< d|d v r|d d r|d d |d
< |d= g d}|D ]}| |rd} qq|rtdt d|v rF|d du r@|d |d= d|v rn|d du rh|d |d= d|v r|d du r|d |d= d|v r|d du r|d |d= d|v r|d du r|d |d= d|v r|d du r|d |d= d|v r6|d du r0|d |d= d|v r`|d du rZ|d |d< |d= d|v r|d du r|d |d < |d= d!|vr||d!< |S )"NFmarkdownZextractorOptionsmodezllm-extractionzllm-extraction-from-raw-htmlzllm-extraction-from-markdownTZextractionPromptpromptz-Extract page information based on the schema.ZextractionSchemaZschemaZ
userPrompt)	includeMarkdownincludeHtmlincludeRawHtmlincludeExtractincludeLinks
screenshotfullPageScreenshotonlyIncludeTags
removeTagsr   r!   r"   htmlr#   rawHtmlr$   extractr%   linksr&   r'   zscreenshot@fullPager(   ZincludeTagsr)   ZexcludeTagsformats)r   r   r   r   removeappend)r   r	   r   r.   Zscrape_keysr   r   r   r   r   s   s    




















z-FireCrawlLoader.legacy_scrape_options_adapterNcrawl)api_keyapi_urlr   r	   )r1   scrapemap)urlr2   r3   r   r	   c                C   s   zddl m} W n ty*   tdY n0 |dvrDtd| d|sPtd|p\tdd	}|||d
| _ || _|| _|p~i | _dS )aR  Initialize with API key and url.

        Args:
            url: The url to be crawled.
            api_key: The Firecrawl API key. If not specified will be read from env var
                FIRECRAWL_API_KEY. Get an API key
            api_url: The Firecrawl API URL. If not specified will be read from env var
                FIRECRAWL_API_URL or defaults to https://api.firecrawl.dev.
            mode: The mode to run the loader in. Default is "crawl".
                 Options include "scrape" (single url),
                 "crawl" (all accessible sub pages),
                 "map" (returns list of links that are semantically related).
            params: The parameters to pass to the Firecrawl API.
                Examples include crawlerOptions.
                For more details, visit: https://github.com/mendableai/firecrawl-py
        r   )FirecrawlAppzD`firecrawl` package not found, please run `pip install firecrawl-py`)r1   r4   searchr5   Invalid mode 'z/'. Allowed: 'crawl', 'scrape', 'search', 'map'.zUrl must be providedr2   ZFIRECRAWL_API_KEY)r2   r3   N)	firecrawlr7   ImportError
ValueErrorr   r6   r   r	   )r   r6   r2   r3   r   r	   r7   r   r   r   __init__   s"    

zFireCrawlLoader.__init__)r
   c                 c   s   | j dkr(| jj| j| | jdg}n| j dkrh| js@td| jj| j| | jd}|	dg }nT| j dkr| jstd| jj
| j| jd}n&| j dkrtd	ntd
| j  d|D ]Z}| j dkr|}i }n,|	dp|	dp|	dd}|	di }|sqt||dV  qd S )Nr4   )r	   r1   zURL is required for crawl modedatar5   zURL is required for map moder8   z?Search mode is not supported in this version, please downgrade.r9   z%'. Allowed: 'crawl', 'scrape', 'map'.r   r*   r+    metadata)page_contentr@   )r   r:   Z
scrape_urlr6   r   r	   r<   Z	crawl_urlr   r   Zmap_urlr   )r   Zfirecrawl_docsZcrawl_responsedocrA   r@   r   r   r   	lazy_load  sH    




zFireCrawlLoader.lazy_load)__name__
__module____qualname____doc__r   r   r   strr   r   r=   r   r   rC   r   r   r   r   r   	   s   90p.r   )r   typingr   r   r   Zlangchain_core.document_loadersr   Zlangchain_core.documentsr   Zlangchain_core.utilsr   r   r   r   r   r   <module>   s
   