a
    bg                     @   sX   d dl mZ d dlmZmZ d dlmZ d dlmZ d dl	m
Z
 dZG dd de
Zd	S )
    )Path)IteratorUnion)urlparse)Document)
BaseLoaderzShttps://readers.llmsherpa.com/api/document/developer/parseDocument?renderFormat=allc                   @   sx   e Zd ZdZdddefeeef eeeedddZ	e
eeddd	Ze
deeeed
ddZee dddZdS )LLMSherpaFileLoaderaD  Load Documents using `LLMSherpa`.

    LLMSherpaFileLoader use LayoutPDFReader, which is part of the LLMSherpa library.
    This tool is designed to parse PDFs while preserving their layout information,
    which is often lost when using most PDF to text parsers.

    Examples
    --------
    from langchain_community.document_loaders.llmsherpa import LLMSherpaFileLoader

    loader = LLMSherpaFileLoader(
        "example.pdf",
        strategy="chunks",
        llmsherpa_api_url="http://localhost:5010/api/parseDocument?renderFormat=all",
    )
    docs = loader.load()
    Tchunks)	file_pathnew_indent_parser	apply_ocrstrategyllmsherpa_api_urlc                 C   s   zddl }W n ty&   tdY n0 g d}||vrNtd| d| d| |sftd| | j|||d	| _|| _t|| _dS )
zInitialize with a file path.r   NzKllmsherpa package not found, please install it with `pip install llmsherpa`)sectionsr	   htmltextzGot z' for `strategy`, but should be one of ``zInvalid URL: )urlr   r   )		llmsherpaImportError
ValueError_is_valid_url_validate_llmsherpa_urlr   r   strr
   )selfr
   r   r   r   r   r   Z_valid_strategies r   |/var/www/html/cobodadashboardai.evdpl.com/venv/lib/python3.9/site-packages/langchain_community/document_loaders/llmsherpa.py__init__   s,    	

zLLMSherpaFileLoader.__init__)r   returnc                 C   s   t | }t|jot|jS )zCheck if the url is valid.)r   boolnetlocscheme)r   parsedr   r   r   r   A   s    z!LLMSherpaFileLoader._is_valid_url)r   r   r   r   c                 C   sp   t | }| }d|jvr.d|jvr.td|  d|jvr@|d }|rVd|jvrV|d }|rld|jvrl|d	 }|S )
z$Check if the llmsherpa url is valid.z/api/parseDocumentz%/api/document/developer/parseDocumentzInvalid LLMSherpa URL: zrenderFormat=allz?renderFormat=allzuseNewIndentParser=truez&useNewIndentParser=truezapplyOcr=yesz&applyOcr=yes)r   pathr   query)r   r   r   r"   Z	valid_urlr   r   r   r   G   s    

z+LLMSherpaFileLoader._validate_llmsherpa_url)r   c                 #   s   ddl m} | j}| j} jdkrL fddt| D E dH   jdkrv fddt| D E dH   jd	krt	|
 d
 jidgE dH   jdkrt	| d
 jidgE dH  dS )z
Load file.r   )LayoutPDFReaderr   c                    s2   g | ]*\}}t |jd d d j||jddqS )T)Zinclude_childrenrecurse)sourceZsection_numberZsection_titleZpage_contentmetadata)r   to_textr
   title).0Zsection_numsectionr   r   r   
<listcomp>f   s   	z1LLMSherpaFileLoader.lazy_load.<locals>.<listcomp>Nr	   c                    s,   g | ]$\}}t |  j||jd dqS ))r'   Zchunk_numberZ
chunk_typer(   )r   Zto_context_textr
   tag)r,   Z	chunk_numchunkr.   r   r   r/   r   s   	r   r'   r(   r   )Zllmsherpa.readersr%   r   Zread_pdfr
   r   	enumerater   r	   r   Zto_htmlr*   )r   r%   Zdocs_readerdocr   r.   r   	lazy_load\   s2    


	


	



zLLMSherpaFileLoader.lazy_loadN)TT)__name__
__module____qualname____doc__DEFAULT_APIr   r   r   r   r   staticmethodr   r   r   r   r4   r   r   r   r   r      s,   
" r   N)pathlibr   typingr   r   urllib.parser   Zlangchain_core.documentsr   Z(langchain_community.document_loaders.pdfr   r9   r   r   r   r   r   <module>   s   