a
    bgQ                     @   s   d dl Z d dlZd dlZd dlmZmZ d dlmZmZm	Z	m
Z
mZ d dlmZ d dlmZ G dd deeZG dd	 d	eZG d
d deZdS )    N)ABCabstractmethod)DictIteratorOptionalTupleUnion)Document)
BaseLoaderc                   @   s  e Zd ZdZdddddddd	d	dddd
d
deeeeeef eeeeeeeeef eeef eeef ee ee d
dddZ	e
e dddZeedddZeedddZeee
e dddZd eeee e
e dddZeeeef dddZd
S )!DedocBaseLoadera  
    Base Loader that uses `dedoc` (https://dedoc.readthedocs.io).

    Loader enables extracting text, tables and attached files from the given file:
        * `Text` can be split by pages, `dedoc` tree nodes, textual lines
            (according to the `split` parameter).
        * `Attached files` (when with_attachments=True)
            are split according to the `split` parameter.
            For attachments, langchain Document object has an additional metadata field
            `type`="attachment".
        * `Tables` (when with_tables=True) are not split - each table corresponds to one
            langchain Document object.
            For tables, Document object has additional metadata fields `type`="table"
            and `text_as_html` with table HTML representation.
    documentTF
   
auto_tabbyrus+eng:autoN)splitwith_tableswith_attachmentsrecursion_deep_attachmentspdf_with_text_layerlanguagepagesis_one_column_documentdocument_orientationneed_header_footer_analysisneed_binarizationneed_pdf_table_analysis	delimiterencoding)	file_pathr   r   r   r   r   r   r   r   r   r   r   r   r   r   returnc                C   s~   dd t   D | _h d| _|| jvrBtd| d| j d|| _|| _|| _| jdkrbdnd	}|| jd
< || jd< dS )a
  
        Initialize with file path and parsing parameters.

        Args:
            file_path: path to the file for processing
            split: type of document splitting into parts (each part is returned
                separately), default value "document"
                "document": document text is returned as a single langchain Document
                    object (don't split)
                "page": split document text into pages (works for PDF, DJVU, PPTX, PPT,
                    ODP)
                "node": split document text into tree nodes (title nodes, list item
                    nodes, raw text nodes)
                "line": split document text into lines
            with_tables: add tables to the result - each table is returned as a single
                langchain Document object

            Parameters used for document parsing via `dedoc`
                (https://dedoc.readthedocs.io/en/latest/parameters/parameters.html):

                with_attachments: enable attached files extraction
                recursion_deep_attachments: recursion level for attached files
                    extraction, works only when with_attachments==True
                pdf_with_text_layer: type of handler for parsing PDF documents,
                    available options
                    ["true", "false", "tabby", "auto", "auto_tabby" (default)]
                language: language of the document for PDF without a textual layer and
                    images, available options ["eng", "rus", "rus+eng" (default)],
                    the list of languages can be extended, please see
                    https://dedoc.readthedocs.io/en/latest/tutorials/add_new_language.html
                pages: page slice to define the reading range for parsing PDF documents
                is_one_column_document: detect number of columns for PDF without
                    a textual layer and images, available options
                    ["true", "false", "auto" (default)]
                document_orientation: fix document orientation (90, 180, 270 degrees)
                    for PDF without a textual layer and images, available options
                    ["auto" (default), "no_change"]
                need_header_footer_analysis: remove headers and footers from the output
                    result for parsing PDF and images
                need_binarization: clean pages background (binarize) for PDF without a
                    textual layer and images
                need_pdf_table_analysis: parse tables for PDF without a textual layer
                    and images
                delimiter: column separator for CSV, TSV files
                encoding: encoding of TXT, CSV, TSV
        c                 S   s   i | ]\}}|d vr||qS )>   selfr   r   r     ).0keyvaluer#   r#   x/var/www/html/cobodadashboardai.evdpl.com/venv/lib/python3.9/site-packages/langchain_community/document_loaders/dedoc.py
<dictcomp>d   s   z,DedocBaseLoader.__init__.<locals>.<dictcomp>>   pagenodeliner   Got $ for `split`, but should be one of ``r*   treeZlinearstructure_typeZneed_content_analysisN)localsitemsparsing_parametersvalid_split_values
ValueErrorr   r   r    )r"   r    r   r   r   r   r   r   r   r   r   r   r   r   r   r   r0   r#   r#   r'   __init__#   s     A


zDedocBaseLoader.__init__r!   c                 c   s   ddl }zddlm} W n ty2   tdY n0 ||  d}d|jd _| .}|j| j	i | j
d|id	}W d   n1 s0    Y  | j|  | jd
E dH  dS )Lazily load documents.r   N)DedocManagerzE`dedoc` package not found, please install it with `pip install dedoc`)Zmanager_configTloggerZattachments_dir)r    
parametersdocument_treer   )tempfileZdedocr9   ImportError_make_configconfigdisabledTemporaryDirectoryparser    r3   _split_documentZto_api_schemadictr   )r"   r>   r9   Zdedoc_managerZtmpdirr=   r#   r#   r'   	lazy_loadw   s"    

$zDedocBaseLoader.lazy_loadc                 C   s   dS )zu
        Make configuration for DedocManager according to the file extension and
        parsing parameters.
        Nr#   r"   r#   r#   r'   r@      s    zDedocBaseLoader._make_config)	paragraphr!   c                    s>   d  fdd|d D }|r2|d  d| n|d }|S )z1Get text (recursively) of the document tree node.
c                    s   g | ]}  |qS r#   )	_json2txt)r$   subparagraphrH   r#   r'   
<listcomp>   s   z-DedocBaseLoader._json2txt.<locals>.<listcomp>subparagraphstext)join)r"   rI   Zsubparagraphs_textrO   r#   rH   r'   rK      s    
zDedocBaseLoader._json2txt)r=   document_metadatar!   c                 c   sV   t |d dkr4|d D ]}| j||dE dH  qnt|d i ||d dV  dS )z4Parse recursively document tree obtained by `dedoc`.rN   r   r=   rQ   NrO   metadataZpage_contentrS   )len_parse_subparagraphsr	   )r"   r=   rQ   rL   r#   r#   r'   rV      s    z$DedocBaseLoader._parse_subparagraphs)r=   r   additional_metadatar!   c                 c   s  |d }|ri ||}|dkrF| j |d d d}t||dV  n*|dkr|d d d }|d	 d d
 }d}|D ]T}	|	d d
 |kr||  |	7 }qvt|i |d
|idV  |	d d
 }|  |	}qvt|i |d
|idV  n|dkr0|d d d D ]*}	|	d }
t|  |	i ||
dV  qn@|dkrX| j|d d |dE dH  ntd| d| j d| jr|d d D ]4}| |\}}t|i |d d|ddV  q|d D ]"}| j|| jddidE dH  qdS )z=Split document into parts according to the `split` parameter.rS   r   contentZ	structure)rI   rT   r)   rN   r   page_id r+   r*   rR   Nr,   r-   r.   Ztablestable)typeZtext_as_htmlattachmentsr\   
attachment)r=   r   rW   )	rK   r	   rV   r5   r4   r   
_get_tablerE   r   )r"   r=   r   rW   rQ   rO   nodesrY   Z	page_textr*   Zline_metadatar[   
table_text
table_htmlr^   r#   r#   r'   rE      st    




	zDedocBaseLoader._split_document)r[   r!   c              
   C   s   d}|d D ]:}|D ](}|d dd |d D 7 }|d7 }q|d7 }qd	}|d D ]|}|d
7 }|D ]b}d dd |d D }t|}|d7 }|d r|d7 }|d|d  d|d  d| d7 }qd|d7 }qT|d7 }||fS )z.Get text and HTML representation of the table.rZ   cells c                 s   s   | ]}|d  V  qdS rO   Nr#   r$   r+   r#   r#   r'   	<genexpr>      z-DedocBaseLoader._get_table.<locals>.<genexpr>lines	rJ   zK<table border="1" style="border-collapse: collapse; width: 100%;">
<tbody>
z<tr>
c                 s   s   | ]}|d  V  qdS re   r#   rf   r#   r#   r'   rg     rh   z<tdZ	invisiblez style="display: none" z
 colspan="Zcolspanz" rowspan="Zrowspanz">z</td>
z</tr>
z</tbody>
</table>)rP   htmlescape)r"   r[   ra   rowcellrb   Z	cell_textr#   r#   r'   r_      s4    



zDedocBaseLoader._get_table)N)__name__
__module____qualname____doc__strboolr   intr   r6   r   r	   rG   r   rF   r@   rK   rV   rE   r   r_   r#   r#   r#   r'   r      s`   



T Kr   c                   @   s   e Zd ZdZedddZdS )DedocFileLoaderaw  
    DedocFileLoader document loader integration to load files using `dedoc`.

    The file loader automatically detects the file type (with the correct extension).
    The list of supported file types is gives at
    https://dedoc.readthedocs.io/en/latest/index.html#id1.
    Please see the documentation of DedocBaseLoader to get more details.

    Setup:
        Install ``dedoc`` package.

        .. code-block:: bash

            pip install -U dedoc

    Instantiate:
        .. code-block:: python

            from langchain_community.document_loaders import DedocFileLoader

            loader = DedocFileLoader(
                file_path="example.pdf",
                # split=...,
                # with_tables=...,
                # pdf_with_text_layer=...,
                # pages=...,
                # ...
            )

    Load:
        .. code-block:: python

            docs = loader.load()
            print(docs[0].page_content[:100])
            print(docs[0].metadata)

        .. code-block:: python

            Some text
            {
                'file_name': 'example.pdf',
                'file_type': 'application/pdf',
                # ...
            }

    Lazy load:
        .. code-block:: python

            docs = []
            docs_lazy = loader.lazy_load()

            for doc in docs_lazy:
                docs.append(doc)
            print(docs[0].page_content[:100])
            print(docs[0].metadata)

        .. code-block:: python

            Some text
            {
                'file_name': 'example.pdf',
                'file_type': 'application/pdf',
                # ...
            }
    r7   c                 C   s    ddl m} || j| j| jdS )Nr   )make_manager_config)r    Zparsing_paramsr   )Zdedoc.utils.langchainrw   r    r3   r   )r"   rw   r#   r#   r'   r@   `  s    zDedocFileLoader._make_configN)ro   rp   rq   rr   rF   r@   r#   r#   r#   r'   rv     s   Brv   c                       s   e Zd ZdZdddddddd	d
d
ddddddeeeeeeef eeeeeeeeef eeef eeef ee ee dd fddZ	e
e dddZedddZeeeeeeeeef f dddZ  ZS )DedocAPIFileLoaderaU  
    Load files using `dedoc` API.
    The file loader automatically detects the file type (even with the wrong extension).
    By default, the loader makes a call to the locally hosted `dedoc` API.
    More information about `dedoc` API can be found in `dedoc` documentation:
        https://dedoc.readthedocs.io/en/latest/dedoc_api_usage/api.html

    Please see the documentation of DedocBaseLoader to get more details.

    Setup:
        You don't need to install `dedoc` library for using this loader.
        Instead, the `dedoc` API needs to be run.
        You may use Docker container for this purpose.
        Please see `dedoc` documentation for more details:
            https://dedoc.readthedocs.io/en/latest/getting_started/installation.html#install-and-run-dedoc-using-docker

        .. code-block:: bash

            docker pull dedocproject/dedoc
            docker run -p 1231:1231

    Instantiate:
        .. code-block:: python

            from langchain_community.document_loaders import DedocAPIFileLoader

            loader = DedocAPIFileLoader(
                file_path="example.pdf",
                # url=...,
                # split=...,
                # with_tables=...,
                # pdf_with_text_layer=...,
                # pages=...,
                # ...
            )

    Load:
        .. code-block:: python

            docs = loader.load()
            print(docs[0].page_content[:100])
            print(docs[0].metadata)

        .. code-block:: python

            Some text
            {
                'file_name': 'example.pdf',
                'file_type': 'application/pdf',
                # ...
            }

    Lazy load:
        .. code-block:: python

            docs = []
            docs_lazy = loader.lazy_load()

            for doc in docs_lazy:
                docs.append(doc)
            print(docs[0].page_content[:100])
            print(docs[0].metadata)

        .. code-block:: python

            Some text
            {
                'file_name': 'example.pdf',
                'file_type': 'application/pdf',
                # ...
            }
    zhttp://0.0.0.0:1231r   TFr   r   r   r   r   N)urlr   r   r   r   r   r   r   r   r   r   r   r   r   r   )r    ry   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r!   c                   s>   t  j||||||||	|
||||||d || _d| jd< dS )a
  Initialize with file path, API url and parsing parameters.

        Args:
            file_path: path to the file for processing
            url: URL to call `dedoc` API
            split: type of document splitting into parts (each part is returned
                separately), default value "document"
                "document": document is returned as a single langchain Document object
                    (don't split)
                "page": split document into pages (works for PDF, DJVU, PPTX, PPT, ODP)
                "node": split document into tree nodes (title nodes, list item nodes,
                    raw text nodes)
                "line": split document into lines
            with_tables: add tables to the result - each table is returned as a single
                langchain Document object

            Parameters used for document parsing via `dedoc`
                (https://dedoc.readthedocs.io/en/latest/parameters/parameters.html):

                with_attachments: enable attached files extraction
                recursion_deep_attachments: recursion level for attached files
                    extraction, works only when with_attachments==True
                pdf_with_text_layer: type of handler for parsing PDF documents,
                    available options
                    ["true", "false", "tabby", "auto", "auto_tabby" (default)]
                language: language of the document for PDF without a textual layer and
                    images, available options ["eng", "rus", "rus+eng" (default)],
                    the list of languages can be extended, please see
                    https://dedoc.readthedocs.io/en/latest/tutorials/add_new_language.html
                pages: page slice to define the reading range for parsing PDF documents
                is_one_column_document: detect number of columns for PDF without
                    a textual layer and images, available options
                    ["true", "false", "auto" (default)]
                document_orientation: fix document orientation (90, 180, 270 degrees)
                    for PDF without a textual layer and images, available options
                    ["auto" (default), "no_change"]
                need_header_footer_analysis: remove headers and footers from the output
                    result for parsing PDF and images
                need_binarization: clean pages background (binarize) for PDF without a
                    textual layer and images
                need_pdf_table_analysis: parse tables for PDF without a textual layer
                    and images
                delimiter: column separator for CSV, TSV files
                encoding: encoding of TXT, CSV, TSV
        )r    r   r   r   r   r   r   r   r   r   r   r   r   r   r   jsonZreturn_formatN)superr6   ry   r3   )r"   r    ry   r   r   r   r   r   r   r   r   r   r   r   r   r   r   	__class__r#   r'   r6     s&    AzDedocAPIFileLoader.__init__r7   c                 c   s0   | j | j| j| jd}| j|| jdE dH  dS )r8   )ry   r    r;   r<   N)
_send_filery   r    r3   rE   r   )r"   Zdoc_treer#   r#   r'   rG   	  s    zDedocAPIFileLoader.lazy_loadc                 C   s   i S )Nr#   rH   r#   r#   r'   r@     s    zDedocAPIFileLoader._make_config)ry   r    r;   r!   c           
      C   s   ddl }tj|}t|d2}d||fi}|j| d||d}W d   n1 sV0    Y  |jdkr~td|j	  t
|j	 }	|	S )	z7Send POST-request to `dedoc` API and return the resultsr   Nrbfilez/upload)filesdata   zError during file handling: )requestsospathbasenameopenpoststatus_coder5   rX   decoderz   loads)
r"   ry   r    r;   r   	file_namer   r   rresultr#   r#   r'   r~     s    4
zDedocAPIFileLoader._send_file)ro   rp   rq   rr   rs   rt   r   ru   r   r6   r   r	   rG   rF   r@   r   listr~   __classcell__r#   r#   r|   r'   rx   j  sP   M



Urx   )rk   rz   r   abcr   r   typingr   r   r   r   r   Zlangchain_core.documentsr	   Z)langchain_community.document_loaders.baser
   r   rv   rx   r#   r#   r#   r'   <module>   s     M