a
    bg[                     @  s  d Z ddlmZ ddlZddlZddlZddlZddlZddlmZ ddl	m
Z
 ddlmZ ddlmZmZmZmZmZmZmZmZmZmZmZ ddlmZ ddlZddlZdd	lmZ dd
lm Z  ddl!m"Z" ddl#m$Z$m%Z% erddl&Z&ddl'Z'ddl(Z(ddl)Z)ddl*m+Z+ g dZ,g dZ-dddddZ.e/e0Z1dZ2dZ3dZ4dZ5h dZ6dddddddZ7ddddd Z8dddd!d"Z9d#d$gZ:d%ddd&d'd(Z;G d)d* d*e Z<G d+d, d,e Z=G d-d. d.e Z>G d/d0 d0e Z?G d1d2 d2e Z@G d3d4 d4e ZAG d5d6 d6e ZBdS )7z(Module contains common parsers for PDFs.    )annotationsN)datetime)Path)TemporaryDirectory)TYPE_CHECKINGAnyBinaryIOIterableIteratorLiteralMappingOptionalSequenceUnioncast)urlparse)Document)BaseBlobParser)Blob)BaseImageBlobParserRapidOCRBlobParser)TextLinearizationConfig)Z	DCTDecodeZDCTZ	JPXDecode)Z	LZWDecodeZLZWZFlateDecodeZFlZASCII85DecodeZA85ZASCIIHexDecodeZAHxZRunLengthDecodeZRLZCCITTFaxDecodeZCCFZJBIG2Decodez,Sequence[Union[Iterable[np.ndarray], bytes]]str)imagesreturnc                 C  sp   zddl m} W n ty*   tdY n0 | }d}| D ]0}||\}}|r:dd |D }|d|7 }q:|S )zExtract text from images with RapidOCR.

    Args:
        images: Images to extract text from.

    Returns:
        Text extracted from images.

    Raises:
        ImportError: If `rapidocr-onnxruntime` package is not installed.
    r   )RapidOCRzc`rapidocr-onnxruntime` package not found, please install it with `pip install rapidocr-onnxruntime` c                 S  s   g | ]}|d  qS )    ).0textr   r   ~/var/www/html/cobodadashboardai.evdpl.com/venv/lib/python3.9/site-packages/langchain_community/document_loaders/parsers/pdf.py
<listcomp>Z       z5extract_from_images_with_rapidocr.<locals>.<listcomp>
)Zrapidocr_onnxruntimer   ImportErrorjoin)r   r   Zocrr    imgresult_r   r   r!   !extract_from_images_with_rapidocr@   s    
r*   z

{image_text}

r$   z
>   producertotal_pagescreatorsourcecreationdater   )blobcontentformatr   c                 C  s^   |rZ| j pd}|dkr6|dd}d| d| d}n$|dkrZd	tj|d
d d| d}|S )a  Format the content of the image with the source of the blob.

    blob: The blob containing the image.
    format::
      The format for the parsed output.
      - "text" = return the content as is
      - "markdown-img" = wrap the content into an image markdown link, w/ link
      pointing to (`![body)(#)`]
      - "html-img" = wrap the content as the `alt` text of an tag and link to
      (`<img alt="{body}" src="#"/>`)
    #zmarkdown-img]z\\]z![z]()zhtml-imgz
<img alt="T)quotez src="z" />)r.   replacehtmlescape)r0   r1   r2   r.   r   r   r!   _format_inner_imagei   s    
r:   dict[str, Any])metadatar   c                 C  s4   t |  stdt| ddts0td| S )zValidate that the metadata has all the standard keys and the page is an integer.

    The standard keys are:
    - source
    - total_page
    - creationdate
    - creator
    - producer

    Validate that page is an integer if it is present.
    z3The PDF parser must valorize the standard metadata.pager   z(The PDF metadata page must be a integer.)_STD_METADATA_KEYSissubsetkeys
ValueError
isinstancegetint)r<   r   r   r!   _validate_metadata   s
    rE   c              	   C  s   i }ddd}|   D ]\}}t|ttfvr6t|}|drL|dd }| }|dv rz"t|dd	d
	d||< W q t
y   |||< Y q0 q||v r|||| < |||< qt|tr| ||< qt|tr|||< q|S )zPurge metadata from unwanted keys and normalize key names.

    Args:
        metadata: The original metadata dictionary.

    Returns:
        The cleaned and normalized the key format of metadata dictionary.
    r,   r.   )Z
page_count	file_path/r   N)r/   Zmoddate'r   zD:%Y%m%d%H%M%S%zT)itemstyper   rD   
startswithlowerr   strptimer7   	isoformatrA   rB   strip)r<   Znew_metadataZmap_keykvr   r   r!   _purge_metadata   s6    	




rS   z




	list[str])extrastext_from_pager   c                   s\   ddddd fdd  | |d}|sXd	}d
 tdd | }|rPtd | }|| }|S )a5  Insert extras such as image/table in a text between two paragraphs if possible,
    else at the end of the text.

    Args:
        extras: List of extra content (images/tables) to insert.
        text_from_page: The text content from the page.

    Returns:
        The merged text with extras inserted.
    rU   r   boolOptional[str])rV   rW   recursr   c           	        s   | rt D ]}||}|dkrd }|r: | |d | d}|rP|||d   }n@d}dtdd | }|rt|| }|d | | ||d   } qqd }n|}|S )NFr   rT   c                 S  s   | S Nr   xr   r   r!   <lambda>   r#   zO_merge_text_and_extras.<locals>._recurs_merge_text_and_extras.<locals>.<lambda>)_PARAGRAPH_DELIMITERrfindr&   filter)	rV   rW   rZ   delimposZprevious_textall_text
all_extras
str_extras_recurs_merge_text_and_extrasr   r!   ri      s*    
z=_merge_text_and_extras.<locals>._recurs_merge_text_and_extrasTr   rT   c                 S  s   | S r\   r   r]   r   r   r!   r_      r#   z(_merge_text_and_extras.<locals>.<lambda>r[   )r&   rb   r`   )rV   rW   re   rf   rg   r   rh   r!   _merge_text_and_extras   s    rj   c                      sf   e Zd ZdZddeddddddd	d
dddddd fddZdddddZdddddZ  ZS )PyPDFParsera  Parse a blob from a PDF using `pypdf` library.

    This class provides methods to parse a blob from a PDF document, supporting various
    configurations such as handling password-protected PDFs, extracting images.
    It integrates the 'pypdf' library for PDF processing and offers synchronous blob
    parsing.

    Examples:
        Setup:

        .. code-block:: bash

            pip install -U langchain-community pypdf

        Load a blob from a PDF file:

        .. code-block:: python

            from langchain_core.documents.base import Blob

            blob = Blob.from_path("./example_data/layout-parser-paper.pdf")

        Instantiate the parser:

        .. code-block:: python

            from langchain_community.document_loaders.parsers import PyPDFParser

            parser = PyPDFParser(
                # password = None,
                mode = "single",
                pages_delimiter = "
",
                # images_parser = TesseractBlobParser(),
            )

        Lazily parse the blob:

        .. code-block:: python

            docs = []
            docs_lazy = parser.lazy_parse(blob)

            for doc in docs_lazy:
                docs.append(doc)
            print(docs[0].page_content[:100])
            print(docs[0].metadata)
    NFr=   r    plain)modepages_delimiterimages_parserimages_inner_formatextraction_modeextraction_kwargszOptional[Union[str, bytes]]rX   Literal['single', 'page']r   Optional[BaseImageBlobParser]+Literal['text', 'markdown-img', 'html-img']zLiteral['plain', 'layout']Optional[dict[str, Any]])passwordextract_imagesrm   rn   ro   rp   rq   rr   c          	        s`   t    |dvrtd|| _|r.|s.t }|| _|| _|| _|| _|| _	|| _
|pXi | _dS )u  Initialize a parser based on PyPDF.

        Args:
            password: Optional password for opening encrypted PDFs.
            extract_images: Whether to extract images from the PDF.
            mode: The extraction mode, either "single" for the entire document or "page"
                for page-wise extraction.
            pages_delimiter: A string delimiter to separate pages in single-mode
                extraction.
            images_parser: Optional image blob parser.
            images_inner_format: The format for the parsed output.
                - "text" = return the content as is
                - "markdown-img" = wrap the content into an image markdown link, w/ link
                pointing to (`![body)(#)`]
                - "html-img" = wrap the content as the `alt` text of an tag and link to
                (`<img alt="{body}" src="#"/>`)
            extraction_mode: “plain” for legacy functionality, “layout” extract text
                in a fixed width format that closely adheres to the rendered layout in
                the source pdf.
            extraction_kwargs: Optional additional parameters for the extraction
                process.

        Raises:
            ValueError: If the `mode` is not "single" or "page".
        singler=   mode must be single or pageN)super__init__rA   rx   r   ro   rp   rw   rm   rn   rq   rr   )	selfrw   rx   rm   rn   ro   rp   rq   rr   	__class__r   r!   r}   &  s    %
zPyPDFParser.__init__r   Iterator[Document]r0   r   c              	   #  sH  zddl  W n ty&   tdY n0 ddd fdd}| } j|jd	}td
d
ddtt|jppi B |j	t
|jdB }g }t|jD ]d\}}||d}	|}
t|
g|	 }jdkrt|t|||j| dB dV  q|| qjdkr$tj|t|dV  W d   n1 s:0    Y  dS )am  
        Lazily parse the blob.
        Insert image, if possible, between two paragraphs.
        In this way, a paragraph can be continued on the next page.

        Args:
            blob: The blob to parse.

        Raises:
            ImportError: If the `pypdf` package is not found.

        Yield:
            An iterator over the parsed documents.
        r   NzCpypdf package not found, please install it with `pip install pypdf`zpypdf.PageObjectr   r=   r   c                   s0    j dr|  S | jf djijS dS )z
            Extract text from image given the version of pypdf.

            Args:
                page: The page object to extract text from.

            Returns:
                str: The extracted text.
            3rq   N)__version__rL   extract_textrq   rr   r=   pypdfr~   r   r!   _extract_text_from_pageo  s    
z7PyPDFParser.lazy_parse.<locals>._extract_text_from_pagerw   ZPyPDFr   )r+   r-   r/   )r.   r,   r   r=   )r=   Z
page_labelpage_contentr<   rz   )r   r%   as_bytes_ioZ	PdfReaderrw   rS   r   dictr<   r.   lenpages	enumerateextract_images_from_pagerj   rP   rm   r   rE   Zpage_labelsappendrn   r&   )r~   r0   r   pdf_file_obj
pdf_readerdoc_metadataZsingle_textspage_numberr=   rW   images_from_pagere   r   r   r!   
lazy_parseY  sT    







zPyPDFParser.lazy_parsezpypdf._page.PageObjectr   c              	   C  sp  | j s
dS ddlm} dtt|d  vr0dS |d d  }g }|D ]}d}|| d dkrH|| d	 d
d tv r|| d || d  }}tj	|| 
 tjd||d}nB|| d	 d
d tv rt|t|| 
 }n
td |durHt }	||j|	dd tj|	 dd}
t| j |
j}|t|
|| j qHtj t!"t#d|dS )zExtract images from a PDF page and get the text using images_to_text.

        Args:
            page: The page object from which to extract images.

        Returns:
            str: The extracted text from the images on the page.
        r   r   Imagez/XObjectz
/ResourcesNz/Subtypez/Imagez/Filterr   z/Heightz/WidthZdtyper[   Unknown PDF Filter!ZPNG)r2   z	image/pngZ	mime_type
image_text)$ro   PILr   r   r   r@   Z
get_object_PDF_FILTER_WITHOUT_LOSSnp
frombufferget_datauint8reshape_PDF_FILTER_WITH_LOSSarrayopenioBytesIOloggerwarningZ	fromarraysaver   	from_datagetvaluenextr   r   r   r:   rp   _FORMAT_IMAGE_STRr2   _JOIN_IMAGESr&   rb   )r~   r=   r   ZxObjectr   objZnp_imageheightwidthimage_bytesr0   r   r   r   r!   r     s>    	
 
z$PyPDFParser.extract_images_from_page)NF)	__name__
__module____qualname____doc___DEFAULT_PAGES_DELIMITERr}   r   r   __classcell__r   r   r   r!   rk      s   2  $3Mrk   c                
      s   e Zd ZdZdZd%ddeddddddd	d
dddd fddZedd
dddZedddddZ	d&dd
dddddZ
d d!d"d#d$Z  ZS )'PDFMinerParsera  Parse a blob from a PDF using `pdfminer.six` library.

    This class provides methods to parse a blob from a PDF document, supporting various
    configurations such as handling password-protected PDFs, extracting images, and
    defining extraction mode.
    It integrates the 'pdfminer.six' library for PDF processing and offers synchronous
    blob parsing.

    Examples:
        Setup:

        .. code-block:: bash

            pip install -U langchain-community pdfminer.six pillow

        Load a blob from a PDF file:

        .. code-block:: python

            from langchain_core.documents.base import Blob

            blob = Blob.from_path("./example_data/layout-parser-paper.pdf")

        Instantiate the parser:

        .. code-block:: python

            from langchain_community.document_loaders.parsers import PDFMinerParser

            parser = PDFMinerParser(
                # password = None,
                mode = "single",
                pages_delimiter = "
",
                # extract_images = True,
                # images_to_text = convert_images_to_text_with_tesseract(),
            )

        Lazily parse the blob:

        .. code-block:: python

            docs = []
            docs_lazy = parser.lazy_parse(blob)

            for doc in docs_lazy:
                docs.append(doc)
            print(docs[0].page_content[:100])
            print(docs[0].metadata)
    FNrz   r    )rw   rm   rn   ro   rp   concatenate_pagesrX   rY   rs   r   rt   ru   zOptional[bool])rx   rw   rm   rn   ro   rp   r   c                  s|   t    |dvrtd|r(|s(t }|| _|| _|| _|| _|| _|| _	|durxt
jsjdt
_td |rrdnd| _dS )aH  Initialize a parser based on PDFMiner.

        Args:
            password: Optional password for opening encrypted PDFs.
            mode: Extraction mode to use. Either "single" or "page" for page-wise
                extraction.
            pages_delimiter: A string delimiter to separate pages in single-mode
                extraction.
            extract_images: Whether to extract images from PDF.
            images_inner_format: The format for the parsed output.
                - "text" = return the content as is
                - "markdown-img" = wrap the content into an image markdown link, w/ link
                pointing to (`![body)(#)`]
                - "html-img" = wrap the content as the `alt` text of an tag and link to
                (`<img alt="{body}" src="#"/>`)
            concatenate_pages: Deprecated. If True, concatenate all PDF pages
                into one a single document. Otherwise, return one document per page.

        Returns:
            This method does not directly return data. Use the `parse` or `lazy_parse`
            methods to retrieve parsed documents with content and metadata.

        Raises:
            ValueError: If the `mode` is not "single" or "page".

        Warnings:
            `concatenate_pages` parameter is deprecated. Use `mode='single' or 'page'
            instead.
        ry   r{   NTzS`concatenate_pages` parameter is deprecated. Use `mode='single' or 'page'` instead.rz   r=   )r|   r}   rA   r   rx   ro   rp   rw   rm   rn   r   _warn_concatenate_pagesr   r   )r~   rx   rw   rm   rn   ro   rp   r   r   r   r!   r}     s$    (
zPDFMinerParser.__init__zUnion[bytes, str])sr   c                   s|   ddl m  t| tr4| dr4t| dd ddS z(dd	 | D }d
 fdd	|D W S  tyv   t|  Y S 0 dS )z
        Decodes a PDFDocEncoding string to Unicode.
        Adds py3 compatibility to pdfminer's version.

        Args:
            s: The string to decode.

        Returns:
            str: The decoded Unicode string.
        r   PDFDocEncodings      Nzutf-16beignorec                 s  s$   | ]}t |trt|n|V  qd S r\   )rB   r   ord)r   cr   r   r!   	<genexpr>U  r#   z-PDFMinerParser.decode_text.<locals>.<genexpr>r   c                 3  s   | ]} | V  qd S r\   r   )r   or   r   r!   r   V  r#   )Zpdfminer.utilsr   rB   bytesrL   r   r&   
IndexError)r   Zordsr   r   r!   decode_textD  s    zPDFMinerParser.decode_textr   )r   r   c                 C  s   ddl m} t| dr|  } t| tr8tttj| S t| |rNt	| j
S t| ttfrft	| S t| tr|  D ]\}}t|| |< qx| S | S )z
        Recursively resolve the metadata values.

        Args:
            obj: The object to resolve and decode. It can be of any type.

        Returns:
            The resolved and decoded object.
        r   )	PSLiteralresolve)Zpdfminer.psparserr   hasattrr   rB   listmapr   resolve_and_decoder   namer   r   r   rJ   )r   r   rQ   rR   r   r   r!   r   Z  s    




z!PDFMinerParser.resolve_and_decoder   Tr   r;   )fprw   cachingr   c                 C  s   ddl m}m}m} ||}||||d}i }	|jD ]}
|	|
 q4|	 D ]R\}}zt||	|< W qL t	y } zt
d|t| W Y d}~qLd}~0 0 qLtt|||	d< |	S )ag  
        Extract metadata from a PDF file.

        Args:
            fp: The file pointer to the PDF file.
            password: The password for the PDF file, if encrypted. Defaults to an empty
                string.
            caching: Whether to cache the PDF structure. Defaults to True.

        Returns:
            Metadata of the PDF file.
        r   )PDFDocumentPDFPage	PDFParser)rw   r   zD[WARNING] Metadata key "%s" could not be parsed due to exception: %sNr,   )pdfminer.pdfpager   r   r   infoupdaterJ   r   r   	Exceptionr   r   r   r   r   Zcreate_pages)r~   r   rw   r   r   r   r   parserdocr<   r   rQ   rR   er   r   r!   _get_metadatav  s"    
zPDFMinerParser._get_metadatar   r   r   c                 #  s2  znddl }ddlm} ddlm mmmmm	m
 ddlm}m ddlm} t|jdk rltdW n ty   td	Y n0 | }t ^	|j|jpd
d} }tj|jpd
d}	|j|	d< G  	
fddd|}
t 
|||
|  d}g }t|D ]\}}
d 
d || 
  }|! }j"dkr
d 
d t#|t$|	d|iB dV  n"|%dr|dd }|&| q,j"dkrj'(|}t#|t$|	dV  W d   n1 s0    Y  W d   n1 s$0    Y  dS )a  
        Lazily parse the blob.
        Insert image, if possible, between two paragraphs.
        In this way, a paragraph can be continued on the next page.

        Args:
            blob: The blob to parse.

        Raises:
            ImportError: If the `pdfminer.six` or `pillow` package is not found.

        Yield:
            An iterator over the parsed documents.
        r   N)PDFLayoutAnalyzer)LAParamsLTContainerLTImageLTItemLTPageLTText	LTTextBox)PDFPageInterpreterPDFResourceManager)r   i:>4zThis parser is tested with pdfminer.six version 20201018 or later. Remove pdfminer, and install pdfminer.six with `pip uninstall pdfminer && pip install pdfminer.six`.zMpdfminer package not found, please install it with `pip install pdfminer.six`r   r   r.   c                	      sL   e Zd Zdddddd fdd	Zd
dd	
fddZ  ZS )z*PDFMinerParser.lazy_parse.<locals>.Visitorr   Nr   rD   zOptional[LAParams]None)rsrcmgrpagenolaparamsr   c                   s   t  j|||d d S )N)r   r   )r|   r}   )r~   r   r   r   r   r   r!   r}     s    z3PDFMinerParser.lazy_parse.<locals>.Visitor.__init__r   )ltpager   c              	     s.   ddd fdd  | d S )Nr   r   )itemr   c                   s   t |  r| D ]}| qnt | r6|   t | rLd nrt | rjrddlm} |}|| }tt	| }d|j
d< tj|j}t||j n d S )Nr$   r   )ImageWriterr3   r.   )rB   writeget_textro   Zpdfminer.imager   Zexport_imager   	from_pathr   r<   r   r   r   r:   rp   )r   childr   Zimage_writerfilenamer0   r   )r   r   r   r   renderr~   tempdirtext_ior   r!   r     s.    






zIPDFMinerParser.lazy_parse.<locals>.Visitor.receive_layout.<locals>.renderr   )mer   )r   r   r   r   r   r~   r   r   )r   r!   receive_layout  s    "z9PDFMinerParser.lazy_parse.<locals>.Visitor.receive_layout)r   N)r   r   r   r}   r   r   r   r   r   r   r   r   r   r   r   r~   r   r   r   r!   Visitor  s     r   )r   r=   r   r[   rz   ))pdfminerZpdfminer.converterr   Zpdfminer.layoutr   r   r   r   r   r   r   Zpdfminer.pdfinterpr   r   r   r   rD   r   r%   r   r   Z	get_pagesrw   rS   r   r.   r   StringIOr   truncateseekZprocess_pager   rP   rm   r   rE   endswithr   rn   r&   )r~   r0   r  r   r   r   r   r   r   r   r   Zvisitor_for_allZall_contentir=   re   Zdocument_contentr   r   r!   r     sb    $	

((





zPDFMinerParser.lazy_parse)F)r   T)r   r   r   r   r   r   r}   staticmethodr   r   r   r   r   r   r   r   r!   r     s&   2 "<  .r   c                      s   e Zd ZdZe Zd+ddedddddddd	d
ddddddd
 fddZdddddZ	d,ddddddZ
ddddddd Zddd!d"d#d$Zdddd%d&d'Zddd(d)d*Z  ZS )-PyMuPDFParsera  Parse a blob from a PDF using `PyMuPDF` library.

    This class provides methods to parse a blob from a PDF document, supporting various
    configurations such as handling password-protected PDFs, extracting images, and
    defining extraction mode.
    It integrates the 'PyMuPDF' library for PDF processing and offers synchronous blob
    parsing.

    Examples:
        Setup:

        .. code-block:: bash

            pip install -U langchain-community pymupdf

        Load a blob from a PDF file:

        .. code-block:: python

            from langchain_core.documents.base import Blob

            blob = Blob.from_path("./example_data/layout-parser-paper.pdf")

        Instantiate the parser:

        .. code-block:: python

            from langchain_community.document_loaders.parsers import PyMuPDFParser

            parser = PyMuPDFParser(
                # password = None,
                mode = "single",
                pages_delimiter = "
",
                # images_parser = TesseractBlobParser(),
                # extract_tables="markdown",
                # extract_tables_settings=None,
                # text_kwargs=None,
            )

        Lazily parse the blob:

        .. code-block:: python

            docs = []
            docs_lazy = parser.lazy_parse(blob)

            for doc in docs_lazy:
                docs.append(doc)
            print(docs[0].page_content[:100])
            print(docs[0].metadata)
    NFr=   r    )rw   rm   rn   ro   rp   extract_tablesextract_tables_settingsrv   rX   rY   rs   r   rt   ru   z/Union[Literal['csv', 'markdown', 'html'], None]r   )
text_kwargsrx   rw   rm   rn   ro   rp   r	  r
  r   c          
        sz   t    |dvrtd|r.|dvr.td|| _|| _|| _|pFi | _|rX|sXt }|| _|| _	|| _
|| _|	| _dS )a  Initialize a parser based on PyMuPDF.

        Args:
            password: Optional password for opening encrypted PDFs.
            mode: The extraction mode, either "single" for the entire document or "page"
                for page-wise extraction.
            pages_delimiter: A string delimiter to separate pages in single-mode
                extraction.
            extract_images: Whether to extract images from the PDF.
            images_parser: Optional image blob parser.
            images_inner_format: The format for the parsed output.
                - "text" = return the content as is
                - "markdown-img" = wrap the content into an image markdown link, w/ link
                pointing to (`![body)(#)`]
                - "html-img" = wrap the content as the `alt` text of an tag and link to
                (`<img alt="{body}" src="#"/>`)
            extract_tables: Whether to extract tables in a specific format, such as
                "csv", "markdown", or "html".
            extract_tables_settings: Optional dictionary of settings for customizing
                table extraction.

        Returns:
            This method does not directly return data. Use the `parse` or `lazy_parse`
            methods to retrieve parsed documents with content and metadata.

        Raises:
            ValueError: If the mode is not "single" or "page".
            ValueError: If the extract_tables format is not "markdown", "html",
            or "csv".
        ry   r{   )markdownr8   csvzmode must be markdownN)r|   r}   rA   rm   rn   rw   r  r   rx   rp   ro   r	  r
  )
r~   r  rx   rw   rm   rn   ro   rp   r	  r
  r   r   r!   r}   X  s     +

zPyMuPDFParser.__init__r   r   r   c                 C  s
   |  |S r\   )_lazy_parse)r~   r0   r   r   r!   r     s    zPyMuPDFParser.lazy_parse)r0   r  r   c                 c  s  zhddl }|p| j}| jsfddlm}m}m}m} ddddd|dd|ddd||ddddddddd| _W n ty   tdY n0 t	j
 | }|jdu r||}	n|j|dd	}	|	jr|	| j | |	|}
g }|	D ]J}| |	|| }| jd
kr t|t|
d
|jiB dV  q|| q| jdkrRt| j|t|
dV  W d   n1 sh0    Y  W d   n1 s0    Y  dS )a  Lazily parse the blob.
        Insert image, if possible, between two paragraphs.
        In this way, a paragraph can be continued on the next page.

        Args:
            blob: The blob to parse.
            text_kwargs: Optional keyword arguments to pass to the `get_text` method.
                If provided at run time, it will override the default text_kwargs.

        Raises:
            ImportError: If the `pypdf` package is not found.

        Yield:
            An iterator over the parsed documents.
        r   N)DEFAULT_JOIN_TOLERANCEDEFAULT_MIN_WORDS_HORIZONTALDEFAULT_MIN_WORDS_VERTICALDEFAULT_SNAP_TOLERANCElines   )ZclipZvertical_strategyZhorizontal_strategyZvertical_linesZhorizontal_linesZsnap_toleranceZsnap_x_toleranceZsnap_y_toleranceZjoin_toleranceZjoin_x_toleranceZjoin_y_toleranceZedge_min_lengthZmin_words_verticalZmin_words_horizontalZintersection_toleranceZintersection_x_toleranceZintersection_y_toleranceZtext_toleranceZtext_x_toleranceZtext_y_toleranceZstrategyZ	add_lineszGpymupdf package not found, please install it with `pip install pymupdf`Zpdf)streamfiletyper=   r   rz   )pymupdfr  r
  Zpymupdf.tabler  r  r  r  r%   r  _lockr   datar   is_encryptedZauthenticaterw   _extract_metadata_get_page_contentrP   rm   r   rE   numberr   rn   r&   )r~   r0   r  r  r  r  r  r  rF   r   r   Zfull_contentr=   re   r   r   r!   r    sp    
	




zPyMuPDFParser._lazy_parsezpymupdf.Documentzpymupdf.Pager;   )r   r=   r  r   c           	      C  s^   |j f i i | j|}| ||}| |}g }|rB|| |rP|| t||}|S )a:  Get the text of the page using PyMuPDF and RapidOCR and issue a warning
        if it is empty.

        Args:
            doc: The PyMuPDF document object.
            page: The PyMuPDF page object.
            blob: The blob being parsed.

        Returns:
            str: The text content of the page.
        )r   r  _extract_images_from_page_extract_tables_from_pager   rj   )	r~   r   r=   r  rW   r   Ztables_from_pagerV   re   r   r   r!   r    s    



zPyMuPDFParser._get_page_contentr   )r   r0   r   c              	     s\   t i ddd|j|jt d fdd jD }dD ]}| jv r: j| ||< q:|S )zExtract metadata from the document and page.

        Args:
            doc: The PyMuPDF document object.
            blob: The blob being parsed.

        Returns:
            dict: The extracted metadata.
        ZPyMuPDFr   )r+   r-   r/   r.   rF   r,   c                   s,   i | ]$}t  j| ttfr| j| qS r   )rB   r<   r   rD   r   rQ   r   r   r!   
<dictcomp>(  s   z3PyMuPDFParser._extract_metadata.<locals>.<dictcomp>)ZmodDateZcreationDate)rS   r.   r   r<   )r~   r   r0   r<   rQ   r   r!  r!   r    s&    
	

zPyMuPDFParser._extract_metadata)r   r=   r   c                 C  s   | j s
dS ddl}| }g }|D ]}| j r"|d }|||}tj|jtjd|j	|j
d}	t }
t|
|	 tj|
 dd}t| j |j}|t||| j q"tjttd|dS )	a	  Extract images from a PDF page and get the text using images_to_text.

        Args:
            doc: The PyMuPDF document object.
            page: The PyMuPDF page object.

        Returns:
            str: The extracted text from the images on the page.
        r   r   Nr   r[   zapplication/x-npyr   r   )ro   r  Z
get_imagesZPixmapr   r   Zsamplesr   r   r   r   r   r   numpyr   r   r   r   r   r   r   r   r:   rp   r   r2   r   r&   rb   )r~   r   r=   r  Zimg_listr   r'   ZxrefZpiximager   r0   r   r   r   r!   r  4  s0    
z'PyMuPDFParser._extract_images_from_pager   c                 C  s   | j du rdS ddl}t|jj|fi | j}|r| j dkrRtdd |D S | j dkrptdd |D S | j d	krtd
d |D S td| j  ddS )zExtract tables from a PDF page.

        Args:
            page: The PyMuPDF page object.

        Returns:
            str: The extracted tables in the specified format.
        Nr   r   r  c                 S  s   g | ]}|  qS r   )Zto_markdownr   tabler   r   r!   r"   m  r#   z;PyMuPDFParser._extract_tables_from_page.<locals>.<listcomp>r8   c                 S  s    g | ]}|  jd d d dqS )F)headerindexZ	bold_rows)	to_pandasZto_htmlr%  r   r   r!   r"   p  s   r  c                 S  s   g | ]}|  jd d dqS )F)r'  r(  )r)  Zto_csvr%  r   r   r!   r"   {  s
   zextract_tables z not implemented)	r	  r  r   r&  Zfind_tablesr
  _JOIN_TABLESr&   rA   )r~   r=   r  Ztables_listr   r   r!   r  [  s2    	





z'PyMuPDFParser._extract_tables_from_page)NF)N)r   r   r   r   	threadingLockr  r   r}   r   r  r  r  r  r  r   r   r   r   r!   r    s(   6  (=
 ] 'r  c                   @  sB   e Zd ZdZddddddZdd	d
ddZdddddZdS )PyPDFium2ParserzParse `PDF` with `PyPDFium2`.FrX   r   )rx   r   c                 C  s2   zddl }W n ty&   tdY n0 || _dS )zInitialize the parser.r   NzKpypdfium2 package not found, please install it with `pip install pypdfium2`)	pypdfium2r%   rx   )r~   rx   r.  r   r   r!   r}     s    
zPyPDFium2Parser.__init__r   r   r   c           
   	   c  s   ddl }| }|j|dd}zjt|D ]T\}}| }| }|  |d| | 7 }|  |j|d}	t	||	dV  q*W |  n
|  0 W d   n1 s0    Y  dS )Lazily parse the blob.r   NT)Z	autocloser$   r.   r=   r   )
r.  r   ZPdfDocumentr   Zget_textpageZget_text_rangecloser  r.   r   )
r~   r0   r.  rF   r   r   r=   Z	text_pager1   r<   r   r   r!   r     s    
zPyPDFium2Parser.lazy_parsezpypdfium2._helpers.page.PdfPager   r   c                 C  sD   | j s
dS ddlm} t|j|jfd}ttdd |}t|S )8Extract images from page and get the text with RapidOCR.r   r   N)rb   c                 S  s   |    S r\   )Z
get_bitmapZto_numpyr]   r   r   r!   r_     r#   z;PyPDFium2Parser._extract_images_from_page.<locals>.<lambda>)rx   Zpypdfium2.rawrawr   Zget_objectsZFPDF_PAGEOBJ_IMAGEr   r*   )r~   r=   Zpdfium_cr   r   r   r!   r    s    z)PyPDFium2Parser._extract_images_from_pageN)F)r   r   r   r   r}   r   r  r   r   r   r!   r-    s   r-  c                   @  sV   e Zd ZdZdddddddd	Zd
ddddZdddddZdddddZdS )PDFPlumberParserzParse `PDF` with `PDFPlumber`.NFzOptional[Mapping[str, Any]]rX   r   )r  deduperx   r   c                 C  sB   zddl }W n ty&   tdY n0 |p.i | _|| _|| _dS )zInitialize the parser.

        Args:
            text_kwargs: Keyword arguments to pass to ``pdfplumber.Page.extract_text()``
            dedupe: Avoiding the error of duplicate characters if `dedupe=True`.
        r   NzEpillow package not found, please install it with `pip install pillow`)r   r%   r  r5  rx   )r~   r  r5  rx   r   r   r   r!   r}     s    

zPDFPlumberParser.__init__r   r   r   c                 #  s\   ddl }  8}|| fddjD E dH  W d   n1 sN0    Y  dS )r/  r   Nc              
     sb   g | ]Z}t |d  | t j j|jd tjdfi fddjD dqS )r$   r   )r.   rF   r=   r,   c                   s.   i | ]&}t  j| ttfv r| j| qS r   )rK   r<   r   rD   r   r!  r   r!   r"    s   z:PDFPlumberParser.lazy_parse.<locals>.<listcomp>.<dictcomp>r   )	r   _process_page_contentr  r   r.   r   r   r   r<   )r   r=   r0   r   r~   r   r!   r"     s&   
z/PDFPlumberParser.lazy_parse.<locals>.<listcomp>)
pdfplumberr   r   r   )r~   r0   r8  rF   r   r7  r!   r     s    

zPDFPlumberParser.lazy_parsezpdfplumber.page.Pager   r   c                 C  s.   | j r| jf i | jS |jf i | jS )z)Process the page content based on dedupe.)r5  Zdedupe_charsr   r  )r~   r=   r   r   r!   r6    s    z&PDFPlumberParser._process_page_contentc                 C  s   ddl m} | jsdS g }|jD ]}|d d jtv r|d d dkr|t|	d|d d	 |d d
 f|d 
 d q|tj|d 
 tjd|d d
 |d d	 d q |d d jtv r||d 
  q td q t|S )r2  r   r   r   r  FilterZBitsPerComponentr   1ZWidthZHeightLr   r[   r   )r   r   rx   r   r   r   r   r   r   	frombytesr   convertr   r   r   r   warningswarnr*   )r~   r=   r   r   r'   r   r   r!   r    s6    


z*PDFPlumberParser._extract_images_from_page)NFF)r   r   r   r   r}   r   r6  r  r   r   r   r!   r4    s      r4  c                   @  s<   e Zd ZdZddddddddd	d
ZdddddZdS )AmazonTextractPDFParsera{  Send `PDF` files to `Amazon Textract` and parse them.

    For parsing multi-page PDFs, they have to reside on S3.

    The AmazonTextractPDFLoader calls the
    [Amazon Textract Service](https://aws.amazon.com/textract/)
    to convert PDFs into a Document structure.
    Single and multi-page documents are supported with up to 3000 pages
    and 512 MB of size.

    For the call to be successful an AWS account is required,
    similar to the
    [AWS CLI](https://docs.aws.amazon.com/cli/latest/userguide/cli-chap-configure.html)
    requirements.

    Besides the AWS configuration, it is very similar to the other PDF
    loaders, while also supporting JPEG, PNG and TIFF and non-native
    PDF formats.

    ```python
    from langchain_community.document_loaders import AmazonTextractPDFLoader
    loader=AmazonTextractPDFLoader("example_data/alejandro_rosalez_sample-small.jpeg")
    documents = loader.load()
    ```

    One feature is the linearization of the output.
    When using the features LAYOUT, FORMS or TABLES together with Textract

    ```python
    from langchain_community.document_loaders import AmazonTextractPDFLoader
    # you can mix and match each of the features
    loader=AmazonTextractPDFLoader(
        "example_data/alejandro_rosalez_sample-small.jpeg",
        textract_features=["TABLES", "LAYOUT"])
    documents = loader.load()
    ```

    it will generate output that formats the text in reading order and
    try to output the information in a tabular structure or
    output the key/value pairs with a colon (key: value).
    This helps most LLMs to achieve better accuracy when
    processing these texts.

    N)linearization_configzOptional[Sequence[int]]zOptional[Any]z!Optional[TextLinearizationConfig]r   )textract_featuresclientrA  r   c                  s   ztddl  ddlm  m}  | _|| _|durF fdd|D | _ng | _|dur\|| _n| jjddddd	| _W n t	y   t	d
Y n0 |szddl
}|d| _W q t	y   t	dY q0 n|| _dS )a5  Initializes the parser.

        Args:
            textract_features: Features to be used for extraction, each feature
                               should be passed as an int that conforms to the enum
                               `Textract_Features`, see `amazon-textract-caller` pkg
            client: boto3 textract client
            linearization_config: Config to be used for linearization of the output
                                  should be an instance of TextLinearizationConfig from
                                  the `textractor` pkg
        r   Nc                   s   g | ]}  |qS r   )ZTextract_Features)r   ftcr   r!   r"   a  s   z4AmazonTextractPDFParser.__init__.<locals>.<listcomp>Tz# z## *)Zhide_figure_layoutZtitle_prefixZsection_header_prefixZlist_element_prefixzCould not import amazon-textract-caller or amazon-textract-textractor python package. Please install it with `pip install amazon-textract-caller` & `pip install amazon-textract-textractor`.ZtextractzRCould not import boto3 python package. Please install it with `pip install boto3`.)ZtextractcallerZtextractor.entities.documententitiesdocumentrF  
textractorrB  rA  r   r%   boto3rC  boto3_textract_client)r~   rB  rC  rA  rJ  rK  r   rE  r!   r}   F  s>    


z AmazonTextractPDFParser.__init__r   r   r   c                 c  s   |j rtt|j nd}|rJ|jdkrJ|jrJ| jjt|j | j| jd}n"| jj|	 | j| jj
j| jd}| jj|}t|jD ],\}}t|j| jd|j|d ddV  qdS )	zIterates over the Blob pages and returns an Iterator with a Document
        for each page, like the other parsers If multi-page document, blob.path
        has to be set to the S3 URI and for single page docs
        the blob.data is taken
        NZs3)input_documentfeaturesrL  )rM  rN  Z	call_moderL  )configr   r0  r   )pathr   r   schemenetlocrF  Zcall_textractrB  rL  as_bytesZTextract_Call_ModeZ
FORCE_SYNCrJ  r   r   r   r   r   rA  r.   )r~   r0   Zurl_parse_resultZtextract_response_jsonrI  idxr=   r   r   r!   r     s0    z"AmazonTextractPDFParser.lazy_parse)NN)r   r   r   r   r}   r   r   r   r   r!   r@    s   /  ?r@  c                   @  sB   e Zd ZdZdddddZdddd	d
dZdddddZdS )DocumentIntelligenceParserzjLoads a PDF with Azure Document Intelligence
    (formerly Form Recognizer) and chunks at character level.r   r   )rC  modelc                 C  s   t d || _|| _d S )Na<  langchain_community.document_loaders.parsers.pdf.DocumentIntelligenceParserand langchain_community.document_loaders.pdf.DocumentIntelligenceLoader are deprecated. Please upgrade to langchain_community.document_loaders.DocumentIntelligenceLoader for any file parsing purpose using Azure Document Intelligence service.)r>  r?  rC  rV  )r~   rC  rV  r   r   r!   r}     s
    z#DocumentIntelligenceParser.__init__r   r   )r0   r(   r   c                 c  sB   |j D ]6}ddd |jD }t||j|jdd}|V  qd S )N c                 S  s   g | ]
}|j qS r   )r1   )r   liner   r   r!   r"     r#   z=DocumentIntelligenceParser._generate_docs.<locals>.<listcomp>r0  r   )r   r&   r  r   r.   r   )r~   r0   r(   pr1   dr   r   r!   _generate_docs  s    
z)DocumentIntelligenceParser._generate_docsr   c                 c  sZ   |  >}| j| j|}| }| ||}|E dH  W d   n1 sL0    Y  dS )r/  N)r   rC  Zbegin_analyze_documentrV  r(   r[  )r~   r0   Zfile_objZpollerr(   docsr   r   r!   r     s
    
z%DocumentIntelligenceParser.lazy_parseN)r   r   r   r   r}   r[  r   r   r   r   r!   rU    s   rU  )Cr   
__future__r   r8   r   loggingr+  r>  r   pathlibr   tempfiler   typingr   r   r   r	   r
   r   r   r   r   r   r   urllib.parser   r#  r   Zlangchain_core.documentsr   Z)langchain_community.document_loaders.baser   Z1langchain_community.document_loaders.blob_loadersr   Z3langchain_community.document_loaders.parsers.imagesr   r   r8  r  r   r.  Z)textractor.data.text_linearization_configr   r   r   r*   	getLoggerr   r   r   r   r*  r   r>   r:   rE   rS   r`   rj   rk   r   r  r-  r4  r@  rU  r   r   r   r!   <module>   sh   4
'5 _  N  m/_ 