a
    bg`<                     @   s   d Z ddlZddlZddlZddlmZ ddlmZmZm	Z	m
Z
mZ ddlmZ ddlmZ ddlmZ ddlmZ dd	lmZ dd
lmZ erddlmZ ddlmZ eeZeG dd dZeddddG dd deZ dS )zModule contains a PDF parser based on Document AI from Google Cloud.

You need to install two libraries to use this parser:
pip install google-cloud-documentai
pip install google-cloud-documentai-toolbox
    N)	dataclass)TYPE_CHECKINGIteratorListOptionalSequence)
deprecated)Document)batch_iterate)BaseBlobParser)Blob)get_client_info)	OperationDocumentProcessorServiceClientc                   @   s"   e Zd ZU dZeed< eed< dS )DocAIParsingResultsz/Dataclass to store Document AI parsing results.source_pathparsed_pathN)__name__
__module____qualname____doc__str__annotations__ r   r   /var/www/html/cobodadashboardai.evdpl.com/venv/lib/python3.9/site-packages/langchain_community/document_loaders/parsers/docai.pyr      s   
r   z0.0.32z1.0z&langchain_google_community.DocAIParser)ZsinceZremovalZalternative_importc                	   @   s8  e Zd ZdZddddded ee ee ee dddZeee	 ddd	Z
d$eeee eee  ee	 dddZd%ee ee eeee	 dddZee ee	 dddZee ed dddZed edddZdddd
ddee ee ee eeee ed dd d!Zed ee dd"d#ZdS )&DocAIParserz`Google Cloud Document AI` parser.

    For a detailed explanation of Document AI, refer to the product documentation.
    https://cloud.google.com/document-ai/docs/overview
    N)clientlocationgcs_output_pathprocessor_namer   c          
   
   C   s   t |t |krtdd}|r<t||s<td| d|| _|| _|rT|| _npzddlm} ddl	m
} W n. ty } ztd|W Y d	}~n
d	}~0 0 || d
d}	||	tddd| _d	S )a  Initializes the parser.

        Args:
            client: a DocumentProcessorServiceClient to use
            location: a Google Cloud location where a Document AI processor is located
            gcs_output_path: a path on Google Cloud Storage to store parsing results
            processor_name: full resource name of a Document AI processor or processor
                version

        You should provide either a client or location (and then a client
            would be instantiated).
        zGYou must specify either a client or a location to instantiate a client.z?projects\/[0-9]+\/locations\/[a-z\-0-9]+\/processors\/[a-z0-9]+zProcessor name z has the wrong format. If your prediction endpoint looks like https://us-documentai.googleapis.com/v1/projects/PROJECT_ID/locations/us/processors/PROCESSOR_ID:process, use only projects/PROJECT_ID/locations/us/processors/PROCESSOR_ID part.r   )ClientOptionsr   Zdocumentai package not found, please install it with `pip install google-cloud-documentai`Nz-documentai.googleapis.com)Zapi_endpointzdocument-ai)module)Zclient_optionsZclient_info)bool
ValueErrorre	fullmatch_gcs_output_path_processor_name_clientZgoogle.api_core.client_optionsr!   google.cloud.documentair   ImportErrorr   )
selfr   r   r   r    patternr!   r   excoptionsr   r   r   __init__2   s:    
zDocAIParser.__init__)blobreturnc                 c   s   | j |g| jdE dH  dS )zParses a blob lazily.

        Args:
            blobs: a Blob to parse

        This is a long-running operation. A recommended way is to batch
            documents together and use the `batch_parse()` method.
        r   N)batch_parser(   )r-   r2   r   r   r   
lazy_parsel   s    	zDocAIParser.lazy_parseT)r2   enable_native_pdf_parsing
field_mask
page_ranger3   c              
   #   s  z$ddl m} ddlm}m}m} W n. tyR }	 ztd|	W Y d}	~	n
d}	~	0 0 zddlm  W n. ty }	 ztd|	W Y d}	~	n
d}	~	0 0 |r||dnd}
|r||d	nd}| j	
|j| j|jjjpd
d||
|dd|d fddjjD E dH  dS )a  Parses a blob lazily using online processing.

        Args:
            blob: a blob to parse.
            enable_native_pdf_parsing: enable pdf embedded text extraction
            field_mask: a comma-separated list of which fields to include in the
                Document AI response.
                suggested: "text,pages.pageNumber,pages.layout"
            page_range: list of page numbers to parse. If `None`,
                entire document will be parsed.
        r   
documentai)IndividualPageSelector	OcrConfigProcessOptionsr"   N_text_from_layoutjdocumentai_toolbox package not found, please install it with `pip install google-cloud-documentai-toolbox`r7   )pagesapplication/pdfgcs_uriZ	mime_type)
ocr_configindividual_page_selectorT)nameZgcs_documentprocess_optionsskip_human_reviewr8   c                 3   s0   | ](}t  |jjj|jjd dV  qdS )pagesource)Zpage_contentmetadataN)r	   layoutdocumenttextpage_numberpath).0rM   r@   r2   responser   r   	<genexpr>   s   z-DocAIParser.online_process.<locals>.<genexpr>)google.cloudr;    google.cloud.documentai_v1.typesr<   r=   r>   r,   -google.cloud.documentai_toolbox.wrappers.pager@   r*   Zprocess_documentZProcessRequestr)   GcsDocumentrT   mimetyperQ   rC   )r-   r2   r7   r8   r9   r;   r<   r=   r>   r/   rG   rH   r   rV   r   online_processw   sR    zDocAIParser.online_process  <   )blobsr   timeout_seccheck_in_interval_secr3   c           
      c   s   |p| j }|std| j||d}dd |D }td| d}| |rt| ||7 }||krvtd| dtd	 qB| j	|d
}	| 
|	E dH  dS )a  Parses a list of blobs lazily.

        Args:
            blobs: a list of blobs to parse.
            gcs_output_path: a path on Google Cloud Storage to store parsing results.
            timeout_sec: a timeout to wait for Document AI to complete, in seconds.
            check_in_interval_sec: an interval to wait until next check
                whether parsing operations have been completed, in seconds
        This is a long-running operation. A recommended way is to decouple
            parsing from creating LangChain Documents:
            >>> operations = parser.docai_parse(blobs, gcs_path)
            >>> parser.is_running(operations)
            You can get operations names and save them:
            >>> names = [op.operation.name for op in operations]
            And when all operations are finished, you can use their results:
            >>> operations = parser.operations_from_names(operation_names)
            >>> results = parser.get_results(operations)
            >>> docs = parser.parse_from_results(results)
        :An output path on Google Cloud Storage should be provided.r4   c                 S   s   g | ]}|j jqS r   )Z	operationrI   rU   opr   r   r   
<listcomp>       z+DocAIParser.batch_parse.<locals>.<listcomp>z9Started parsing with Document AI, submitted operations %sr   z#Timeout exceeded! Check operations z later!.)
operationsN)r(   r%   docai_parseloggerdebug
is_runningtimesleepTimeoutErrorget_resultsparse_from_results)
r-   ra   r   rb   rc   output_pathrj   operation_namesZtime_elapsedresultsr   r   r   r5      s*    



zDocAIParser.batch_parse)rv   r3   c              
   #   s   z(ddl m} ddlm} ddlm  W n. tyV } ztd|W Y d }~n
d }~0 0 |D ]6|j\}}|||} fdd|D E d H  q\d S )Nr   )split_gcs_uri)_get_shardsr?   rA   c                 3   s:   | ]2}|j D ]&}t |j|j|jjd dV  qqdS rL   )rC   r	   rP   rR   rS   r   )rU   ZshardrM   r@   resultr   r   rX     s   
z1DocAIParser.parse_from_results.<locals>.<genexpr>)Z7google.cloud.documentai_toolbox.utilities.gcs_utilitiesrw   Z1google.cloud.documentai_toolbox.wrappers.documentrx   r[   r@   r,   r   )r-   rv   rw   rx   r/   Zgcs_bucket_nameZ
gcs_prefixZshardsr   ry   r   rs      s     
zDocAIParser.parse_from_resultsr   )ru   r3   c              
      sT   zddl m  W n. ty> } ztd|W Y d}~n
d}~0 0  fdd|D S )z5Initializes Long-Running Operations from their names.r   )GetOperationRequestzhlong running operations package not found, please install it with `pip install gapic-google-longrunning`Nc                    s    g | ]}j j |d dqS ))rI   )request)r*   Zget_operation)rU   rI   r{   r-   r   r   rg     s   z5DocAIParser.operations_from_names.<locals>.<listcomp>)Z!google.longrunning.operations_pb2r{   r,   )r-   ru   r/   r   r}   r   operations_from_names
  s    z!DocAIParser.operations_from_names)rj   r3   c                 C   s   t dd |D S )Nc                 s   s   | ]}|   V  qd S )N)donere   r   r   r   rX     rh   z)DocAIParser.is_running.<locals>.<genexpr>)any)r-   rj   r   r   r   rn     s    zDocAIParser.is_runningi  )r   r    
batch_sizer7   r8   )ra   r   r    r   r7   r8   r3   c                   s  z ddl m  ddlm}m} W n. tyN }	 ztd|	W Y d}	~	n
d}	~	0 0 |pX| j}
|
du rjtd|pr| j}|du rtdg }t	||dD ]x} j
 j fd	d
|D dd} j jj|
|dd}|r|||ddnd}|| j j||||dd q|S )a3  Runs Google Document AI PDF Batch Processing on a list of blobs.

        Args:
            blobs: a list of blobs to be parsed
            gcs_output_path: a path (folder) on GCS to store results
            processor_name: name of a Document AI processor.
            batch_size: amount of documents per batch
            enable_native_pdf_parsing: a config option for the parser
            field_mask: a comma-separated list of which fields to include in the
                Document AI response.
                suggested: "text,pages.pageNumber,pages.layout"

        Document AI has a 1000 file limit per batch, so batches larger than that need
        to be split into multiple requests.
        Batch processing is an async long-running operation
        and results are stored in a output GCS bucket.
        r   r:   )r=   r>   r"   Nrd   z0A Document AI processor name should be provided.)sizeiterablec                    s"   g | ]} j |j|jpd dqS )rD   rE   )r\   rT   r]   )rU   r2   r:   r   r   rg   O  s
   z+DocAIParser.docai_parse.<locals>.<listcomp>)Z	documents)Zgcs_documents)rF   r8   )Zgcs_output_configrB   )rG   T)rI   Zinput_documentsZdocument_output_configrJ   rK   )rY   r;   rZ   r=   r>   r,   r(   r%   r)   r
   ZBatchDocumentsInputConfigZGcsDocumentsZDocumentOutputConfigZGcsOutputConfigappendr*   Zbatch_process_documentsZBatchProcessRequest)r-   ra   r   r    r   r7   r8   r=   r>   r/   rt   rj   batchZinput_configZoutput_configrJ   r   r:   r   rk     sf    


	zDocAIParser.docai_parsec              
      sR   zddl m  W n. ty> } ztd|W Y d }~n
d }~0 0  fdd|D S )Nr   BatchProcessMetadatar"   c                    sF   g | ]>}t |j r|jjn |jjjD ]}t|j|jd q,qS ))r   r   )
isinstancerO   Zindividual_process_statusesZdeserializevaluer   Zinput_gcs_sourceZoutput_gcs_destination)rU   rf   statusr   r   r   rg   ~  s   

z+DocAIParser.get_results.<locals>.<listcomp>)Zgoogle.cloud.documentai_v1r   r,   )r-   rj   r/   r   r   r   rr   u  s    
zDocAIParser.get_results)TNN)Nr_   r`   )r   r   r   r   r   r   r1   r   r   r	   r6   r$   r   intr^   r   r5   r   rs   r~   rn   rk   rr   r   r   r   r   r   &   sh   	:   
K   2Wr   )!r   loggingr&   ro   dataclassesr   typingr   r   r   r   r   Zlangchain_core._api.deprecationr   Zlangchain_core.documentsr	   Zlangchain_core.utils.iterr
   Z)langchain_community.document_loaders.baser   Z1langchain_community.document_loaders.blob_loadersr   Z&langchain_community.utilities.vertexair   Zgoogle.api_core.operationr   r+   r   	getLoggerr   rl   r   r   r   r   r   r   <module>   s.   
