a
    bŠÝg`<  ã                   @   sæ   d Z ddlZddlZddlZddlmZ ddlmZmZm	Z	m
Z
mZ ddlmZ ddlmZ ddlmZ ddlmZ dd	lmZ dd
lmZ er¨ddlmZ ddlmZ e e¡ZeG dd„ dƒƒZeddddG dd„ deƒƒZ dS )zÌModule contains a PDF parser based on Document AI from Google Cloud.

You need to install two libraries to use this parser:
pip install google-cloud-documentai
pip install google-cloud-documentai-toolbox
é    N)Ú	dataclass)ÚTYPE_CHECKINGÚIteratorÚListÚOptionalÚSequence)Ú
deprecated)ÚDocument)Úbatch_iterate)ÚBaseBlobParser)ÚBlob)Úget_client_info)Ú	Operation©ÚDocumentProcessorServiceClientc                   @   s"   e Zd ZU dZeed< eed< dS )ÚDocAIParsingResultsz/Dataclass to store Document AI parsing results.Úsource_pathÚparsed_pathN)Ú__name__Ú
__module__Ú__qualname__Ú__doc__ÚstrÚ__annotations__© r   r   ú€/var/www/html/cobodadashboardai.evdpl.com/venv/lib/python3.9/site-packages/langchain_community/document_loaders/parsers/docai.pyr      s   
r   z0.0.32z1.0z&langchain_google_community.DocAIParser)ZsinceZremovalZalternative_importc                	   @   s8  e Zd ZdZdddddœed ee ee ee dœdd„Zeee	 dœdd	„Z
d$eeee eee  ee	 dœdd„Zd%ee ee eeee	 dœdd„Zee ee	 dœdd„Zee ed dœdd„Zed edœdd„Zdddd
ddœee ee ee eeee ed dœd d!„Zed ee dœd"d#„ZdS )&ÚDocAIParserz²`Google Cloud Document AI` parser.

    For a detailed explanation of Document AI, refer to the product documentation.
    https://cloud.google.com/document-ai/docs/overview
    N)ÚclientÚlocationÚgcs_output_pathÚprocessor_namer   c          
   
   C   sÈ   t |ƒt |ƒkrtdƒ‚d}|r<t ||¡s<td|› dƒ‚|| _|| _|rT|| _npzddlm} ddl	m
} W n. tyž } ztdƒ|‚W Y d	}~n
d	}~0 0 ||› d
d}	||	tddd| _d	S )aõ  Initializes the parser.

        Args:
            client: a DocumentProcessorServiceClient to use
            location: a Google Cloud location where a Document AI processor is located
            gcs_output_path: a path on Google Cloud Storage to store parsing results
            processor_name: full resource name of a Document AI processor or processor
                version

        You should provide either a client or location (and then a client
            would be instantiated).
        zGYou must specify either a client or a location to instantiate a client.z?projects\/[0-9]+\/locations\/[a-z\-0-9]+\/processors\/[a-z0-9]+zProcessor name zï has the wrong format. If your prediction endpoint looks like https://us-documentai.googleapis.com/v1/projects/PROJECT_ID/locations/us/processors/PROCESSOR_ID:process, use only projects/PROJECT_ID/locations/us/processors/PROCESSOR_ID part.r   )ÚClientOptionsr   úZdocumentai package not found, please install it with `pip install google-cloud-documentai`Nz-documentai.googleapis.com)Zapi_endpointzdocument-ai)Úmodule)Zclient_optionsZclient_info)ÚboolÚ
ValueErrorÚreÚ	fullmatchÚ_gcs_output_pathÚ_processor_nameÚ_clientZgoogle.api_core.client_optionsr!   Úgoogle.cloud.documentair   ÚImportErrorr   )
Úselfr   r   r   r    Úpatternr!   r   ÚexcÚoptionsr   r   r   Ú__init__2   s:    ÿ
ÿÿýÿþzDocAIParser.__init__)ÚblobÚreturnc                 c   s   | j |g| jdE dH  dS )zÜParses a blob lazily.

        Args:
            blobs: a Blob to parse

        This is a long-running operation. A recommended way is to batch
            documents together and use the `batch_parse()` method.
        ©r   N)Úbatch_parser(   )r-   r2   r   r   r   Ú
lazy_parsel   s    	zDocAIParser.lazy_parseT)r2   Úenable_native_pdf_parsingÚ
field_maskÚ
page_ranger3   c              
   #   s  z$ddl m} ddlm}m}m} W n. tyR }	 ztdƒ|	‚W Y d}	~	n
d}	~	0 0 zddlm‰  W n. ty’ }	 ztdƒ|	‚W Y d}	~	n
d}	~	0 0 |r¢||dnd}
|r´||d	nd}| j	 
|j| j|jˆjˆjpÖd
d||
|dd|d¡‰‡ ‡‡fdd„ˆjjD ƒE dH  dS )aÜ  Parses a blob lazily using online processing.

        Args:
            blob: a blob to parse.
            enable_native_pdf_parsing: enable pdf embedded text extraction
            field_mask: a comma-separated list of which fields to include in the
                Document AI response.
                suggested: "text,pages.pageNumber,pages.layout"
            page_range: list of page numbers to parse. If `None`,
                entire document will be parsed.
        r   ©Ú
documentai)ÚIndividualPageSelectorÚ	OcrConfigÚProcessOptionsr"   N©Ú_text_from_layoutújdocumentai_toolbox package not found, please install it with `pip install google-cloud-documentai-toolbox`©r7   )Úpagesúapplication/pdf©Úgcs_uriZ	mime_type)Ú
ocr_configÚindividual_page_selectorT)ÚnameZgcs_documentÚprocess_optionsÚskip_human_reviewr8   c                 3   s0   | ](}t ˆ |jˆjjƒ|jˆjd œdV  qdS ©)ÚpageÚsource)Zpage_contentÚmetadataN)r	   ÚlayoutÚdocumentÚtextÚpage_numberÚpath)Ú.0rM   ©r@   r2   Úresponser   r   Ú	<genexpr>´   s   ùþþz-DocAIParser.online_process.<locals>.<genexpr>)Úgoogle.cloudr;   Ú google.cloud.documentai_v1.typesr<   r=   r>   r,   Ú-google.cloud.documentai_toolbox.wrappers.pager@   r*   Zprocess_documentZProcessRequestr)   ÚGcsDocumentrT   ÚmimetyperQ   rC   )r-   r2   r7   r8   r9   r;   r<   r=   r>   r/   rG   rH   r   rV   r   Úonline_processw   sR    ÿýÿýÿýÿþþõÿøzDocAIParser.online_processé  é<   )Úblobsr   Útimeout_secÚcheck_in_interval_secr3   c           
      c   s¢   |p| j }|stdƒ‚| j||d}dd„ |D ƒ}t d|¡ d}|  |¡r‚t |¡ ||7 }||krvtd|› dƒ‚t d	¡ qB| j	|d
}	|  
|	¡E dH  dS )a  Parses a list of blobs lazily.

        Args:
            blobs: a list of blobs to parse.
            gcs_output_path: a path on Google Cloud Storage to store parsing results.
            timeout_sec: a timeout to wait for Document AI to complete, in seconds.
            check_in_interval_sec: an interval to wait until next check
                whether parsing operations have been completed, in seconds
        This is a long-running operation. A recommended way is to decouple
            parsing from creating LangChain Documents:
            >>> operations = parser.docai_parse(blobs, gcs_path)
            >>> parser.is_running(operations)
            You can get operations names and save them:
            >>> names = [op.operation.name for op in operations]
            And when all operations are finished, you can use their results:
            >>> operations = parser.operations_from_names(operation_names)
            >>> results = parser.get_results(operations)
            >>> docs = parser.parse_from_results(results)
        ú:An output path on Google Cloud Storage should be provided.r4   c                 S   s   g | ]}|j j‘qS r   )Z	operationrI   ©rU   Úopr   r   r   Ú
<listcomp>ß   ó    z+DocAIParser.batch_parse.<locals>.<listcomp>z9Started parsing with Document AI, submitted operations %sr   z#Timeout exceeded! Check operations z later!Ú.)Ú
operationsN)r(   r%   Údocai_parseÚloggerÚdebugÚ
is_runningÚtimeÚsleepÚTimeoutErrorÚget_resultsÚparse_from_results)
r-   ra   r   rb   rc   Úoutput_pathrj   Úoperation_namesZtime_elapsedÚresultsr   r   r   r5   ¿   s*    
ÿÿ


ÿzDocAIParser.batch_parse)rv   r3   c              
   #   s˜   z(ddl m} ddlm} ddlm‰  W n. tyV } ztdƒ|‚W Y d }~n
d }~0 0 |D ]6‰|ˆjƒ\}}|||ƒ}‡ ‡fdd„|D ƒE d H  q\d S )Nr   )Úsplit_gcs_uri)Ú_get_shardsr?   rA   c                 3   s:   | ]2}|j D ]&}tˆ |j|jƒ|jˆjd œdV  qqdS rL   )rC   r	   rP   rR   rS   r   )rU   ZshardrM   ©r@   Úresultr   r   rX     s   
ûþz1DocAIParser.parse_from_results.<locals>.<genexpr>)Z7google.cloud.documentai_toolbox.utilities.gcs_utilitiesrw   Z1google.cloud.documentai_toolbox.wrappers.documentrx   r[   r@   r,   r   )r-   rv   rw   rx   r/   Zgcs_bucket_nameZ
gcs_prefixZshardsr   ry   r   rs   ð   s     ÿý
ûzDocAIParser.parse_from_resultsr   )ru   r3   c              
      sT   zddl m‰  W n. ty> } ztdƒ|‚W Y d}~n
d}~0 0 ‡ ‡fdd„|D ƒS )z5Initializes Long-Running Operations from their names.r   )ÚGetOperationRequestzhlong running operations package not found, please install it with `pip install gapic-google-longrunning`Nc                    s    g | ]}ˆj jˆ |d d‘qS ))rI   )Úrequest)r*   Zget_operation)rU   rI   ©r{   r-   r   r   rg     s   ÿz5DocAIParser.operations_from_names.<locals>.<listcomp>)Z!google.longrunning.operations_pb2r{   r,   )r-   ru   r/   r   r}   r   Úoperations_from_names
  s    ÿýþz!DocAIParser.operations_from_names)rj   r3   c                 C   s   t dd„ |D ƒƒS )Nc                 s   s   | ]}|  ¡  V  qd S )N)Údonere   r   r   r   rX     rh   z)DocAIParser.is_running.<locals>.<genexpr>)Úany)r-   rj   r   r   r   rn     s    zDocAIParser.is_runningiè  )r   r    Ú
batch_sizer7   r8   )ra   r   r    r   r7   r8   r3   c                   s  z ddl m‰  ddlm}m} W n. tyN }	 ztdƒ|	‚W Y d}	~	n
d}	~	0 0 |pX| j}
|
du rjtdƒ‚|pr| j}|du r„tdƒ‚g }t	||dD ]x}ˆ j
ˆ j‡ fd	d
„|D ƒdd}ˆ jˆ jj|
|dd}|ræ|||ddnd}| | j ˆ j||||dd¡¡ q”|S )a3  Runs Google Document AI PDF Batch Processing on a list of blobs.

        Args:
            blobs: a list of blobs to be parsed
            gcs_output_path: a path (folder) on GCS to store results
            processor_name: name of a Document AI processor.
            batch_size: amount of documents per batch
            enable_native_pdf_parsing: a config option for the parser
            field_mask: a comma-separated list of which fields to include in the
                Document AI response.
                suggested: "text,pages.pageNumber,pages.layout"

        Document AI has a 1000 file limit per batch, so batches larger than that need
        to be split into multiple requests.
        Batch processing is an async long-running operation
        and results are stored in a output GCS bucket.
        r   r:   )r=   r>   r"   Nrd   z0A Document AI processor name should be provided.)ÚsizeÚiterablec                    s"   g | ]}ˆ j |j|jpd d‘qS )rD   rE   )r\   rT   r]   )rU   r2   r:   r   r   rg   O  s
   üþz+DocAIParser.docai_parse.<locals>.<listcomp>)Z	documents)Zgcs_documents)rF   r8   )Zgcs_output_configrB   )rG   T)rI   Zinput_documentsZdocument_output_configrJ   rK   )rY   r;   rZ   r=   r>   r,   r(   r%   r)   r
   ZBatchDocumentsInputConfigZGcsDocumentsZDocumentOutputConfigZGcsOutputConfigÚappendr*   Zbatch_process_documentsZBatchProcessRequest)r-   ra   r   r    r   r7   r8   r=   r>   r/   rt   rj   ÚbatchZinput_configZoutput_configrJ   r   r:   r   rk     sf    ÿý
ÿ

ûÿÿÿÿûÿÿù	ûÿÿzDocAIParser.docai_parsec              
      sR   zddl m‰  W n. ty> } ztdƒ|‚W Y d }~n
d }~0 0 ‡ fdd„|D ƒS )Nr   ©ÚBatchProcessMetadatar"   c                    sF   g | ]>}t |jˆ ƒr|jjnˆ  |jj¡jD ]}t|j|jd ‘q,qS ))r   r   )Ú
isinstancerO   Zindividual_process_statusesZdeserializeÚvaluer   Zinput_gcs_sourceZoutput_gcs_destination)rU   rf   Ústatusr†   r   r   rg   ~  s   
ÿ
ÿýûþz+DocAIParser.get_results.<locals>.<listcomp>)Zgoogle.cloud.documentai_v1r‡   r,   )r-   rj   r/   r   r†   r   rr   u  s    ÿý
ûzDocAIParser.get_results)TNN)Nr_   r`   )r   r   r   r   r   r   r1   r   r   r	   r6   r$   r   Úintr^   r   r5   r   rs   r~   rn   rk   rr   r   r   r   r   r   &   sh   	úú:   û
úK   ûú2þø÷Wr   )!r   Úloggingr&   ro   Údataclassesr   Útypingr   r   r   r   r   Zlangchain_core._api.deprecationr   Zlangchain_core.documentsr	   Zlangchain_core.utils.iterr
   Z)langchain_community.document_loaders.baser   Z1langchain_community.document_loaders.blob_loadersr   Z&langchain_community.utilities.vertexair   Zgoogle.api_core.operationr   r+   r   Ú	getLoggerr   rl   r   r   r   r   r   r   Ú<module>   s.   
ý