a
    bgP5                     @   s   d dl Z d dlZd dlZd dlZd dlmZ d dlmZmZm	Z	m
Z
mZmZmZ d dlZd dlmZ d dlmZ d dlmZmZ d dlmZ dZd	Zd
ZdZdZdZdZdZdZ e!e"Z#eddddG dd deeZ$dS )    N)Path)AnyDictListMappingOptionalSequenceUnion)
deprecated)Document)	BaseModelmodel_validator)
BaseLoaderz#{http://www.w3.org/1999/xhtml}tablexpathidsourcename	structuretagprojectsz#https://api.docugami.com/v1preview1z0.0.24z1.0z!docugami_langchain.DocugamiLoader)ZsinceZremovalZalternative_importc                   @   s  e Zd ZU dZeZeed< ej	
dZee ed< dZeed< dZeed< d	Zeed
< dZeed< dZeed< dZeed< dZeed< dZeed< dZee ed< dZeee  ed< eeeeef   ed< dZeed< eddeeee f e dddZ!d/e"ee ee# e$e% ddd Z&ee$e d!d"d#Z'ee$e d!d$d%Z(eed&d'd(Z)d0eeee ee# e$e% d)d*d+Z*e$e% d,d-d.Z+dS )1DocugamiLoaderzdLoad from `Docugami`.

    To use, you should have the ``dgml-utils`` python package installed.
    apiZDOCUGAMI_API_KEYaccess_tokeni   max_text_length    min_text_lengthi   max_metadata_lengthFinclude_xml_tagsr   parent_hierarchy_levelsdoc_idparent_id_keysub_chunk_tablesTwhitespace_normalize_textN	docset_iddocument_ids
file_paths(include_project_metadata_in_doc_metadatabefore)mode)valuesreturnc                 C   sX   | dr| drtd| ds8| ds8td| drT| dsTtd|S )zValidate that either local file paths are given, or remote API docset ID.

        Args:
            values: The values to validate.

        Returns:
            The validated values.
        r%   r#   z7Cannot specify both file_paths and remote API docset_idz6Must specify either file_paths or remote API docset_idr   z7Must specify access token if using remote API docset_id)get
ValueError)clsr)    r.   {/var/www/html/cobodadashboardai.evdpl.com/venv/lib/python3.9/site-packages/langchain_community/document_loaders/docugami.pyvalidate_local_or_remoteT   s    z'DocugamiLoader.validate_local_or_remote)contentdocument_nameadditional_doc_metadatar*   c              	      s*  zddl m} W n ty*   tdY n0 zddlm} ddlm} W n tyb   tdY n0 |td fdd	}|t	
|}| }	||	jjjjjjd
}
i }|
D ]^}||}|jt}|r|||< |jr||j}|jt}|r|jr||jj< |||< qt| S )z6Parse a single DGML document into a list of Documents.r   etreePCould not import lxml python package. Please install it with `pip install lxml`.)Chunk)
get_chunkszaCould not import from dgml-utils python package. Please install it with `pip install dgml-utils`.)dg_chunkr*   c                    sf   t | j  }t| jt|tt	t
| jt| ji}| j} rPjrP|  t|d j |dS )N)page_contentmetadata)hashlibmd5textencode	hexdigest	XPATH_KEYr   ID_KEYDOCUMENT_NAME_KEYDOCUMENT_SOURCE_KEYSTRUCTURE_KEYr   TAG_KEYr   r&   updater   r   )r9   Z
_hashed_idr;   r>   r3   r2   selfr.   r/   _build_framework_chunk   s     	
z:DocugamiLoader._parse_dgml.<locals>._build_framework_chunk)r   r   r"   r!   r   r   )lxmlr5   ImportErrorZdgml_utils.modelsr7   Zdgml_utils.segmentationr8   r   parseioBytesIOgetrootr   r   r"   r!   r   r   r;   r+   rB   parentr:   r    listr)   )rI   r1   r2   r3   r5   r7   r8   rJ   treerootZ	dg_chunksZframework_chunksr9   Zframework_chunkZchunk_idZframework_parent_chunkZ	parent_idr.   rH   r/   _parse_dgmlj   sL    





zDocugamiLoader._parse_dgml)r#   r*   c                 C   s|   | j  d| d}g }|rxtj|dd| j id}|jr^| }||d  |dd}qtd	| d
|j dq|S )z1Gets all document details for the given docset ID	/docsets/z
/documentsAuthorizationBearer )headersZ	documentsnextNFailed to download 
 (status: ))	r   requestsr+   r   okjsonextend	Exceptionstatus_code)rI   r#   urlZall_documentsresponsedatar.   r.   r/   _document_details_for_docset_id   s    z.DocugamiLoader._document_details_for_docset_idc                 C   s~   | j  d| }g }|rztjd|dd| j ii d}|jr`| }||d  |dd}qtd	| d
|j	 dq|S )z0Gets all project details for the given docset IDz/projects?docset.id=GETrW   rX   rY   rf   r   rZ   Nr[   r\   r]   )
r   r^   requestr   r_   r`   ra   r+   rb   rc   )rI   r#   rd   Zall_projectsre   rf   r.   r.   r/   _project_details_for_docset_id   s"    z-DocugamiLoader._project_details_for_docset_id)projectr*   c              	   C   s  | t}| j d| d}g }i }|rtjd|dd| j ii d}|jrp| }||d  | dd	}q$|j	d
kr~|S t
d| d|j	 dq$|D ]6}| d}	| d}
| d}|	dkr|
r|r|t }i }tjd|
 ddd| j ii d}|jrzddlm} W n ty4   tdY n0 |t|j}| }|j}|jd|d}|D ]L}|jd|dd j}d|jd|dd   }|d	| j ||< qh|||< qt
d|
 dd q|S )z#Gets project metadata for all filesz
/projects/z/artifacts/latestrh   rW   rX   ri   Z	artifactsrZ   Ni  r[   r\   r]   r   rd   Zdocumentzreport-values.xmlz/contentr   r4   r6   z
//pr:Entry)
namespacesz./pr:Heading z
./pr:Valuez	/content z (status: {response.status_code}))r+   rB   r   r^   rj   r   r_   r`   ra   rc   rb   rK   r5   rL   rM   rN   rO   r1   rP   Znsmapr   r>   joinZitertextstripr   )rI   rl   Z
project_idrd   Zall_artifactsZper_file_metadatare   rf   artifactZartifact_nameZartifact_urlZartifact_docr   r;   r5   Zartifact_treeZartifact_rootnsentriesentryheadingvaluer.   r.   r/   _metadata_for_project   sr    








z$DocugamiLoader._metadata_for_project)document_idr#   r2   additional_metadatar*   c                 C   sj   | j  d| d| d}tjd|dd| j ii d}|jrN| j|j||dS td	| d
|j ddS )zLoad chunks for a document.rV   z/documents/z/dgmlrh   rW   rX   ri   )r1   r2   r3   r[   r\   r]   N)	r   r^   rj   r   r_   rU   r1   rb   rc   )rI   rx   r#   r2   ry   rd   re   r.   r.   r/   _load_chunks_for_document(  s     z(DocugamiLoader._load_chunks_for_document)r*   c              	      s8  g } j rԈ jrԈ  j} jr4 fdd|D }  j}i }|r jr|D ]@} |}|D ],}||vr~|| ||< qd|| ||  qdqR|D ]8}|t }	|	t
}
|	|	}| j|	 j|
|d7 }qn` jr4 jD ]P}t|}t|d(}| j| |jd7 }W d   q1 s(0    Y  q|S )zLoad documents.c                    s   g | ]}|t   jv r|qS r.   )rB   r$   ).0drI   r.   r/   
<listcomp>L  s   z'DocugamiLoader.load.<locals>.<listcomp>)rx   r#   r2   ry   rb)r1   r2   N)r   r#   rg   r$   rk   r&   rw   rG   rB   r+   rC   rz   r%   r   openrU   readr   )rI   chunksZ_document_detailsZ_project_detailsZcombined_project_metadatarl   r;   Zfile_iddocr   Zdoc_nameZdoc_metadatapathfiler.   r}   r/   loadD  sF    





*zDocugamiLoader.load)NN)NN),__name__
__module____qualname____doc__DEFAULT_API_ENDPOINTr   str__annotations__osenvironr+   r   r   r   intr   r   r   boolr   r    r!   r"   r#   r$   r   r	   r   r&   r   classmethodr   r   r0   bytesr   r   r   rU   rg   rk   rw   rz   r   r.   r.   r.   r/   r      sN   
  KK  r   )%r<   rN   loggingr   pathlibr   typingr   r   r   r   r   r   r	   r^   Zlangchain_core._api.deprecationr
   Zlangchain_core.documentsr   Zpydanticr   r   Z)langchain_community.document_loaders.baser   Z
TABLE_NAMErA   rB   rD   rC   rE   rF   ZPROJECTS_KEYr   	getLoggerr   loggerr   r.   r.   r.   r/   <module>   s4   $
