a
    bgc                     @  s  d dl mZ d dlZd dlZd dlZd dlZd dlZd dlmZ d dl	m
Z
 d dlmZmZmZmZmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZmZ d dlmZ d dlmZ e e!Z"dZ#dZ$dZ%dZ&g dZ'g dZ(dgZ)g dZ*e'e(e)e*dZ+G dd de,eZ-G dd deZ.G dd deZ/G dd deZ0G dd deZ1G d d! d!eZ2d"d"d#d$d%Z3d"d"d&d'd(Z4d)d"d&d*d+Z5d,d-d.d/Z6d"d-d0d1Z7dCd2d3d4d5d6d7Z8d"d"d8d9d:Z9d"d3d;d<d=Z:d"d3d>d?d@Z;G dAdB dBeZ<dS )D    )annotationsN)Enum)
HTTPStatus)AnyDictListOptionalTuple)Document)get_runtime_environment)get_from_dict_or_env)	BaseModel)Responserequest)RequestException)
BaseLoaderz0.1.1zhttp://localhost:8000zhttps://api.daxa.ai  )Z
JSONLoaderS3FileLoaderZUnstructuredMarkdownLoaderZUnstructuredPDFLoaderZUnstructuredFileLoaderZUnstructuredJsonLoaderZPyPDFLoaderGCSFileLoaderZAmazonTextractPDFLoaderZ	CSVLoaderZUnstructuredExcelLoaderZUnstructuredEmailLoader)ZDirectoryLoaderZS3DirLoaderZSlackDirectoryLoaderZPyPDFDirectoryLoaderZNotionDirectoryLoaderDataFrameLoader)NotionDBLoaderGoogleDriveLoaderSharePointLoader)filedir	in-memoryzcloud-folderc                   @  s   e Zd ZdZdZdZdS )Routesz2Routes available for the Pebblo API as enumerator.z/v1/loader/docz/v1/app/discoverN)__name__
__module____qualname____doc__
loader_docloader_app_discover r#   r#   r/var/www/html/cobodadashboardai.evdpl.com/venv/lib/python3.9/site-packages/langchain_community/utilities/pebblo.pyr   C   s   r   c                   @  s   e Zd ZU dZded< dS )IndexedDocumentzPebblo Indexed Document.strpb_idNr   r   r   r    __annotations__r#   r#   r#   r$   r%   J   s   
r%   c                   @  sn   e Zd ZU dZdZded< ded< ded< dZded	< ded
< ded< ded< ded< ded< dZded< dS )RuntimezPebblo Runtime.localr&   typehostpath Optional[str]ipplatformos
os_versionlanguagelanguage_versionruntimeN)r   r   r   r    r,   r)   r1   r7   r#   r#   r#   r$   r*   Q   s   
r*   c                   @  s"   e Zd ZU dZded< ded< dS )	FrameworkzPebblo Framework instance.r&   nameversionNr(   r#   r#   r#   r$   r8   j   s   
r8   c                   @  sR   e Zd ZU dZded< ded< ded< ded< ded	< d
ed< ded< d
ed< dS )AppzPebblo AI application.r&   r9   ownerr0   descriptionload_idr*   r7   r8   	frameworkplugin_versionclient_versionNr(   r#   r#   r#   r$   r;   s   s   
r;   c                   @  sb   e Zd ZU dZded< ded< ded< ded< ded< d	ed
< ded< ded< ded< ded< dS )DoczPebblo document.r&   r9   r<   listdocsr@   r>   dictloader_detailsboolloading_endsource_ownerclassifier_locationanonymize_snippetsNr(   r#   r#   r#   r$   rB      s   
rB   r&   )r.   returnc                 C  sF   | r d| v s d| d ks | dv r$| S t | }| r>| }t|S )zReturn an absolute local path for a local file/directory,
    for a network related path, return as is.

    Args:
        path (str): Relative path to be resolved.

    Returns:
        str: Resolved absolute path.
    z:///r   )unknown-r   )pathlibPathexistsresolver&   )r.   	full_pathr#   r#   r$   get_full_path   s    

rU   )loaderrL   c                 C  s&   t  D ]\}}| |v r|  S qdS )zReturn loader type among, file, dir or in-memory.

    Args:
        loader (str): Name of the loader, whose type is to be resolved.

    Returns:
        str: One of the loader type among, file/dir/in-memory.
    unsupported)LOADER_TYPE_MAPPINGitems)rV   loader_typeloadersr#   r#   r$   get_loader_type   s    	
r\   r   c                 C  s  ddl m}m}m}m} d}t| ts4td |S | j	}zd|v rt| |rfd| j
 d| j }nt| |rd| j
 d| j }nRd	|v r|d	 }|rd
|v r|d
 }|r| d| }nd|v r|d }nd|v r|d }nd|v r$|d }|rt|trt|dkr|d }nt| |r6d}nt| |rPd| j }n| jjdkr|dr|d}	d|	 }nZ|dr|dg }
ddd |
D }n,|dr|dg }ddd |D }W n ty   Y n0 tt|S )zReturn an absolute source path of source of loader based on the
    keys present in Document.

    Args:
        loader (BaseLoader): Langchain document loader, derived from Baseloader.
    r   )r   r   r   r   rO   zGloader is not derived from BaseLoader, source location will be unknown!bucketzgc://rM   zs3://sourcechannelr.   	file_path	web_pathsr   znotiondb://r   	folder_idz+https://drive.google.com/drive/u/2/folders/file_idsz, c                 S  s   g | ]}d | dqS )z https://drive.google.com/file/d/z/viewr#   ).0Zfile_idr#   r#   r$   
<listcomp>   s   z(get_loader_full_path.<locals>.<listcomp>document_idsc                 S  s   g | ]}d | dqS )z#https://docs.google.com/document/d/z/editr#   )rd   doc_idr#   r#   r$   re     s   )Z$langchain_community.document_loadersr   r   r   r   
isinstancer   loggererror__dict__r]   ZblobkeyrC   lenZdatabase_id	__class__r   getjoin	ExceptionrU   r&   )rV   r   r   r   r   locationZloader_dictr_   ra   rb   rc   rf   r#   r#   r$   get_loader_full_path   sj    




 

rs   zTuple[Framework, Runtime])rL   c                  C  s   t  } td| ddd}t }t|jtjd | dd|j	|j
t | dd| d	dd
}d|jv rvd|_d|_td|  td|  ||fS )zFetch the current Framework and Runtime details.

    Returns:
        Tuple[Framework, Runtime]: Framework and Runtime for the current app instance.
    Z	langchainZlibrary_versionN)r9   r:   ZPWDr2   rN   r7   Zruntime_version)r-   r.   r2   r3   r4   r1   r5   r6   DarwinZdesktopzMac OSXz
framework zruntime )r   r8   ro   r2   unamer*   noder3   environsystemr:   get_ipr,   r7   ri   debug)Zruntime_envr?   ru   r7   r#   r#   r$   get_runtime  s*    



r{   c                  C  s@   ddl } |  }z| |}W n ty:   | d}Y n0 |S )zJFetch local runtime ip address.

    Returns:
        str: IP address
    r   N	localhost)socketgethostnamegethostbynamerq   )r}   r-   Z	public_ipr#   r#   r$   ry   .  s    ry   zList[Document]intzList[List[Document]])rD   max_batch_sizerL   c                 C  s~   g }g }d}| D ]Z}t |jd}||kr:||g q|| |krX|| g }d}|| ||7 }q|rz|| |S )a  
    Generate batches of documents based on page_content size.
    Args:
        docs: List of documents to be batched.
        max_batch_size: Maximum size of each batch in bytes. Defaults to 100*1024(100KB)
    Returns:
        List[List[Document]]: List of batches of documents
    r   utf-8)rm   page_contentencodeappend)rD   r   ZbatchesZcurrent_batchZcurrent_batch_sizedocZdoc_sizer#   r#   r$   generate_size_based_batches>  s     



r   )r`   rL   c                 C  s@   z$ddl }t| j}||j}W n ty:   d}Y n0 |S )zFetch owner of local file path.

    Args:
        file_path (str): Local file path.

    Returns:
        str: Name of owner.
    r   NrN   )pwdr3   statst_uidgetpwuidpw_namerq   )r`   r   Zfile_owner_uidZfile_owner_namer#   r#   r$   get_file_owner_from_pathf  s    	
r   )source_pathrL   c                 C  s   | sdS d}t j| r&t j| }n^t j| rd}t | D ]>\}}}|D ].}t j||}t j|sN|t j|7 }qNq@|}|S )zFetch size of source path. Source can be a directory or a file.

    Args:
        source_path (str): Local path of data source.

    Returns:
        int: Source size in bytes.
    r   )r3   r.   isfilegetsizeisdirwalkrp   islink)r   size
total_sizedirpath_	filenamesffpr#   r#   r$   get_source_sizey  s    	r   )datarL   c                 C  s   |  d}t|}|S )zCalculate the content size in bytes:
    - Encode the string to bytes using a specific encoding (e.g., UTF-8)
    - Get the length of the encoded bytes.

    Args:
        data (str): Data string.

    Returns:
        int: Size of string in bytes.
    r   )r   rm   )r   Zencoded_contentr   r#   r#   r$   calculate_content_size  s    
r   c                	      s  e Zd ZU dZded< dZded< ded< ded< d	Zd
ed< dd fddZdddddZd3dddd
ddddZ	dddddZ
d4d
ddddZdd ddd!d
dd"d#d$Zed5dddd'd!d(d)d*d+Zedddd,d-d.d/Zed ddd0d1d2Z  ZS )6PebbloLoaderAPIWrapperzWrapper for Pebblo Loader API.r0   api_keyr+   r&   rJ   classifier_url	cloud_urlFrG   rK   r   )kwargsc                   sL   t |ddd|d< t |ddt|d< t |ddt|d< t jf i | dS )	z%Validate that api key in environment.r   ZPEBBLO_API_KEYr/   r   ZPEBBLO_CLASSIFIER_URLr   ZPEBBLO_CLOUD_URLN)r   _DEFAULT_CLASSIFIER_URL_DEFAULT_PEBBLO_CLOUD_URLsuper__init__)selfr   rn   r#   r$   r     s    zPebbloLoaderAPIWrapper.__init__r;   None)apprL   c           	      C  s   d}|j dd}| jdkrD|  }| j tjj }| d|||}| jr| jdd}|rzt	
|jd}|d|i |dti | j tjj }| d|||}dS )	z
        Send app discovery request to Pebblo server & cloud.

        Args:
            app (App): App instance to be discovered.
        NTZexclude_unsetr+   POSTcloud_requestpebblo_server_versionZpebblo_client_version)rE   rJ   _make_headersr   r   r"   valuemake_requestr   jsonloadstextro   updatePLUGIN_VERSIONr   )	r   r   pebblo_resppayloadheadersZapp_discover_urlr   pebblo_cloud_urlr   r#   r#   r$   send_loader_discover  s"    
z+PebbloLoaderAPIWrapper.send_loader_discoverzList[IndexedDocument]rE   )docs_with_idr   rF   rH   rL   c              
   C  s6  | dd}t|}| |||\}}| ||||||}	i }
| jdkr|  }| j tjj	 }zF| 
d|||	d}|rt|j dg D ]}|
|d |i qW n0 ty } ztd| W Y d	}~n
d	}~0 0 | jr| jdkr| |	d |
 |	d
d	 | |	 n| jdkr2td td|
S )a  
        Send documents to Pebblo server for classification.
        Then send classified documents to Daxa cloud(If api_key is present).

        Args:
            docs_with_id (List[IndexedDocument]): List of documents to be classified.
            app (App): App instance.
            loader_details (dict): Loader details.
            loading_end (bool): Boolean, indicating the halt of data loading by loader.
        r   r/   r+   r   i,  rD   r'   z3An Exception caught in classify_documents: local %sNrK   zpebblo-cloudz4API key is missing for sending docs to Pebblo cloud.)ro   r   prepare_docs_for_classificationbuild_classification_payloadrJ   r   r   r   r!   r   r   r   r   r   r   rq   ri   warningr   update_doc_datapopsend_docs_to_pebblo_cloud	NameError)r   r   r   rF   rH   r   rI   rD   source_aggregate_sizer   classified_docsr   Zload_doc_urlr   Zclassified_docer#   r#   r$   classify_documents  s@    



"

z)PebbloLoaderAPIWrapper.classify_documents)r   rL   c              
   C  sh   | j dd}| j tjj }z| d|||}W n0 tyb } ztd| W Y d}~n
d}~0 0 dS )z
        Send documents to Pebblo cloud.

        Args:
            payload (dict): The payload containing documents to be sent.
        Tr   r   z3An Exception caught in classify_documents: cloud %sN)	r   r   r   r!   r   r   rq   ri   r   )r   r   r   r   r   r   r#   r#   r$   r     s    z0PebbloLoaderAPIWrapper.send_docs_to_pebblo_cloud)r   rL   c                 C  s4   ddd}|r0| j r&|d| j i n
td |S )z
        Generate headers for the request.

        args:
            cloud_request (bool): flag indicating whether the request is for Pebblo
            cloud.
        returns:
            dict: Headers for the request.

        zapplication/json)AcceptzContent-Typez	x-api-keyz,API key is missing for Pebblo cloud request.)r   r   ri   r   )r   r   r   r#   r#   r$   r   (  s    
z$PebbloLoaderAPIWrapper._make_headersz
List[dict]r   )r   rD   rF   rI   r   rH   rL   c                 C  sb   |j |j|t|j|d|| j| jd
}|du rHd|d< d|v rH||d d< tf i |jdd}|S )	a  
        Build the payload for document classification.

        Args:
            app (App): App instance.
            docs (List[dict]): List of documents to be classified.
            loader_details (dict): Loader details.
            source_owner (str): Owner of the source.
            source_aggregate_size (int): Aggregate size of the source.
            loading_end (bool): Boolean indicating the halt of data loading by loader.

        Returns:
            dict: Payload for document classification.
        false)
r9   r<   rD   r@   r>   rF   rH   rI   rJ   rK   TtruerH   rF   r   r   )r9   r<   r   r>   rJ   rK   rB   rE   )r   r   rD   rF   rI   r   rH   r   r#   r#   r$   r   ?  s$    
z3PebbloLoaderAPIWrapper.build_classification_payloadN   zOptional[dict]zOptional[Response])methodurlr   r   timeoutrL   c              
   C  s   zt | ||||d}td| |j jtt|j jr6|j jng t|j |jtj	krht
d|j  n>|jtjkrt
d|j  n|jtjkrt
d|j  |W S  ty   t
d| Y n0 ty } zt
d| W Y d}~n
d}~0 0 dS )	a  
        Make a request to the Pebblo API

        Args:
            method (str): HTTP method (GET, POST, PUT, DELETE, etc.).
            url (str): URL for the request.
            headers (dict): Headers for the request.
            payload (Optional[dict]): Payload for the request (for POST, PUT, etc.).
            timeout (int): Timeout for the request in seconds.

        Returns:
            Optional[Response]: Response object if the request is successful.
        )r   r   r   r   r   z5Request: method %s, url %s, len %s response status %szPebblo Server: Error z$Pebblo received an invalid payload: z-Pebblo returned an unexpected response code: zUnable to reach server %sz'An Exception caught in make_request: %sN)r   ri   rz   r   r&   rm   bodystatus_coder   INTERNAL_SERVER_ERRORr   BAD_REQUESTr   OKr   rq   )r   r   r   r   r   responser   r#   r#   r$   r   k  s6    
"z#PebbloLoaderAPIWrapper.make_requestzTuple[List[dict], int])r   r   rF   rL   c              
   C  s:  g }d}dd | D }d}|D ]}| di }| dg }	|d dkr^t| d	|d
 }
nt| d| d	|}
| dt|
}| dt|
}t| d}t|}||7 }| ddpd}|||
|| di  d|d|	rd|	ini |durd|ini  |d dkr|s| d|d
< d}q||fS )a  
        Prepare documents for classification.

        Args:
            docs_with_id (List[IndexedDocument]): List of documents to be classified.
            source_path (str): Source path of the documents.
            loader_details (dict): Contains loader info.

        Returns:
            Tuple[List[dict], int]: Documents and the aggregate size
            of the source.
        r   c                 S  s   g | ]}|  qS r#   )rE   )rd   r   r#   r#   r$   re         zJPebbloLoaderAPIWrapper.prepare_docs_for_classification.<locals>.<listcomp>FmetadataZauthorized_identitiesrV   r   r^   r   rT   r<   r   r   r'   Nlast_modified)r   r   r'   r   Z
file_ownerZsource_path_sizeZsource_full_urlT)ro   rU   r   r   r&   r   r   )r   r   rF   rD   r   Zdoc_contentZsource_path_updater   Zdoc_metadataZdoc_authorized_identitiesZdoc_source_pathZdoc_source_ownerZdoc_source_sizer   Zpage_content_sizerg   r#   r#   r$   r     s`    



z6PebbloLoaderAPIWrapper.prepare_docs_for_classification)rD   r   rL   c              
   C  sX   | D ]N}| |d i }|| d| d| di | di d |d qdS )	z
        Update the document data with classified information.

        Args:
            docs (List[dict]): List of document data to be updated.
            classified_docs (dict): The dictionary containing classified documents.
        r'   pb_checksumloader_source_pathentitiestopics)r   r   r   r   r   N)ro   r   r   )rD   r   Zdoc_dataZclassified_datar#   r#   r$   r     s    	

	z&PebbloLoaderAPIWrapper.update_doc_data)F)F)Nr   )r   r   r   r    r)   rJ   rK   r   r   r   r   r   r   staticmethodr   r   r   __classcell__r#   r#   r   r$   r     s*   
$ >,  1Fr   )r   )=
__future__r   r   loggingr3   rP   r2   enumr   httpr   typingr   r   r   r   r	   Zlangchain_core.documentsr
   Zlangchain_core.envr   Zlangchain_core.utilsr   Zpydanticr   requestsr   r   Zrequests.exceptionsr   Z)langchain_community.document_loaders.baser   	getLoggerr   ri   r   r   r   ZBATCH_SIZE_BYTESZfile_loaderZ
dir_loaderZ	in_memoryZcloud_folderrX   r&   r   r%   r*   r8   r;   rB   rU   r\   rs   r{   ry   r   r   r   r   r   r#   r#   r#   r$   <module>   s\   
	H (