a
    bgQ,                     @   s   d Z ddlZddlZddlZddlmZ ddlmZmZm	Z	m
Z
mZmZ ddlmZ ddlmZ ddlmZmZmZmZmZmZmZmZmZmZmZmZ eeZ G dd	 d	eZ!G d
d deZ"dS )z:Pebblo's safe dataloader is a wrapper for document loaders    N)version)AnyDictIterableIteratorListOptional)Document)
BaseLoader)BATCH_SIZE_BYTESPLUGIN_VERSIONApp	FrameworkIndexedDocumentPebbloLoaderAPIWrappergenerate_size_based_batchesget_full_pathget_loader_full_pathget_loader_typeget_runtimeget_source_sizec                   @   s   e Zd ZU dZdZeed< d"dddeeeee	e ee	e eed	d	d
Z
ee dddZddddZee dddZeddddZedddZee dddZeee dddZee dddZeeedddZeddd d!ZdS )#PebbloSafeLoaderzkPebblo Safe Loader class is a wrapper around document loaders enabling the data
    to be scrutinized.
    F_discover_sent Nlocal)classifier_locationanonymize_snippets)	langchain_loadernameownerdescriptionapi_keyload_semanticclassifier_urlr   r   c                C   s   |rt |tstd|| _tt | _|| _tj	
dp>|| _|| _|| _t| j| _g | _g | _tt| jdd dd }
t|
| _t| j| _t| _|
| j| jd| jdkrdt| jini | _|  | _t||||	d	| _| j| j d S )
NzMust specify a valid name.ZPEBBLO_LOAD_SEMANTIC.'r   )loadersource_pathsource_typesource_path_size)r!   r   r#   r   ) 
isinstancestr	NameErrorapp_nameuuiduuid4load_idr'   osenvirongetr"   r   r    r   r(   docsdocs_with_idtypesplitr   r)   r   r*   r   
batch_sizeloader_details_get_app_detailsappr   	pb_clientZsend_loader_discover)selfr   r   r   r    r!   r"   r#   r   r   Zloader_name r?   y/var/www/html/cobodadashboardai.evdpl.com/venv/lib/python3.9/site-packages/langchain_community/document_loaders/pebblo.py__init__%   s>    "

zPebbloSafeLoader.__init__returnc                 C   s   | j  | _|   | jS )zxLoad Documents.

        Returns:
            list: Documents fetched from load method of the wrapped `loader`.
        )r'   loadr5   classify_in_batches)r>   r?   r?   r@   rD   V   s    zPebbloSafeLoader.loadc           	      C   s   t | j| j}g }t|}t|D ]l\}}||d k}|| _|  | _| jj| j| j	| j
|d}| | | jr|| |}n|  }|| q"|| _dS )z
        Classify documents in batches.
        This is to avoid API timeouts when sending large number of documents.
        Batches are generated based on the page_content size.
           )Zloading_endN)r   r5   r9   len	enumerate_index_docsr6   r=   classify_documentsr<   r:   _add_pebblo_specific_metadatar"   _add_semantic_to_docs_unindex_docsextend)	r>   ZbatchesZprocessed_docsZtotal_batchesibatchZis_last_batchclassified_docsZbatch_processed_docsr?   r?   r@   rE   a   s*    

z$PebbloSafeLoader.classify_in_batchesc              
   c   s   z| j  }W nH tyV } z0| j jj d}t| t||W Y d}~n
d}~0 0 zt|}W n ty   g | _	Y qY n0 t
|f| _	|  | _| j| j| j| j}| | | jr| || _	n
|  | _	| j	d V  qXdS )zLoad documents in lazy fashion.

        Raises:
            NotImplementedError: raised when lazy_load id not implemented
            within wrapped loader.

        Yields:
            list: Documents from loader's lazy loading.
        z does not implement lazy_load()Nr   )r'   	lazy_loadNotImplementedError	__class____name__loggererrornextStopIterationr5   listrI   r6   r=   rJ   r<   r:   rK   r"   rL   rM   )r>   Zdoc_iteratorexcZerr_strdocclassified_docr?   r?   r@   rR      s*    

 



zPebbloSafeLoader.lazy_loadc                 C   s
   d| _ d S )NT)r   )clsr?   r?   r@   set_discover_sent   s    z"PebbloSafeLoader.set_discover_sentc                 C   s:   t  \}}t| j| j| j| j||ttdtddd}|S )z\Fetch app details. Internal method.

        Returns:
            App: App details.
        Zlangchain_community)r   r   )r   r   r    r1   runtime	frameworkZplugin_versionclient_version)	r   r   r.   r   r    r1   r   r   r   )r>   ra   r`   r<   r?   r?   r@   r;      s    
z!PebbloSafeLoader._get_app_detailsc                 C   s   dd t | jD }|S )z
        Indexes the documents and returns a list of IndexedDocument objects.

        Returns:
            List[IndexedDocument]: A list of IndexedDocument objects with unique IDs.
        c                 S   s*   g | ]"\}}t f d t|i| qS )pb_id)r   r,   dict.0rO   r\   r?   r?   r@   
<listcomp>   s   z0PebbloSafeLoader._index_docs.<locals>.<listcomp>)rH   r5   )r>   r6   r?   r?   r@   rI      s    zPebbloSafeLoader._index_docs)rQ   rC   c                 C   sV   dd | j D }| D ]&}|d}||v r| || | qdd | D }|S )aF  
        Adds semantic metadata to the given list of documents.

        Args:
            classified_docs (Dict): A dictionary of dictionaries containing the
                classified documents with pb_id as key.

        Returns:
            List[Document]: A list of Document objects with added semantic metadata.
        c                 S   s    i | ]}|j t|j|jd qS )page_contentmetadata)rc   r	   ri   rj   rf   r\   r?   r?   r@   
<dictcomp>   s   z:PebbloSafeLoader._add_semantic_to_docs.<locals>.<dictcomp>rc   c                 S   s   g | ]}|qS r?   r?   rk   r?   r?   r@   rg          z:PebbloSafeLoader._add_semantic_to_docs.<locals>.<listcomp>)r6   valuesr4   _add_semantic_to_doc)r>   rQ   Zindexed_docsr]   Zdoc_idZsemantic_metadata_docsr?   r?   r@   rL      s    
z&PebbloSafeLoader._add_semantic_to_docsc                 C   s   dd t | jD }|S )z
        Converts a list of IndexedDocument objects to a list of Document objects.

        Returns:
            List[Document]: A list of Document objects.
        c                 S   s    g | ]\}}t |j|jd qS rh   )r	   ri   rj   re   r?   r?   r@   rg      s   z2PebbloSafeLoader._unindex_docs.<locals>.<listcomp>)rH   r6   )r>   r5   r?   r?   r@   rM      s    zPebbloSafeLoader._unindex_docs)r\   r]   rC   c                 C   s8   t |di  |jd< t |di  |jd< |S )a4  
        Adds semantic metadata to the given document in-place.

        Args:
            doc (Document): A Document object.
            classified_doc (dict): A dictionary containing the classified document.

        Returns:
            Document: The Document object with added semantic metadata.
        entitiesZpebblo_semantic_entitiesZtopicsZpebblo_semantic_topics)rZ   r4   keysrj   )r>   r\   r]   r?   r?   r@   ro      s    

z%PebbloSafeLoader._add_semantic_to_docc              	   C   st   | j D ]h}|j}| jjjdkr6t|d| j|d< nt|d|d| j|d< ||ji dd|d< qdS )z*Add Pebblo specific metadata to documents.ZSharePointLoadersource	full_pathZpb_checksumN)	r6   rj   r'   rT   rU   r   r4   r(   rc   )r>   rQ   r\   Zdoc_metadatar?   r?   r@   rK     s    

z.PebbloSafeLoader._add_pebblo_specific_metadata)r   r   NFN)rU   
__module____qualname____doc__r   bool__annotations__r
   r,   r   rA   r   r	   rD   rE   r   rR   classmethodr_   r   r;   r   rI   r   rL   rM   rd   ro   rK   r?   r?   r?   r@   r      s@   
     
1 "r   c                
   @   s   e Zd ZdZdddddee ee eee  eeee	f  eeeee	f   ddddZ
ee ddd	Zee dd
dZdS )PebbloTextLoaderz
    Loader for text data.

    Since PebbloSafeLoader is a wrapper around document loaders, this loader is
    used to load text data directly into Documents.
    N)rr   idsrj   	metadatas)textsrr   r{   rj   r|   rC   c                C   s"   || _ || _|| _|| _|| _dS )a  
        Args:
            texts: Iterable of text data.
            source: Source of the text data.
                Optional. Defaults to None.
            ids: List of unique identifiers for each text.
                Optional. Defaults to None.
            metadata: Metadata for all texts.
                Optional. Defaults to None.
            metadatas: List of metadata for each text.
                Optional. Defaults to None.
        N)r}   rr   r{   rj   r|   )r>   r}   rr   r{   rj   r|   r?   r?   r@   rA     s
    zPebbloTextLoader.__init__rB   c                 c   s   t | jD ]r\}}d}| jpi }| jrN|t| jk rN| j| rN|| j|  | jrl|t| jk rl| j| }t|||dV  q
dS )zi
        Lazy load text data into Documents.

        Returns:
            Iterator of Documents
        N)idri   rj   )rH   r}   rj   r|   rG   updater{   r	   )r>   rO   textZ_idrj   r?   r?   r@   rR   9  s    

zPebbloTextLoader.lazy_loadc                 C   s    g }|   D ]}|| q|S )z`
        Load text data into Documents.

        Returns:
            List of Documents
        )rR   append)r>   Z	documentsr\   r?   r?   r@   rD   I  s    zPebbloTextLoader.load)rU   rt   ru   rv   r   r,   r   r   r   r   rA   r   r	   rR   rD   r?   r?   r?   r@   rz     s   
rz   )#rv   loggingr2   r/   importlib.metadatar   typingr   r   r   r   r   r   Zlangchain_core.documentsr	   Z)langchain_community.document_loaders.baser
   Z$langchain_community.utilities.pebblor   r   r   r   r   r   r   r   r   r   r   r   	getLoggerrU   rV   r   rz   r?   r?   r?   r@   <module>   s    8
 y