a
    dgb+                     @   s   U d Z ddlZddlZddlmZmZmZmZmZm	Z	m
Z
mZmZ ddlZddlmZ ddlmZmZ ddlmZ dee eee dd	d
Zee eee ee f dddZed ZdddddZeeef ed< G dd deZdS )z<Experimental **text splitter** based on semantic similarity.    N)	AnyDictIterableListLiteralOptionalSequenceTuplecast)cosine_similarity)BaseDocumentTransformerDocument)
Embeddings   )	sentencesbuffer_sizereturnc                 C   s   t t| D ]}d}t || |D ] }|dkr"|| | d d 7 }q"|| | d 7 }t |d |d | D ]$}|t| k rj|d| | d  7 }qj|| | d< q| S )zCombine sentences based on buffer size.

    Args:
        sentences: List of sentences to combine.
        buffer_size: Number of sentences to combine. Defaults to 1.

    Returns:
        List of sentences with combined sentences.
     r   sentence r   combined_sentence)rangelen)r   r   ir   j r   r/var/www/html/cobodadashboardai.evdpl.com/venv/lib/python3.9/site-packages/langchain_experimental/text_splitter.pycombine_sentences   s    r   )r   r   c                 C   sr   g }t t| d D ]T}| | d }| |d  d }t|g|gd d }d| }|| || | d< q|| fS )zCalculate cosine distances between sentences.

    Args:
        sentences: List of sentences to calculate distances for.

    Returns:
        Tuple of distances and sentences.
    r   combined_sentence_embeddingr   Zdistance_to_next)r   r   r   append)r   	distancesr   Zembedding_currentZembedding_nextZ
similarityZdistancer   r   r   calculate_cosine_distances8   s    	
r!   )
percentilestandard_deviationinterquartilegradient_      g      ?BREAKPOINT_DEFAULTSc                
   @   s   e Zd ZdZdeeeeee	 ee e
ee ddd	Zee	 ee	ee	 f d
ddZee	 e	d
ddZee
 eee	 ee f dddZe
ee
 dddZdee
 eee  ee dddZee ee dddZee eee dddZdS ) SemanticChunkera  Split the text based on semantic similarity.

    Taken from Greg Kamradt's wonderful notebook:
    https://github.com/FullStackRetrieval-com/RetrievalTutorials/blob/main/tutorials/LevelsOfTextSplitting/5_Levels_Of_Text_Splitting.ipynb

    All credits to him.

    At a high level, this splits into sentences, then groups into groups of 3
    sentences, and then merges one that are similar in the embedding space.
    r   Fr"   N(?<=[.?!])\s+)
embeddingsr   add_start_indexbreakpoint_threshold_typebreakpoint_threshold_amountnumber_of_chunkssentence_split_regexmin_chunk_sizec	           	      C   sH   || _ || _|| _|| _|| _|| _|d u r8t| | _n|| _|| _d S )N)	_add_start_indexr+   r   r-   r/   r0   r(   r.   r1   )	selfr+   r   r,   r-   r.   r/   r0   r1   r   r   r   __init__o   s    zSemanticChunker.__init__)r    r   c                 C   s   | j dkr"ttt|| j|fS | j dkrPttt|| jt|  |fS | j dkrt|ddg\}}|| }t|| j|  |fS | j dkrt|t	dt
|}ttt|| j|fS td| j  d S )	Nr"   r#   r$      K   r%   r   z,Got unexpected `breakpoint_threshold_type`: )r-   r
   floatnpr"   r.   meanZstdr%   r   r   
ValueError)r3   r    q1Zq3ZiqrZdistance_gradientr   r   r   _calculate_breakpoint_threshold   sN    



z/SemanticChunker._calculate_breakpoint_thresholdc                 C   s   | j du rtdt|d }}d\}}tt| j ||}||krH|}n||| ||  ||   }tt|dd}ttt||S )zn
        Calculate the threshold based on the number of chunks.
        Inverse of percentile method.
        Nz:This should never be called if `number_of_chunks` is None.g        )g      ?g      Y@r   d   )	r/   r:   r   maxminr
   r7   r8   r"   )r3   r    x1y1Zx2y2xyr   r   r   _threshold_from_clusters   s    
z(SemanticChunker._threshold_from_clusters)single_sentences_listr   c                 C   sZ   dd t |D }t|| j}| jdd |D }t |D ]\}}|| |d< q<t|S )z$Split text into multiple components.c                 S   s   g | ]\}}||d qS ))r   indexr   .0r   rC   r   r   r   
<listcomp>   s   zASemanticChunker._calculate_sentence_distances.<locals>.<listcomp>c                 S   s   g | ]}|d  qS )r   r   )rI   rC   r   r   r   rJ          r   )	enumerater   r   r+   Zembed_documentsr!   )r3   rF   Z
_sentencesr   r+   r   r   r   r   r   _calculate_sentence_distances   s    z-SemanticChunker._calculate_sentence_distances)textr   c                    s"  t | j|}t|dkr|S | jdkr8t|dkr8|S | |\}}| jd ur`| | |}n| |\ } fddt	|D }g }d}|D ]X}	|	}
|||
d  }d
dd |D }| jd urt|| jk rq|| |	d }q|t|k rd
d	d ||d  D }|| |S )
Nr   r%      c                    s   g | ]\}}| kr|qS r   r   rH   Zbreakpoint_distance_thresholdr   r   rJ      s   z.SemanticChunker.split_text.<locals>.<listcomp>r   r   c                 S   s   g | ]}|d  qS r   r   rI   dr   r   r   rJ      rK   c                 S   s   g | ]}|d  qS rQ   r   rR   r   r   r   rJ     rK   )resplitr0   r   r-   rM   r/   rE   r<   rL   joinr1   r   )r3   rN   rF   r    r   Zbreakpoint_arrayZindices_above_threshchunksstart_indexrG   Z	end_indexgroupZcombined_textr   rP   r   
split_text   sH    






zSemanticChunker.split_text)texts	metadatasr   c                 C   s~   |pi gt | }g }t|D ]Z\}}d}| |D ]B}t|| }	| jrT||	d< t||	d}
||
 |t |7 }q4q|S )z&Create documents from a list of texts.r   rX   )page_contentmetadata)r   rL   rZ   copydeepcopyr2   r   r   )r3   r[   r\   Z
_metadatas	documentsr   rN   rX   chunkr^   Znew_docr   r   r   create_documents  s    
z SemanticChunker.create_documents)ra   r   c                 C   s:   g g  }}|D ]}| |j | |j q| j||dS )zSplit documents.)r\   )r   r]   r^   rc   )r3   ra   r[   r\   docr   r   r   split_documents!  s
    
zSemanticChunker.split_documents)ra   kwargsr   c                 K   s   |  t|S )z2Transform sequence of documents by splitting them.)re   list)r3   ra   rf   r   r   r   transform_documents)  s    z#SemanticChunker.transform_documents)r   Fr"   NNr*   N)N)__name__
__module____qualname____doc__r   intboolBreakpointThresholdTyper   r7   strr4   r   r	   r<   rE   dictrM   rZ   r   rc   r   re   r   r   rh   r   r   r   r   r)   c   sJ          "> 	r)   )r   )rl   r_   rT   typingr   r   r   r   r   r   r   r	   r
   numpyr8   Zlangchain_community.utils.mathr   Zlangchain_core.documentsr   r   Zlangchain_core.embeddingsr   rq   rm   r   r7   r!   ro   r(   __annotations__r)   r   r   r   r   <module>   s$   ,)$ 