from __future__ import annotations

import copy
import logging
from abc import ABC, abstractmethod
from dataclasses import dataclass
from enum import Enum
from typing import (
    AbstractSet,
    Any,
    Callable,
    Collection,
    Iterable,
    List,
    Literal,
    Optional,
    Sequence,
    Type,
    TypeVar,
    Union,
)

from langchain_core.documents import BaseDocumentTransformer, Document

logger = logging.getLogger(__name__)

TS = TypeVar("TS", bound="TextSplitter")
   @  s   e Zd ZdZddedddfddddd	d	d
dddZedddddZd6ddddddZdddddZ	dddddd Z
d!ddd"d#d$Zed%d%d d&d'd(Zed)de d*fd+ddd,d-d%d.d/d0d1Zd2d%d2d3d4d5ZdS )7r   z)Interface for splitting text into chunks.i     FTintzCallable[[str], int]z$Union[bool, Literal['start', 'end']]boolNone)
chunk_sizechunk_overlaplength_functionkeep_separatoradd_start_indexstrip_whitespacereturnc                 C  sF   ||krt d| d| d|| _|| _|| _|| _|| _|| _dS )ad  Create a new TextSplitter.

        Args:
            chunk_size: Maximum size of chunks to return
            chunk_overlap: Overlap in characters between chunks
            length_function: Function that measures the length of given chunks
            keep_separator: Whether to keep the separator and where to place it
                            in each corresponding chunk (True='start')
            add_start_index: If `True`, includes chunk's start index in metadata
            strip_whitespace: If `True`, strips whitespace from the start and end of
                              every document
        zGot a larger chunk overlap (z) than chunk size (z), should be smaller.N)
ValueError_chunk_size_chunk_overlap_length_functionZ_keep_separator_add_start_index_strip_whitespace)selfr   r   r   r   r    r!    r*   k/var/www/html/cobodadashboardai.evdpl.com/venv/lib/python3.9/site-packages/langchain_text_splitters/base.py__init__!   s    zTextSplitter.__init__str	List[str]textr"   c                 C  s   dS )z$Split text into multiple components.Nr*   )r)   r0   r*   r*   r+   
    @abstractmethod
    def split_text(self, text: str) -> List[str]:
        """Split text into multiple components."""

    def create_documents(
        self, texts: List[str], metadatas: Optional[List[dict]] = None
    ) -> List[Document]:
        """Create documents from a list of texts."""
        _metadatas = metadatas or [{}] * len(texts)
        documents = []
        for i, text in enumerate(texts):
            index = 0
            previous_chunk_len = 0
            for chunk in self.split_text(text):
                metadata = copy.deepcopy(_metadatas[i])
                if self._add_start_index:
                    # Search from the running offset so a repeated chunk is
                    # located after the previous match, not at its first
                    # occurrence in the text.
                    offset = index + previous_chunk_len - self._chunk_overlap
                    index = text.find(chunk, max(0, offset))
                    metadata["start_index"] = index
                    previous_chunk_len = len(chunk)
                new_doc = Document(page_content=chunk, metadata=metadata)
                documents.append(new_doc)
        return documents

    def split_documents(self, documents: Iterable[Document]) -> List[Document]:
        """Split documents."""
        texts, metadatas = [], []
        for doc in documents:
            texts.append(doc.page_content)
            metadatas.append(doc.metadata)
        return self.create_documents(texts, metadatas=metadatas)

    def _join_docs(self, docs: List[str], separator: str) -> Optional[str]:
        text = separator.join(docs)
        if self._strip_whitespace:
            text = text.strip()
        if text == "":
            return None
        else:
            return text
    def _merge_splits(self, splits: Iterable[str], separator: str) -> List[str]:
        # We now want to combine these smaller pieces into medium size
        # chunks to send to the LLM.
        separator_len = self._length_function(separator)

        docs = []
        current_doc: List[str] = []
        total = 0
        for d in splits:
            _len = self._length_function(d)
            if (
                total + _len + (separator_len if len(current_doc) > 0 else 0)
                > self._chunk_size
            ):
                if total > self._chunk_size:
                    logger.warning(
                        f"Created a chunk of size {total}, "
                        f"which is longer than the specified {self._chunk_size}"
                    )
                if len(current_doc) > 0:
                    doc = self._join_docs(current_doc, separator)
                    if doc is not None:
                        docs.append(doc)
                    # Keep on popping if:
                    # - we have a larger chunk than in the chunk overlap
                    # - or if we still have any chunks and the length is long
                    while total > self._chunk_overlap or (
                        total + _len + (separator_len if len(current_doc) > 0 else 0)
                        > self._chunk_size
                        and total > 0
                    ):
                        total -= self._length_function(current_doc[0]) + (
                            separator_len if len(current_doc) > 1 else 0
                        )
                        current_doc = current_doc[1:]
            current_doc.append(d)
            total += _len + (separator_len if len(current_doc) > 1 else 0)
        doc = self._join_docs(current_doc, separator)
        if doc is not None:
            docs.append(doc)
        return docs
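    # Worked example for `_merge_splits` (illustrative values, not from the
    # original source): with chunk_size=10, chunk_overlap=3, length_function=len
    # and separator " ", the splits ["foo", "bar", "baz", "qux"] are packed
    # greedily. "foo bar" (7 chars) cannot absorb "baz" (7 + 1 + 3 = 11 > 10),
    # so that chunk is emitted and "bar" is retained as the overlap, yielding
    # ["foo bar", "bar baz", "baz qux"].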
zTextSplitter._merge_splitsr   )	tokenizerkwargsr"   c                   sd   z6ddl m} t |s tdddd fdd}W n tyP   td	Y n0 | f d
|i|S )z>Text splitter that uses HuggingFace tokenizer to count length.r   )PreTrainedTokenizerBasezATokenizer received was not an instance of PreTrainedTokenizerBaser-   r   r/   c                   s   t  | S )Nr7   encoder0   rU   r*   r+   _huggingface_tokenizer_length   s    zNTextSplitter.from_huggingface_tokenizer.<locals>._huggingface_tokenizer_lengthz`Could not import transformers python package. Please install it with `pip install transformers`.r   )ZtransformersrW   
isinstancer#   ImportError)clsrU   rV   rW   r\   r*   r[   r+   from_huggingface_tokenizer   s    

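    # Usage sketch (illustrative, not from the original module): with the
    # `transformers` package installed, a concrete splitter subclass can
    # measure chunk length in tokens rather than characters:
    #
    #     from transformers import AutoTokenizer
    #
    #     tok = AutoTokenizer.from_pretrained("bert-base-uncased")
    #     splitter = MySplitter.from_huggingface_tokenizer(tok, chunk_size=100)
    #
    # Here `MySplitter` stands for any concrete TextSplitter subclass.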
    @classmethod
    def from_tiktoken_encoder(
        cls: Type[TS],
        encoding_name: str = "gpt2",
        model_name: Optional[str] = None,
        allowed_special: Union[Literal["all"], AbstractSet[str]] = set(),
        disallowed_special: Union[Literal["all"], Collection[str]] = "all",
        **kwargs: Any,
    ) -> TS:
        """Text splitter that uses tiktoken encoder to count length."""
        try:
            import tiktoken
        except ImportError:
            raise ImportError(
                "Could not import tiktoken python package. "
                "This is needed in order to calculate max_tokens_for_prompt. "
                "Please install it with `pip install tiktoken`."
            )

        if model_name is not None:
            enc = tiktoken.encoding_for_model(model_name)
        else:
            enc = tiktoken.get_encoding(encoding_name)

        def _tiktoken_encoder(text: str) -> int:
            return len(
                enc.encode(
                    text,
                    allowed_special=allowed_special,
                    disallowed_special=disallowed_special,
                )
            )

        if issubclass(cls, TokenTextSplitter):
            extra_kwargs = {
                "encoding_name": encoding_name,
                "model_name": model_name,
                "allowed_special": allowed_special,
                "disallowed_special": disallowed_special,
            }
            kwargs = {**kwargs, **extra_kwargs}

        return cls(length_function=_tiktoken_encoder, **kwargs)

    def transform_documents(
        self, documents: Sequence[Document], **kwargs: Any
    ) -> Sequence[Document]:
        """Transform sequence of documents by splitting them."""
        return self.split_documents(list(documents))
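
# Illustrative sketch, not part of the original module: a minimal concrete
# subclass showing how an implementation of `split_text` plugs into the base
# class's `_merge_splits` packing logic. The class name and the fixed
# separator are assumptions made for demonstration only.
class _SimpleCharacterSplitter(TextSplitter):
    """Toy splitter that cuts on a fixed separator, then re-merges pieces."""

    def __init__(self, separator: str = "\n\n", **kwargs: Any) -> None:
        super().__init__(**kwargs)
        self._separator = separator

    def split_text(self, text: str) -> List[str]:
        # Cut into small pieces, then let the base class pack them into
        # chunks of at most `_chunk_size` with `_chunk_overlap` of overlap.
        return self._merge_splits(text.split(self._separator), self._separator)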

class TokenTextSplitter(TextSplitter):
    """Splitting text to tokens using model tokenizer."""

    def __init__(
        self,
        encoding_name: str = "gpt2",
        model_name: Optional[str] = None,
        allowed_special: Union[Literal["all"], AbstractSet[str]] = set(),
        disallowed_special: Union[Literal["all"], Collection[str]] = "all",
        **kwargs: Any,
    ) -> None:
        """Create a new TextSplitter."""
        super().__init__(**kwargs)
        try:
            import tiktoken
        except ImportError:
            raise ImportError(
                "Could not import tiktoken python package. "
                "This is needed for TokenTextSplitter. "
                "Please install it with `pip install tiktoken`."
            )

        if model_name is not None:
            enc = tiktoken.encoding_for_model(model_name)
        else:
            enc = tiktoken.get_encoding(encoding_name)
        self._tokenizer = enc
        self._allowed_special = allowed_special
        self._disallowed_special = disallowed_special
    def split_text(self, text: str) -> List[str]:
        """Splits the input text into smaller chunks based on tokenization.

        This method uses a custom tokenizer configuration to encode the input text
        into tokens, processes the tokens in chunks of a specified size with overlap,
        and decodes them back into text chunks. The splitting is performed using the
        `split_text_on_tokens` function.

        Args:
            text (str): The input text to be split into smaller chunks.

        Returns:
            List[str]: A list of text chunks, where each chunk is derived from a
            portion of the input text based on the tokenization and chunking rules.
        """

        def _encode(_text: str) -> List[int]:
            return self._tokenizer.encode(
                _text,
                allowed_special=self._allowed_special,
                disallowed_special=self._disallowed_special,
            )

        tokenizer = Tokenizer(
            chunk_overlap=self._chunk_overlap,
            tokens_per_chunk=self._chunk_size,
            decode=self._tokenizer.decode,
            encode=_encode,
        )

        return split_text_on_tokens(text=text, tokenizer=tokenizer)
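
# Usage sketch (illustrative, not from the original module): split a string
# into chunks of at most 10 tokens with 2 tokens of overlap. This requires the
# optional `tiktoken` dependency; the encoding name is the class default.
def _demo_token_split(text: str) -> List[str]:
    splitter = TokenTextSplitter(encoding_name="gpt2", chunk_size=10, chunk_overlap=2)
    return splitter.split_text(text)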
d	Zd
ZdZdZdZdZdZdZdZdZdZdZdZdZdZdZdZdZdZdS )Languagez"Enum of the programming languages.cppgojavakotlinjstsphpprotopythonrstrubyrustscalaswiftmarkdownlatexhtmlZsolcsharpcobolcluaperlhaskellelixir
powershellN)rv   rw   rx   ry   CPPZGOZJAVAZKOTLINZJSr   PHPPROTOPYTHONZRSTZRUBYZRUSTZSCALAZSWIFTMARKDOWNZLATEXHTMLZSOLZCSHARPCOBOLCZLUAZPERLZHASKELLZELIXIRZ
POWERSHELLr*   r*   r*   r+   r     s6   r   T)frozenc                   @  s2   e Zd ZU dZded< ded< ded< ded< d	S )

@dataclass(frozen=True)
class Tokenizer:
    """Tokenizer data class."""

    chunk_overlap: int
    """Overlap in tokens between chunks"""
    tokens_per_chunk: int
    """Maximum number of tokens per chunk"""
    decode: Callable[[List[int]], str]
    """Function to decode a list of token ids to a string"""
    encode: Callable[[str], List[int]]
    """Function to encode a string to a list of token ids"""

def split_text_on_tokens(*, text: str, tokenizer: Tokenizer) -> List[str]:
    """Split incoming text and return chunks using tokenizer."""
    splits: List[str] = []
    input_ids = tokenizer.encode(text)
    start_idx = 0
    cur_idx = min(start_idx + tokenizer.tokens_per_chunk, len(input_ids))
    chunk_ids = input_ids[start_idx:cur_idx]
    while start_idx < len(input_ids):
        splits.append(tokenizer.decode(chunk_ids))
        if cur_idx == len(input_ids):
            break
        # Advance by (tokens_per_chunk - chunk_overlap) so consecutive
        # chunks share `chunk_overlap` tokens.
        start_idx += tokenizer.tokens_per_chunk - tokenizer.chunk_overlap
        cur_idx = min(start_idx + tokenizer.tokens_per_chunk, len(input_ids))
        chunk_ids = input_ids[start_idx:cur_idx]
    return splits
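
# Minimal end-to-end sketch of `split_text_on_tokens` using a hand-rolled
# whitespace "tokenizer", so no third-party dependency is needed. The demo
# text and the 4-token chunks with 1 token of overlap are arbitrary values.
if __name__ == "__main__":
    sample = "one two three four five six seven eight nine ten"
    words = sample.split()
    whitespace_tokenizer = Tokenizer(
        chunk_overlap=1,
        tokens_per_chunk=4,
        decode=lambda ids: " ".join(words[i] for i in ids),
        encode=lambda text: list(range(len(text.split()))),
    )
    # Prints: "one two three four", "four five six seven", "seven eight nine ten"
    for chunk in split_text_on_tokens(text=sample, tokenizer=whitespace_tokenizer):
        print(chunk)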