a
    `g                     @  sL   d dl mZ d dlmZmZmZmZ d dlmZm	Z	m
Z
 G dd deZdS )    )annotations)AnyListOptionalcast)TextSplitter	Tokenizersplit_text_on_tokensc                      s~   e Zd ZU dZdddddd	d
 fddZdd	dddZdddddZdddddZdZde	d< dddddZ
  ZS )%SentenceTransformersTokenTextSplitterz8Splitting text to tokens using sentence model tokenizer.2   'sentence-transformers/all-mpnet-base-v2NintstrzOptional[int]r   None)chunk_overlap
model_nametokens_per_chunkkwargsreturnc                   sr   t  jf i |d|i zddlm} W n tyD   tdY n0 || _|| j| _| jj| _| j|d dS )zCreate a new TextSplitter.r   r   )SentenceTransformerzCould not import sentence_transformers python package. This is needed in order to for SentenceTransformersTokenTextSplitter. Please install it with `pip install sentence-transformers`.)r   N)	super__init__Zsentence_transformersr   ImportErrorr   _model	tokenizer_initialize_chunk_configuration)selfr   r   r   r   r   	__class__ |/var/www/html/cobodadashboardai.evdpl.com/venv/lib/python3.9/site-packages/langchain_text_splitters/sentence_transformers.pyr      s    

z.SentenceTransformersTokenTextSplitter.__init__)r   r   c                C  sZ   t t| jj| _|d u r"| j| _n|| _| j| jkrVtd| j d| j d| j dd S )NzThe token limit of the models 'z' is: z. Argument tokens_per_chunk=z > maximum token limit.)r   r   r   Zmax_seq_lengthZmaximum_tokens_per_chunkr   
ValueErrorr   )r   r   r   r   r    r   #   s    

zESentenceTransformersTokenTextSplitter._initialize_chunk_configurationz	List[str]textr   c                   s8   ddd fdd}t  j j jj|d}t||dS )a  Splits the input text into smaller components by splitting text on tokens.

        This method encodes the input text using a private `_encode` method, then
        strips the start and stop token IDs from the encoded result. It returns the
        processed segments as a list of strings.

        Args:
            text (str): The input text to be split.

        Returns:
            List[str]: A list of string components derived from the input text after
            encoding and processing.
        r   	List[int]r"   c                   s     | dd S )N   )_encode)r#   r   r   r    %encode_strip_start_and_stop_token_idsD   s    z_SentenceTransformersTokenTextSplitter.split_text.<locals>.encode_strip_start_and_stop_token_ids)r   r   decodeencode)r#   r   )r   Z_chunk_overlapr   r   r*   r	   )r   r#   r)   r   r   r(   r    
split_text5   s    z0SentenceTransformersTokenTextSplitter.split_textc                C  s   t | |S )ay  Counts the number of tokens in the given text.

        This method encodes the input text using a private `_encode` method and
        calculates the total number of tokens in the encoded result.

        Args:
            text (str): The input text for which the token count is calculated.

        Returns:
            int: The number of tokens in the encoded text.
        )lenr'   )r   r#   r   r   r    count_tokensP   s    z2SentenceTransformersTokenTextSplitter.count_tokensl         _max_length_equal_32_bit_integerr$   c                 C  s   | j j|| jdd}|S )NZdo_not_truncate)
max_lengthZ
truncation)r   r+   r/   )r   r#   Z&token_ids_with_start_and_end_token_idsr   r   r    r'   `   s    z-SentenceTransformersTokenTextSplitter._encode)r   r   N)__name__
__module____qualname____doc__r   r   r,   r.   r/   __annotations__r'   __classcell__r   r   r   r    r
      s   
   r
   N)
__future__r   typingr   r   r   r   Zlangchain_text_splitters.baser   r   r	   r
   r   r   r   r    <module>   s   