a
    ^g%                      @  s   d Z ddlmZ ddlZddlZddlZG dd dZdddd	d
dddZddddd	ddddZdddddZ	dd Z
dS )zJThis is an educational implementation of the byte pair encoding algorithm.    )annotationsNc                   @  s   e Zd ZddddddZdddd	d
ddZd	ddddZd	ddddZd	ddddZeddddddZ	edd Z
dS )SimpleBytePairEncodingstrdict[bytes, int]None)pat_strmergeable_ranksreturnc                C  s0   || _ || _dd | D | _t|| _dS )zCreates an Encoding object.c                 S  s   i | ]\}}||qS  r
   ).0token_bytestokenr
   r
   c/var/www/html/cobodadashboardai.evdpl.com/venv/lib/python3.9/site-packages/tiktoken/_educational.py
<dictcomp>       z3SimpleBytePairEncoding.__init__.<locals>.<dictcomp>N)r   r   items_decoderregexcompile_pat)selfr   r   r
   r
   r   __init__   s    zSimpleBytePairEncoding.__init__colour
str | None	list[int])text	visualiser	   c                 C  sB   | j |}g }|D ](}|d}t| j||d}|| q|S )z`Encodes a string into tokens.

        >>> enc.encode("hello world")
        [388, 372]
        utf-8)r   )r   findallencode
bpe_encoder   extend)r   r   r   wordstokenswordZ
word_bytesZword_tokensr
   r
   r   r      s    
zSimpleBytePairEncoding.encodebytes)r#   r	   c                   s   d  fdd|D S )znDecodes a list of tokens into bytes.

        >>> enc.decode_bytes([388, 372])
        b'hello world'
        r   c                 3  s   | ]} j | V  qd S Nr   r   r   r   r
   r   	<genexpr>-   r   z6SimpleBytePairEncoding.decode_bytes.<locals>.<genexpr>)joinr   r#   r
   r)   r   decode_bytes'   s    z#SimpleBytePairEncoding.decode_bytesc                 C  s   |  |jdddS )u   Decodes a list of tokens into a string.

        Decoded bytes are not guaranteed to be valid UTF-8. In that case, we replace
        the invalid bytes with the replacement character "�".

        >>> enc.decode([388, 372])
        'hello world'
        r   replaceerrors)r-   decoder,   r
   r
   r   r1   /   s    	zSimpleBytePairEncoding.decodelist[bytes]c                   s    fdd|D S )zDecodes a list of tokens into a list of bytes.

        Useful for visualising how a string is tokenised.

        >>> enc.decode_tokens_bytes([388, 372])
        [b'hello', b' world']
        c                   s   g | ]} j | qS r
   r'   r(   r)   r
   r   
<listcomp>B   r   z>SimpleBytePairEncoding.decode_tokens_bytes.<locals>.<listcomp>r
   r,   r
   r)   r   decode_tokens_bytes:   s    z*SimpleBytePairEncoding.decode_tokens_bytesint)training_data
vocab_sizer   c                 C  s   t | ||d}t||dS )z#Train a BPE tokeniser on some data!)datar7   r   r   r   )	bpe_trainr   )r6   r7   r   r   r
   r
   r   trainD   s    zSimpleBytePairEncoding.trainc                 C  s$   t | trt| } t| j| jdS )Nr9   )
isinstancer   tiktokenZget_encodingr   Z_pat_strZ_mergeable_ranks)encodingr
   r
   r   from_tiktokenJ   s
    

z$SimpleBytePairEncoding.from_tiktokenN)r   )__name__
__module____qualname__r   r   r-   r1   r4   staticmethodr;   r?   r
   r
   r
   r   r      s   

r   r   r   r%   r   r   )r   inputr   r	   c           
        s  dd |D }|r4|dv r$t | n|dkr4t| d }d }tt|d d |dd  D ]>\}} |d |d  }|d urZ|d u s||k rZ|}|}qZ|d u rq|d usJ |d | || ||d   g ||d d   }q|rt   fd	d|D }	|	S )
Nc                 S  s   g | ]}t |gqS r
   r%   r   br
   r
   r   r3   V   r   zbpe_encode.<locals>.<listcomp>r   colorsimple   r      c                   s   g | ]} | qS r
   r
   )r   partr   r
   r   r3   s   r   )visualise_tokensprint	enumeratezipget)
r   rD   r   partsmin_idxZmin_rankipairZrankr#   r
   rO   r   r    S   s*    
&4r    r   r5   )r8   r7   r   r   r	   c                   s
  |dk rt di }tdD ]}||t|g< qdd t|| D }t||k rt  |D ]4}t|d d |dd  D ]} |  d7  < q~q`t	  fddd	}	|	d
 |	d  }
t|}|||
< g }|D ]}g }d
}|t|d k r6|| ||d  f|	kr|
|
 |d7 }q|
||  |d7 }q|t|d krV|
||  |
| q|}|rFtd|	d
  d|	d   td|
 dt| d |dv rtd tdd |d d D  n.|dkrtd |d d D ]}t| qtd qF|S )N   z;vocab_size must be at least 256, so we can encode all bytesc                 S  s    g | ]}d d | dD qS )c                 S  s   g | ]}t |gqS r
   rE   rF   r
   r
   r   r3      r   z(bpe_train.<locals>.<listcomp>.<listcomp>r   )r   )r   r$   r
   r
   r   r3      s   zbpe_train.<locals>.<listcomp>rK   rL   c                   s    |  S r&   r
   )xstatsr
   r   <lambda>   r   zbpe_train.<locals>.<lambda>)keyr   rM   z The current most common pair is z + zSo we made z our zth tokenrH   z9Now the first fifty words in our training data look like:c                 S  s   g | ]}|D ]}|qqS r
   r
   )r   r$   r   r
   r
   r   r3      r   2   rJ   z:Now the first twenty words in our training data look like:   
)
ValueErrorranger%   r   r   lencollectionsCounterrS   maxappendrQ   rP   )r8   r7   r   r   ZranksrW   r"   ZpiecerX   Zmost_common_pairr   r   Z	new_wordsr$   Znew_wordr
   r[   r   r:   w   sV    






r:   r2   r   )token_valuesr	   c                 C  s   dd dD }dd | D }d}d }|D ]\}||t |  }||krd||d t |  }||ksdJ |}|t |7 }t|| dd q(td	 d S )
Nc                 S  s   g | ]}d | dqS )z[48;5;mr
   )r   rW   r
   r
   r   r3      r   z$visualise_tokens.<locals>.<listcomp>)         M   P   D      c                 S  s   g | ]}|j d ddqS )r   r.   r/   )r1   )r   rZ   r
   r
   r   r3      r   r   rL    )endz[0m)rd   rQ   )ri   
backgroundZunicode_token_valuesZrunning_lengthZ
last_colorr   rI   r
   r
   r   rP      s    rP   c                  C  s   d} t t}| }W d    n1 s*0    Y  tj|d| d}td |d}||dkshJ ||dkszJ |	|ddgksJ |S )	NzN's|'t|'re|'ve|'m|'ll|'d| ?[\p{L}]+| ?[\p{N}]+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+iX  )r7   r   zJThis is the sequence of merges performed in order to encode 'hello world':zhello worlds   hello worlds   hellos    world)
open__file__readr   r;   rQ   r   r1   r-   r4   )Zgpt2_patternfr8   encr#   r
   r
   r   train_simple_encoding   s    
&
rz   )r   )r   )__doc__
__future__r   re   r   r=   r   r    r:   rP   rz   r
   r
   r
   r   <module>   s   H % E