a
    `gB                     @  s   d dl mZ d dlZd dlmZmZmZmZmZm	Z	 d dl
mZ d dlmZ d dlmZ G dd deZG d	d
 d
ZG dd deZG dd deZG dd dZdS )    )annotationsN)AnyDictListTuple	TypedDictUnionDocument)Language)RecursiveCharacterTextSplitterc                      s(   e Zd ZdZddd fddZ  ZS )MarkdownTextSplitterz=Attempts to split the text along Markdown-formatted headings.r   None)kwargsreturnc                   s&   |  tj}t jf d|i| dS )z"Initialize a MarkdownTextSplitter.
separatorsN)Zget_separators_for_languager   MARKDOWNsuper__init__)selfr   r   	__class__ o/var/www/html/cobodadashboardai.evdpl.com/venv/lib/python3.9/site-packages/langchain_text_splitters/markdown.pyr      s    zMarkdownTextSplitter.__init__)__name__
__module____qualname____doc__r   __classcell__r   r   r   r   r      s   r   c                   @  sD   e Zd ZdZdddddddZd	d
dddZdd
dddZdS )MarkdownHeaderTextSplitterz4Splitting markdown files based on specified headers.FTzList[Tuple[str, str]]boolheaders_to_split_onreturn_each_linestrip_headersc                 C  s$   || _ t|dd dd| _|| _dS )a  Create a new MarkdownHeaderTextSplitter.

        Args:
            headers_to_split_on: Headers we want to track
            return_each_line: Return each line w/ associated headers
            strip_headers: Strip split headers from the content of the chunk
        c                 S  s   t | d S )Nr   )len)splitr   r   r   <lambda>*       z5MarkdownHeaderTextSplitter.__init__.<locals>.<lambda>T)keyreverseN)r#   sortedr"   r$   r   r"   r#   r$   r   r   r   r      s
    
z#MarkdownHeaderTextSplitter.__init__zList[LineType]List[Document])linesr   c                 C  s   g }|D ]}|rB|d d |d krB|d d  d|d  7  < q|r|d d |d krt |d d t |d k r|d d dd d dkr| js|d d  d|d  7  < |d |d d< q|| qdd	 |D S )
zCombine lines with common metadata into chunks.

        Args:
            lines: Line of text / associated header metadata
        metadatacontentz  

r   #c                 S  s    g | ]}t |d  |d dqS r1   r0   page_contentr0   r	   .0chunkr   r   r   
<listcomp>U   s   zHMarkdownHeaderTextSplitter.aggregate_lines_to_chunks.<locals>.<listcomp>)r%   r&   r$   append)r   r.   Zaggregated_chunksliner   r   r   aggregate_lines_to_chunks/   s.    z4MarkdownHeaderTextSplitter.aggregate_lines_to_chunksstrtextr   c                 C  s&  | d}g }g }i }g }i }d}d}	|D ]}
|
 }dttj|}|s|drp|ddkrpd}d}	q|drd}d}	n||	rd}d}	|r|| q*| j	D ]\}}||rt
|t
|ks|t
| dkr|d	ur`|d
}|r.|d d |kr.| }|d |v r||d  q|||t
|d	  d}|| |d ||< |r|d|| d |  | js||  qq|r|| n(|r|d|| d |  | }q*|r|d||d | js| |S dd |D S d	S )zLSplit markdown file.

        Args:
            text: Markdown file
        r2   F z```   Tz~~~ Nr3   r/   levelname)rD   rE   datarF   )r1   r0   c                 S  s    g | ]}t |d  |d dqS r4   r	   r7   r   r   r   r:      s   z9MarkdownHeaderTextSplitter.split_text.<locals>.<listcomp>)r&   stripjoinfilterr>   isprintable
startswithcountr;   r"   r%   popcopyclearr$   r#   r=   )r   r@   r.   Zlines_with_metadataZcurrent_contentZcurrent_metadataZheader_stackZinitial_metadataZin_code_blockZopening_fencer<   Zstripped_lineseprE   Zcurrent_header_levelZpopped_headerheaderr   r   r   
split_textZ   s    










	
z%MarkdownHeaderTextSplitter.split_textN)FT)r   r   r   r   r   r=   rR   r   r   r   r   r      s     +r   c                   @  s"   e Zd ZU dZded< ded< dS )LineTypezLine type as typed dict.zDict[str, str]r0   r>   r1   Nr   r   r   r   __annotations__r   r   r   r   rS      s   
rS   c                   @  s*   e Zd ZU dZded< ded< ded< dS )
HeaderTypezHeader type as typed dict.intrD   r>   rE   rF   NrT   r   r   r   r   rV      s   
rV   c                   @  s   e Zd ZdZdddddddZd*ddddddZdddddZddddddZddddddZddd d!Z	dd"d#d$d%Z
dd"d#d&d'Zdd"d#d(d)Zd	S )+&ExperimentalMarkdownSyntaxTextSplittera  An experimental text splitter for handling Markdown syntax.

    This splitter aims to retain the exact whitespace of the original text while
    extracting structured metadata, such as headers. It is a re-implementation of the
    MarkdownHeaderTextSplitter with notable changes to the approach and
    additional features.

    Key Features:
    - Retains the original whitespace and formatting of the Markdown text.
    - Extracts headers, code blocks, and horizontal rules as metadata.
    - Splits out code blocks and includes the language in the "Code" metadata key.
    - Splits text on horizontal rules (`---`) as well.
    - Defaults to sensible splitting behavior, which can be overridden using the
      `headers_to_split_on` parameter.

    Parameters:
    ----------
    headers_to_split_on : List[Tuple[str, str]], optional
        Headers to split on, defaulting to common Markdown headers if not specified.
    return_each_line : bool, optional
        When set to True, returns each line as a separate chunk. Default is False.

    Usage example:
    --------------
    >>> headers_to_split_on = [
    >>>     ("#", "Header 1"),
    >>>     ("##", "Header 2"),
    >>> ]
    >>> splitter = ExperimentalMarkdownSyntaxTextSplitter(
    >>>     headers_to_split_on=headers_to_split_on
    >>> )
    >>> chunks = splitter.split(text)
    >>> for chunk in chunks:
    >>>     print(chunk)

    This class is currently experimental and subject to change based on feedback and
    further development.
    zHeader 1zHeader 2zHeader 3zHeader 4zHeader 5zHeader 6)r3   z##z###z####z#####z######NFTz"Union[List[Tuple[str, str]], None]r    r!   c                 C  s@   g | _ tdd| _g | _|| _|r.t|| _n| j| _|| _dS )a1  Initialize the text splitter with header splitting and formatting options.

        This constructor sets up the required configuration for splitting text into
        chunks based on specified headers and formatting preferences.

        Args:
            headers_to_split_on (Union[List[Tuple[str, str]], None]):
                A list of tuples, where each tuple contains a header tag (e.g., "h1")
                and its corresponding metadata key. If None, default headers are used.
            return_each_line (bool):
                Whether to return each line as an individual chunk.
                Defaults to False, which aggregates lines into larger chunks.
            strip_headers (bool):
                Whether to exclude headers from the resulting chunks.
                Defaults to True.
        rA   r6   N)	chunksr
   current_chunkcurrent_header_stackr$   dictsplittable_headersDEFAULT_HEADER_KEYSr#   r,   r   r   r   r     s    z/ExperimentalMarkdownSyntaxTextSplitter.__init__r>   r-   r?   c           	      C  s  | j   tdd| _| j  |jdd}|r|d}| |}| |}| 	|}|r| 
  | jsz| j j|7  _t|d}|d}| || q,|r| 
  | ||| j_|d| jjd< | 
  q,|r| 
  q,| j j|7  _q,| 
  | jrd	d
 | j D S | j S )ay  Split the input text into structured chunks.

        This method processes the input text line by line, identifying and handling
        specific patterns such as headers, code blocks, and horizontal rules to
        split it into structured chunks based on headers, code blocks, and
        horizontal rules.

        Args:
            text (str): The input text to be split into chunks.

        Returns:
            List[Document]: A list of `Document` objects representing the structured
            chunks of the input text. If `return_each_line` is enabled, each line
            is returned as a separate `Document`.
        rA   rY   T)keependsr   rB      ZCodec                 S  s6   g | ].}|j  D ]}|r| st||jd qqS )r5   )r6   
splitlinesisspacer
   r0   )r8   r9   r<   r   r   r   r:   q  s   zEExperimentalMarkdownSyntaxTextSplitter.split_text.<locals>.<listcomp>)rZ   rO   r
   r[   r\   rb   rM   _match_header_match_code_match_horz_complete_chunk_docr$   r6   r%   group_resolve_header_stack_resolve_code_chunkr0   r#   )	r   r@   	raw_linesraw_lineZheader_matchZ
code_matchZ
horz_matchheader_depthheader_textr   r   r   rR   :  s@    








z1ExperimentalMarkdownSyntaxTextSplitter.split_textrW   r   )rm   rn   r   c                 C  s\   t | jD ]<\}\}}||kr
||f| j|< | jd |d  | _ d S q
| j||f d S )NrB   )	enumerater\   r;   )r   rm   rn   idepth_r   r   r   ri   y  s    z<ExperimentalMarkdownSyntaxTextSplitter._resolve_header_stackz	List[str])current_linerk   r   c                 C  s.   |}|r*| d}||7 }| |r|S qdS )Nr   rA   )rM   re   )r   rs   rk   r9   rl   r   r   r   rj     s    

z:ExperimentalMarkdownSyntaxTextSplitter._resolve_code_chunk)r   c                 C  s^   | j j}|rN| sN| jD ]$\}}| jd| }|| j j|< q| j| j  t	dd| _ d S )Nr3   rA   rY   )
r[   r6   rc   r\   r^   getr0   rZ   r;   r
   )r   Zchunk_contentrq   valueZ
header_keyr   r   r   rg     s    z:ExperimentalMarkdownSyntaxTextSplitter._complete_chunk_doczUnion[re.Match, None])r<   r   c                 C  s(   t d|}|r$|d| jv r$|S d S )Nz^(#{1,6}) (.*)rB   )rematchrh   r^   )r   r<   rw   r   r   r   rd     s    z4ExperimentalMarkdownSyntaxTextSplitter._match_headerc                   s&    fdddD }t dd |D d S )Nc                   s   g | ]}t | qS r   rv   rw   r8   ruler<   r   r   r:     r(   zFExperimentalMarkdownSyntaxTextSplitter._match_code.<locals>.<listcomp>)z^```(.*)z^~~~(.*)c                 s  s   | ]}|r|V  qd S Nr   r8   rw   r   r   r   	<genexpr>  r(   zEExperimentalMarkdownSyntaxTextSplitter._match_code.<locals>.<genexpr>nextr   r<   matchesr   r{   r   re     s    z2ExperimentalMarkdownSyntaxTextSplitter._match_codec                   s&    fdddD }t dd |D d S )Nc                   s   g | ]}t | qS r   rx   ry   r{   r   r   r:     s   zFExperimentalMarkdownSyntaxTextSplitter._match_horz.<locals>.<listcomp>)z
^\*\*\*+\nz^---+\nz^___+\nc                 s  s   | ]}|r|V  qd S r|   r   r}   r   r   r   r~     r(   zEExperimentalMarkdownSyntaxTextSplitter._match_horz.<locals>.<genexpr>r   r   r   r{   r   rf     s    
z2ExperimentalMarkdownSyntaxTextSplitter._match_horz)NFT)r   r   r   r   r_   r   rR   ri   rj   rg   rd   re   rf   r   r   r   r   rX      s&   (   !?	rX   )
__future__r   rv   typingr   r   r   r   r   r   Zlangchain_core.documentsr
   Zlangchain_text_splitters.baser   Z"langchain_text_splitters.characterr   r   r   rS   rV   rX   r   r   r   r   <module>   s    	 E