a
    bg                     @   s   d dl Z d dlZd dlZd dlmZ d dlmZ d dlmZm	Z	m
Z
mZ d dlmZ d dlmZ d dlmZ G dd	 d	eeZdS )
    N)ABC)Path)IteratorListSetTuple)Document)BaseBlobParser)Blobc                   @   s~   e Zd ZdZeee dddZeee dddZe	j
eeeeeef  ddd	Zee	j
ee ee ee d
ddZdS )
VsdxParserzParser for vsdx files.)blobreturnc                 C   s
   |  |S )zParse a vsdx file.)
lazy_parse)selfr    r   /var/www/html/cobodadashboardai.evdpl.com/venv/lib/python3.9/site-packages/langchain_community/document_loaders/parsers/vsdx.pyparse   s    zVsdxParser.parsec              	   #   s~      J}t|d}| | j}W d   n1 s:0    Y  W d   n1 sX0    Y   fdd|D E dH  dS )zoRetrieve the contents of pages from a .vsdx file
        and insert them into documents, one document per page.rNc                    s(   g | ] \}}}t | j||d dqS ))sourcepage	page_name)page_contentmetadata)r   r   ).0page_numberr   r   r   r   r   
<listcomp>   s   	z)VsdxParser.lazy_parse.<locals>.<listcomp>)Zas_bytes_iozipfileZipFileget_pages_contentr   )r   r   Zpdf_file_objzfileZpagesr   r   r   r      s    
J
	zVsdxParser.lazy_parse)r    r   r   c                    s  zddl W n ty&   tdY n0 d vrFtd| dS d vrdtd| dS d vrtd	| dS d}d}d}t|d
 d trdd |d
 d D  n|d
 d d 	 g t|d d trdd |d d D nd|d d d  g|d d d d dt
  }dd |D } fdd|D }g }|D ]}	t|	}
td|
}t
|dkr~d|}dddddd d!}| D ]\}}|||}q||d" q~fd#d|D }g }tt||D ]X\}\}| ||dfd$d|D fd%d|D  }||||f q,|S )&a  Get the content of the pages of a vsdx file.

        Attributes:
            zfile (zipfile.ZipFile): The vsdx file under zip format.
            source (str): The path of the vsdx file.

        Returns:
            list[tuple[int, str, str]]: A list of tuples containing the page number,
            the name of the page and the content of the page
            for each page of the vsdx file.
        r   NzfThe xmltodict library is required to parse vsdx files. Please install it with `pip install xmltodict`.zvisio/pages/pages.xmlz'WARNING - No pages.xml file found in {}z visio/pages/_rels/pages.xml.relsz,WARNING - No pages.xml.rels file found in {}zdocProps/app.xmlz%WARNING - No app.xml file found in {}ZPagesZPagec                 S   s   g | ]}|d    qS )@Namestripr   relr   r   r   r   P   s   z0VsdxParser.get_pages_content.<locals>.<listcomp>r!   RelationshipsRelationshipc                 S   s   g | ]}d |d  qS )visio/pages/@Targetr   r$   r   r   r   r   X   s   r(   r)   
PropertiesZTitlesOfPartsz	vt:vectorzvt:lpstrc                 S   s   g | ]}|  qS r   r"   r   namer   r   r   r   d       c                    s   g | ]}  |  qS r   )indexr#   r+   )disordered_namesdisordered_pathsr   r   r   e   s   z("#text"\s*:\s*"([^\\"]*(?:\\.[^\\"]*)*)"
	-'   é   ô)z\nz\tz\u2013z\u2019z\u00e9rz\u00f4mer   r   c              
      sJ   g | ]B}d t |j d v r| d t |j ddqS )zvisio/pages/_rels/z	.xml.rels)pathcontent)r   stemnamelistr   read)r   Z	page_path)	xmltodictr    r   r   r      s   c                    s    g | ]}|d   v r|d qS r7   r   r   Zpage_)relationshipsr   r   r      s   c                    s    g | ]}|d   kr|d qS r7   r   r>   )r8   r   r   r      s   )r=   ImportErrorr;   printformatr   r<   
isinstancelistr#   lenjsondumpsrefindalljoinitemsreplaceappend	enumeratezipget_relationships)r   r    r   Zpagesxml_contentZappxml_contentZpagesxmlrels_contentZordered_namesZordered_pathsZdisordered_pagesr9   Zstring_contentZsamplesr   Zmap_symboleskeyvaluepagexml_relsZordered_pagesr   r   r   )r/   r0   r8   r?   r=   r    r   r   (   s    








zVsdxParser.get_pages_content)r   r    filelistrS   r   c                    s   t  j}t  jd| d }t|| vr:t S t fdd|D }t|d d trzdd |d d D }n|d d d	 g}tfd
d|D 	|}	|	D ]}
|	| 
|
|||B }	q|	S )a  Get the relationships of a page and the relationships of its relationships,
        etc... recursively.
        Pages are based on other pages (ex: background page),
        so we need to get all the relationships to get all the content of a single page.
        z_rels/z.relsc                 3   s"   | ]}|d   kr|d V  qdS )r8   r9   Nr   r>   )r   r   r   	<genexpr>   s   z/VsdxParser.get_relationships.<locals>.<genexpr>r&   r'   c                 S   s   g | ]}|d  qS )r)   r   r$   r   r   r   r      s   z0VsdxParser.get_relationships.<locals>.<listcomp>r)   c                    s   g | ]}t  | qS r   )str)r   target)parent_pathr   r   r      r-   )r   r,   parentrV   r;   setnextrC   rD   intersectionrP   )r   r   r    rT   rS   Z	name_pathZ	rels_pathZpagexml_rels_contenttargetsr?   r%   r   )r   rX   r   rP      s.    


zVsdxParser.get_relationshipsN)__name__
__module____qualname____doc__r
   r   r   r   r   r   r   rV   r   r   intr   dictr   rP   r   r   r   r   r      s    r   )rF   rH   r   abcr   pathlibr   typingr   r   r   r   Z%langchain_community.docstore.documentr   Z)langchain_community.document_loaders.baser	   Z1langchain_community.document_loaders.blob_loadersr
   r   r   r   r   r   <module>   s   