a
    bg                     @   sz   d dl Z d dlZd dlmZ d dlmZmZmZmZm	Z	 d dl
mZ d dlmZ d dlmZ e eZG dd deZdS )	    N)Path)AnyIteratorListMappingOptional)Document)
BaseLoader)BibtexparserWrapperc                   @   sp   e Zd ZdZddddddeee ee ee eeddd	Z	e
eef ee d
ddZee dddZdS )BibtexLoadera  Load a `bibtex` file.

    Each document represents one entry from the bibtex file.

    If a PDF file is present in the `file` bibtex field, the original PDF
    is loaded into the document text. If no such file entry is present,
    the `abstract` field is used instead.
    Ni  Fz
[^:]+\.pdf)parsermax_docsmax_content_charsload_extra_metadatafile_pattern)	file_pathr   r   r   r   r   c                C   s4   || _ |pt | _|| _|| _|| _t|| _dS )a  Initialize the BibtexLoader.

        Args:
            file_path: Path to the bibtex file.
            parser: The parser to use. If None, a default parser is used.
            max_docs: Max number of associated documents to load. Use -1 means
                           no limit.
            max_content_chars: Maximum number of characters to load from the PDF.
            load_extra_metadata: Whether to load extra metadata from the PDF.
            file_pattern: Regex pattern to match the file name in the bibtex.
        N)	r   r
   r   r   r   r   recompile
file_regex)selfr   r   r   r   r   r    r   y/var/www/html/cobodadashboardai.evdpl.com/venv/lib/python3.9/site-packages/langchain_community/document_loaders/bibtex.py__init__   s    zBibtexLoader.__init__)entryreturnc                 C   s   dd l }t| jj}| j|dd}|s0d S g }|D ]z}zF||| $}|dd |D  W d    n1 sv0    Y  W q8 t	y } zt
| W Y d }~q8d }~0 0 q8d|p|dd}	| jr|	d | j }	| jj|| jd}
t|	|
d	S )
Nr   file c                 s   s   | ]}|  V  qd S )N)Zget_text).0pager   r   r   	<genexpr>@       z+BibtexLoader._load_entry.<locals>.<genexpr>
Zabstract)Z
load_extra)Zpage_contentmetadata)fitzr   r   parentr   findallgetopenextendFileNotFoundErrorloggerdebugjoinr   r   get_metadatar   r   )r   r   r#   
parent_dirZ
file_namesZtexts	file_namefecontentr"   r   r   r   _load_entry4   s(    6"zBibtexLoader._load_entry)r   c                 c   sl   zddl }W n ty&   tdY n0 | j| j}| jrJ|d| j }|D ]}| |}|rN|V  qNdS )a  Load bibtex file using bibtexparser and get the article texts plus the
        article metadata.
        See https://bibtexparser.readthedocs.io/en/master/

        Returns:
            a list of documents with the document.page_content in text format
        r   NzGPyMuPDF package not found, please install it with `pip install pymupdf`)r#   ImportErrorr   Zload_bibtex_entriesr   r   r3   )r   r#   entriesr   docr   r   r   	lazy_loadL   s    

zBibtexLoader.lazy_load)__name__
__module____qualname____doc__strr   r
   intboolr   r   r   r   r3   r   r7   r   r   r   r   r      s    r   )loggingr   pathlibr   typingr   r   r   r   r   Zlangchain_core.documentsr   Z)langchain_community.document_loaders.baser	   Z$langchain_community.utilities.bibtexr
   	getLoggerr8   r*   r   r   r   r   r   <module>   s   
