a
    bgq                     @   sz   d dl Z d dlmZmZmZmZ d dlZd dlmZ d dl	m
Z
 d dlmZ e eZG dd deZG dd	 d	e
ZdS )
    N)DictIteratorListUnion)Document)BaseBlobParser)Blobc                   @   s   e Zd ZdZdS )ServerUnavailableExceptionz7Exception raised when the Grobid server is unavailable.N)__name__
__module____qualname____doc__ r   r   /var/www/html/cobodadashboardai.evdpl.com/venv/lib/python3.9/site-packages/langchain_community/document_loaders/parsers/grobid.pyr	      s   r	   c                   @   sP   e Zd ZdZdeeddddZeeeee ddd	Z	e
ee d
ddZdS )GrobidParserz)Load  article `PDF` files using `Grobid`.1http://localhost:8070/api/processFulltextDocumentN)segment_sentencesgrobid_serverreturnc                 C   sD   || _ || _zt| W n$ tjjy>   td tY n0 d S )NzyGROBID server does not appear up and running,                 please ensure Grobid is installed and the server is running)	r   r   requestsget
exceptionsRequestExceptionloggererrorr	   )selfr   r   r   r   r   __init__   s    zGrobidParser.__init__)	file_pathxml_datar   r   c                 #   s  zddl m} W n ty*   tdY n0 ||d}|d}|d}|rZ|d jndg }|D ]~}	|	d}
|
d	urft|	d
D ]V\}}g }g }t|dD ]\}}||j g }|dd	ur(|d	dD ]8}|	d}||d |d |d |d |d d q|| |du rt
|dkr|d d |d d  }}|jt||g|
j|
d||fd}|| q|dur|d d d |d d d  }}d|t|||
j|
d||fd}|| qqf fdd|D E d	H  d	S )z!Process the XML file from Grobin.r   )BeautifulSoupzA`bs4` package not found, please install it with `pip install bs4`xmldivtitlezNo title foundheadNpsZcoords;,            )pagexyhwTr,   n)textparabboxessection_titlesection_numberpages c                    sj   g | ]b}t |d  tt|d  t|d t|d t|d t|d t|d tt ddqS )r3   r4   r5   r8   r6   r7   )r3   r4   r5   r8   r6   r7   Zpaper_titler   )Zpage_contentmetadata)r   dictstr).0chunkr   r"   r   r   
<listcomp>i   s   





z,GrobidParser.process_xml.<locals>.<listcomp>)Zbs4r   ImportErrorZfind_allr3   find	enumerateappendr   splitlenr<   join)r   r   r   r   r   ZsoupsectionstitleschunkssectionsectiZ	paragraphZchunk_bboxesZparagraph_textZsentenceZsbboxesZbboxboxZfpageZlpageZsentence_dictZparagraph_dictr   r?   r   process_xml&   sv    






	
zGrobidParser.process_xml)blobr   c           	   	   C   s   |j }|d u rtdt|d}d||dddifi}zNi }dD ]}d||< q>d	d
g|d< |p^i }tjd| jd d ||dd}|j}W n$ tjjy   t	
d d }Y n0 |d u rtg S | ||| jS d S )Nzblob.source cannot be None.rbinputzapplication/pdfZExpires0)ZgenerateIDsZconsolidateHeaderZsegmentSentences1r#   r%   ZteiCoordinatesPOST<   )headersparamsfilesdatatimeoutz%GROBID server timed out. Return None.)source
ValueErroropenr   requestr   r3   r   ReadTimeoutr   r   iterrO   r   )	r   rP   r   ZpdfrY   rZ   paramrr   r   r   r   
lazy_parse|   s6    

	


zGrobidParser.lazy_parse)r   )r
   r   r   r   boolr<   r   r   r   rO   r   rd   r   r   r   r   r      s    Vr   )loggingtypingr   r   r   r   r   Zlangchain_core.documentsr   Z)langchain_community.document_loaders.baser   Z1langchain_community.document_loaders.blob_loadersr   	getLoggerr
   r   	Exceptionr	   r   r   r   r   r   <module>   s   
