a
    bg                     @   sf   d dl Zd dlZd dlmZ d dlmZmZmZ d dl	m
Z
 d dlmZ eeZG dd deZdS )    N)Path)DictIteratorUnion)Document)
BaseLoaderc                   @   sR   e Zd ZdZd
eeef eedf eedf eddddZe	e
 ddd	ZdS )BSHTMLLoaderaS  
    __ModuleName__ document loader integration

    Setup:
        Install ``langchain-community`` and ``bs4``.

        .. code-block:: bash

            pip install -U langchain-community bs4

    Instantiate:
        .. code-block:: python

            from langchain_community.document_loaders import BSHTMLLoader

            loader = BSHTMLLoader(
                file_path="./example_data/fake-content.html",
            )

    Lazy load:
        .. code-block:: python

            docs = []
            docs_lazy = loader.lazy_load()

            # async variant:
            # docs_lazy = await loader.alazy_load()

            for doc in docs_lazy:
                docs.append(doc)
            print(docs[0].page_content[:100])
            print(docs[0].metadata)

        .. code-block:: python


            Test Title


            My First Heading
            My first paragraph.



            {'source': './example_data/fake-content.html', 'title': 'Test Title'}

    Async load:
        .. code-block:: python

            docs = await loader.aload()
            print(docs[0].page_content[:100])
            print(docs[0].metadata)

        .. code-block:: python



            Test Title


            My First Heading
            My first paragraph.



            {'source': './example_data/fake-content.html', 'title': 'Test Title'}

    N )	file_pathopen_encoding	bs_kwargsget_text_separatorreturnc                 C   sh   zddl }W n ty&   tdY n0 || _|| _|du rXtjdsPtdddi}|| _|| _dS )a  initialize with path, and optionally, file encoding to use, and any kwargs
        to pass to the BeautifulSoup object.

        Args:
            file_path: The path to the file to load.
            open_encoding: The encoding to use when opening the file.
            bs_kwargs: Any kwargs to pass to the BeautifulSoup object.
            get_text_separator: The separator to use when calling get_text on the soup.
        r   NzUbeautifulsoup4 package not found, please install it with `pip install beautifulsoup4`ZlxmlzBy default BSHTMLLoader uses the 'lxml' package. Please either install it with `pip install -U lxml` or pass in init arg `bs_kwargs={'features': '...'}` to overwrite the default BeautifulSoup kwargs.features)	bs4ImportErrorr
   r   	importlibutil	find_specr   r   )selfr
   r   r   r   r    r   z/var/www/html/cobodadashboardai.evdpl.com/venv/lib/python3.9/site-packages/langchain_community/document_loaders/html_bs.py__init__S   s     
zBSHTMLLoader.__init__)r   c                 c   s   ddl m} t| jd| jd"}||fi | j}W d   n1 sF0    Y  || j}|jrpt	|jj
}nd}t	| j|d}t||dV  dS )	z)Load HTML document into document objects.r   )BeautifulSoupr)encodingNr	   )sourcetitle)Zpage_contentmetadata)r   r   openr
   r   r   Zget_textr   r   strstringr   )r   r   fZsouptextr   r   r   r   r   	lazy_loady   s    0zBSHTMLLoader.lazy_load)NNr	   )__name__
__module____qualname____doc__r   r    r   dictr   r   r   r$   r   r   r   r   r      s   H   


&r   )importlib.utilr   loggingpathlibr   typingr   r   r   Zlangchain_core.documentsr   Z)langchain_community.document_loaders.baser   	getLoggerr%   loggerr   r   r   r   r   <module>   s   
