a
    bg<                     @   s~   d dl Z d dlZd dlZd dlmZmZmZmZmZm	Z	 d dl
mZ d dlmZ eeZerjd dlmZ G dd deZdS )    N)TYPE_CHECKINGAnyIteratorListOptionalTuple)Document)
BaseLoader)SparkSessionc                   @   sh   e Zd ZdZded ee eedddZe	e
e
f d	d
dZee d	ddZee d	ddZdS )PySparkDataFrameLoaderzLoad `PySpark` DataFrames.Ntext皙?r
   )spark_sessiondfpage_content_columnfraction_of_memoryc                 C   s   zddl m}m} W n ty.   tdY n0 |r8|n|j | _t||s`tdt	| || _
|| _|| _|  \| _| _| j
jt| _| j
j| _dS )ag  Initialize with a Spark DataFrame object.

        Args:
            spark_session: The SparkSession object.
            df: The Spark DataFrame object.
            page_content_column: The name of the column containing the page content.
             Defaults to "text".
            fraction_of_memory: The fraction of memory to use. Defaults to 0.1.
        r   )	DataFramer
   zFpyspark is not installed. Please install it with `pip install pyspark`z3Expected data_frame to be a PySpark DataFrame, got N)pyspark.sqlr   r
   ImportErrorZbuilderZgetOrCreateZspark
isinstance
ValueErrortyper   r   r   get_num_rowsnum_rowsmax_num_rowsZrddmaplistrdd_dfcolumnscolumn_names)selfr   r   r   r   r   r
    r!   /var/www/html/cobodadashboardai.evdpl.com/venv/lib/python3.9/site-packages/langchain_community/document_loaders/pyspark_dataframe.py__init__   s$    

zPySparkDataFrameLoader.__init__)returnc              
   C   s   zddl }W n. ty: } ztd|W Y d}~n
d}~0 0 | jd d }t|}| }|j}t	|| | j
 }t|| j |fS )z4Gets the number of "feasible" rows for the DataFramer   NzBpsutil not installed. Please install it with `pip install psutil`.   )psutilr   r   limitZcollectsys	getsizeofZvirtual_memory	availableintr   mincount)r    r&   erowZestimated_row_sizeZmem_infoZavailable_memoryr   r!   r!   r"   r   :   s     
z#PySparkDataFrameLoader.get_num_rowsc                 #   sT   j  D ]D  fddtt D }|j }|j t||dV  q
dS )z#A lazy loader for document content.c                    s   i | ]}j |  | qS r!   )r   ).0ir/   r    r!   r"   
<dictcomp>N       z4PySparkDataFrameLoader.lazy_load.<locals>.<dictcomp>)Zpage_contentmetadataN)r   ZtoLocalIteratorrangelenr   popr   )r    r5   r   r!   r2   r"   	lazy_loadK   s
    
z PySparkDataFrameLoader.lazy_loadc                 C   sJ   | j  | jkr0td| j   d| j d |  }tt	|| jS )zLoad from the dataframe.z The number of DataFrame rows is zQ, but we will only include the amount of rows that can reasonably fit in memory: .)
r   r-   r   loggerwarningr   r9   r   	itertoolsislice)r    Zlazy_load_iteratorr!   r!   r"   loadS   s    zPySparkDataFrameLoader.load)NNr   r   )__name__
__module____qualname____doc__r   r   strfloatr#   r   r+   r   r   r   r9   r   r?   r!   r!   r!   r"   r      s       'r   )r=   loggingr(   typingr   r   r   r   r   r   Zlangchain_core.documentsr   Z)langchain_community.document_loaders.baser	   	getLogger__file__r;   r   r
   r   r!   r!   r!   r"   <module>   s    
