
    h)                         d dl Z d dlZd dlZd dlmZmZmZmZmZm	Z	 d dl
mZ d dlmZ  ej                  e      Zerd dlmZ  G d de      Zy)    N)TYPE_CHECKINGAnyIteratorListOptionalTuple)Document)
BaseLoader)SparkSessionc            	       x    e Zd ZdZ	 	 	 	 dded   dee   dedefdZd	e	e
e
f   fd
Zd	ee   fdZd	ee   fdZy)PySparkDataFrameLoaderzLoad `PySpark` DataFrames.Nspark_sessionr   dfpage_content_columnfraction_of_memoryc                    	 ddl m}m} |r|n|j                  j                         | _        t        ||      st        dt        |             || _
        || _        || _        | j                         \  | _        | _        | j                  j                   j#                  t$              | _        | j                  j(                  | _        y# t        $ r t        d      w xY w)ag  Initialize with a Spark DataFrame object.

        Args:
            spark_session: The SparkSession object.
            df: The Spark DataFrame object.
            page_content_column: The name of the column containing the page content.
             Defaults to "text".
            fraction_of_memory: The fraction of memory to use. Defaults to 0.1.
        r   )	DataFramer   zFpyspark is not installed. Please install it with `pip install pyspark`z3Expected data_frame to be a PySpark DataFrame, got N)pyspark.sqlr   r   ImportErrorbuildergetOrCreatespark
isinstance
ValueErrortyper   r   r   get_num_rowsnum_rowsmax_num_rowsrddmaplistrdd_dfcolumnscolumn_names)selfr   r   r   r   r   r   s          t/var/www/html/eduruby.in/venv/lib/python3.12/site-packages/langchain_community/document_loaders/pyspark_dataframe.py__init__zPySparkDataFrameLoader.__init__   s     	; +M0D0D0P0P0R 	
 "i(Ed2hZP  #6 "4+/+<+<+>(t(ggkkood+ GGOO%  	X 	s   C C!returnc                    	 ddl }| j                  j                  d      j	                         d   }t        j                  |      }|j                         }|j                  }t        ||z  | j                  z        }t        || j                  j                               |fS # t        $ r}t        d      |d}~ww xY w)z4Gets the number of "feasible" rows for the DataFramer   NzBpsutil not installed. Please install it with `pip install psutil`.   )psutilr   r   limitcollectsys	getsizeofvirtual_memory	availableintr   mincount)r%   r+   erowestimated_row_sizemem_infoavailable_memoryr   s           r&   r   z#PySparkDataFrameLoader.get_num_rows9   s    	
 ggmmA&&(+ ]]3/((*#-- 22d6M6MM
 <1<??  	T	s   B$ $	B>-B99B>c              #   *  K   | j                   j                         D ]m  }t        t        |            D ci c]  }| j                  |   ||    }}|| j
                     }|j                  | j
                         t        ||       o yc c}w w)z#A lazy loader for document content.)page_contentmetadataN)r"   toLocalIteratorrangelenr$   r   popr	   )r%   r6   ir<   texts        r&   	lazy_loadz PySparkDataFrameLoader.lazy_loadJ   s     ;;..0 	AC>CCHoN))!,c!f4NHND445DLL112x@@		ANs   4BBABc                 >   | j                   j                         | j                  kD  r>t        j	                  d| j                   j                          d| j
                   d       | j                         }t        t        j                  || j
                              S )zLoad from the dataframe.z The number of DataFrame rows is zQ, but we will only include the amount of rows that can reasonably fit in memory: .)
r   r4   r   loggerwarningr   rC   r!   	itertoolsislice)r%   lazy_load_iterators     r&   loadzPySparkDataFrameLoader.loadR   s{    77==?T...NN2477==?2C D>>Bmm_AO
 "^^-I$$%7GHH    )NNrB   g?)__name__
__module____qualname____doc__r   r   strfloatr'   r   r2   r   r   r	   rC   r   rK    rL   r&   r   r      s    $ 37 #)$'$,/$, SM$, !	$,
 "$,L@eCHo @"A8H- A	Id8n 	IrL   r   )rH   loggingr.   typingr   r   r   r   r   r   langchain_core.documentsr	   )langchain_community.document_loaders.baser
   	getLogger__file__rF   r   r   r   rS   rL   r&   <module>rZ      sB      
 F F - @			8	$(KIZ KIrL   