o
    iQ                     @   s   d dl Z d dlmZ d dlZddlmZmZmZ ddl	m
Z
 ddlmZmZmZ ddlmZmZ ddlmZmZmZmZmZ d	d
lmZ G dd deeZdS )    N)Integral   )BaseEstimatorTransformerMixin_fit_context)resample)IntervalOptions
StrOptions)_averaged_weighted_percentile_weighted_percentile)_check_feature_names_in_check_sample_weightcheck_arraycheck_is_fittedvalidate_data   )OneHotEncoderc                   @   s   e Zd ZU dZeedddddgeh dgeh dgeh d	geee	j
e	jhdgeed
ddddgdgdZeed< 	d"dddddddddZeddd#ddZdd Zdd Zdd Zd$d d!ZdS )%KBinsDiscretizera&  
    Bin continuous data into intervals.

    Read more in the :ref:`User Guide <preprocessing_discretization>`.

    .. versionadded:: 0.20

    Parameters
    ----------
    n_bins : int or array-like of shape (n_features,), default=5
        The number of bins to produce. Raises ValueError if ``n_bins < 2``.

    encode : {'onehot', 'onehot-dense', 'ordinal'}, default='onehot'
        Method used to encode the transformed result.

        - 'onehot': Encode the transformed result with one-hot encoding
          and return a sparse matrix. Ignored features are always
          stacked to the right.
        - 'onehot-dense': Encode the transformed result with one-hot encoding
          and return a dense array. Ignored features are always
          stacked to the right.
        - 'ordinal': Return the bin identifier encoded as an integer value.

    strategy : {'uniform', 'quantile', 'kmeans'}, default='quantile'
        Strategy used to define the widths of the bins.

        - 'uniform': All bins in each feature have identical widths.
        - 'quantile': All bins in each feature have the same number of points.
        - 'kmeans': Values in each bin have the same nearest center of a 1D
          k-means cluster.

        For an example of the different strategies see:
        :ref:`sphx_glr_auto_examples_preprocessing_plot_discretization_strategies.py`.

    quantile_method : {"inverted_cdf", "averaged_inverted_cdf",
            "closest_observation", "interpolated_inverted_cdf", "hazen",
            "weibull", "linear", "median_unbiased", "normal_unbiased"},
            default="linear"
            Method to pass on to np.percentile calculation when using
            strategy="quantile". Only `averaged_inverted_cdf` and `inverted_cdf`
            support the use of `sample_weight != None` when subsampling is not
            active.

            .. versionadded:: 1.7

    dtype : {np.float32, np.float64}, default=None
        The desired data-type for the output. If None, output dtype is
        consistent with input dtype. Only np.float32 and np.float64 are
        supported.

        .. versionadded:: 0.24

    subsample : int or None, default=200_000
        Maximum number of samples, used to fit the model, for computational
        efficiency.
        `subsample=None` means that all the training samples are used when
        computing the quantiles that determine the binning thresholds.
        Since quantile computation relies on sorting each column of `X` and
        that sorting has an `n log(n)` time complexity,
        it is recommended to use subsampling on datasets with a
        very large number of samples.

        .. versionchanged:: 1.3
            The default value of `subsample` changed from `None` to `200_000` when
            `strategy="quantile"`.

        .. versionchanged:: 1.5
            The default value of `subsample` changed from `None` to `200_000` when
            `strategy="uniform"` or `strategy="kmeans"`.

    random_state : int, RandomState instance or None, default=None
        Determines random number generation for subsampling.
        Pass an int for reproducible results across multiple function calls.
        See the `subsample` parameter for more details.
        See :term:`Glossary <random_state>`.

        .. versionadded:: 1.1

    Attributes
    ----------
    bin_edges_ : ndarray of ndarray of shape (n_features,)
        The edges of each bin. Contain arrays of varying shapes ``(n_bins_, )``
        Ignored features will have empty arrays.

    n_bins_ : ndarray of shape (n_features,), dtype=np.int64
        Number of bins per feature. Bins whose width are too small
        (i.e., <= 1e-8) are removed with a warning.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    See Also
    --------
    Binarizer : Class used to bin values as ``0`` or
        ``1`` based on a parameter ``threshold``.

    Notes
    -----

    For a visualization of discretization on different datasets refer to
    :ref:`sphx_glr_auto_examples_preprocessing_plot_discretization_classification.py`.
    On the effect of discretization on linear models see:
    :ref:`sphx_glr_auto_examples_preprocessing_plot_discretization.py`.

    In bin edges for feature ``i``, the first and last values are used only for
    ``inverse_transform``. During transform, bin edges are extended to::

      np.concatenate([-np.inf, bin_edges_[i][1:-1], np.inf])

    You can combine ``KBinsDiscretizer`` with
    :class:`~sklearn.compose.ColumnTransformer` if you only want to preprocess
    part of the features.

    ``KBinsDiscretizer`` might produce constant features (e.g., when
    ``encode = 'onehot'`` and certain bins do not contain any data).
    These features can be removed with feature selection algorithms
    (e.g., :class:`~sklearn.feature_selection.VarianceThreshold`).

    Examples
    --------
    >>> from sklearn.preprocessing import KBinsDiscretizer
    >>> X = [[-2, 1, -4,   -1],
    ...      [-1, 2, -3, -0.5],
    ...      [ 0, 3, -2,  0.5],
    ...      [ 1, 4, -1,    2]]
    >>> est = KBinsDiscretizer(
    ...     n_bins=3, encode='ordinal', strategy='uniform'
    ... )
    >>> est.fit(X)
    KBinsDiscretizer(...)
    >>> Xt = est.transform(X)
    >>> Xt  # doctest: +SKIP
    array([[ 0., 0., 0., 0.],
           [ 1., 1., 1., 0.],
           [ 2., 2., 2., 1.],
           [ 2., 2., 2., 2.]])

    Sometimes it may be useful to convert the data back into the original
    feature space. The ``inverse_transform`` function converts the binned
    data into the original feature space. Each value will be equal to the mean
    of the two bin edges.

    >>> est.bin_edges_[0]
    array([-2., -1.,  0.,  1.])
    >>> est.inverse_transform(Xt)
    array([[-1.5,  1.5, -3.5, -0.5],
           [-0.5,  2.5, -2.5, -0.5],
           [ 0.5,  3.5, -1.5,  0.5],
           [ 0.5,  3.5, -1.5,  1.5]])
    r   Nleft)closedz
array-like>   onehotzonehot-denseordinal>   quantileuniformkmeans>
   Zmedian_unbiasedwarnZhazenlinearZweibullZinterpolated_inverted_cdfZclosest_observationaveraged_inverted_cdfZnormal_unbiasedinverted_cdfr   random_staten_binsencodestrategyquantile_methoddtype	subsampler    _parameter_constraints   r   r   r   i@ )r#   r$   r%   r&   r'   r    c                C   s.   || _ || _|| _|| _|| _|| _|| _d S Nr!   )selfr"   r#   r$   r%   r&   r'   r     r,   t/var/www/html/eduruby.in/lip-sync/lip-sync-env/lib/python3.10/site-packages/sklearn/preprocessing/_discretization.py__init__   s   
zKBinsDiscretizer.__init__T)Zprefer_skip_nested_validationc                    s  t | |dd}| jtjtjfv r| j}n|j}|j\}}dur(t||jd| jdur?|| jkr?t|d| j| j	d}d|jd }| 
|}tj|td}| j}	| jdkrd|	dkrdtd	t d
}	| jdkry|	dvrydurytd|	 d| jdkrdurdk}
ntd}
t|D ]7}|dd|f   |
  } |
  }||krtd|  d||< ttj tjg||< q| jdkrt|||| d ||< n| jdkr'tdd|| d }i }|	d
krdu r|	|d< du rtjtj |fi |tjd||< nttd|	 tj fdd|D tjd||< nh| jdkrddlm} t|||| d }|dd |dd  dddf d }||| |dd}|j  dddf dj!dddf }|"  |dd |dd  d ||< tj#||| |f ||< | jdv rtj$|| tjddk}|| | ||< t%|| d || krtd|  t%|| d ||< q|| _&|| _'d | j(v rt)d!d | j'D | j(d k|d"| _*| j* tdt%| j'f | S )#a  
        Fit the estimator.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Data to be discretized.

        y : None
            Ignored. This parameter exists only for compatibility with
            :class:`~sklearn.pipeline.Pipeline`.

        sample_weight : ndarray of shape (n_samples,)
            Contains weight values to be associated with each sample.

            .. versionadded:: 1.3

            .. versionchanged:: 1.7
               Added support for strategy="uniform".

        Returns
        -------
        self : object
            Returns the instance itself.
        numericr&   NT)replace	n_samplesr    sample_weightr   r   r   a%  The current default behavior, quantile_method='linear', will be changed to quantile_method='averaged_inverted_cdf' in scikit-learn version 1.9 to naturally support sample weight equivalence properties by default. Pass quantile_method='averaged_inverted_cdf' explicitly to silence this warning.r   )r   r   zWhen fitting with strategy='quantile' and sample weights, quantile_method should either be set to 'averaged_inverted_cdf' or 'inverted_cdf', got quantile_method='z
' instead.r   z3Feature %d is constant and will be replaced with 0.r   d   methodc                    s   g | ]	} |d qS ))Zpercentile_rankr,   ).0pcolumnZpercentile_funcr3   r,   r-   
<listcomp>q  s    z(KBinsDiscretizer.fit.<locals>.<listcomp>r   r   )KMeans      ?)Z
n_clustersinitZn_init)r3   )r   r   )Zto_beging:0yE>zqBins whose width are too small (i.e., <= 1e-8) in feature %d are removed. Consider decreasing the number of bins.r   c                 S   s   g | ]}t |qS r,   )npZaranger6   ir,   r,   r-   r:     s    )
categoriesZsparse_outputr&   )+r   r&   r?   float64float32shaper   r'   r   r    _validate_n_binsZzerosobjectr%   r$   warningsr   FutureWarning
ValueErrorslicerangeminmaxarrayinfZlinspaceZasarrayZ
percentiler   r   Zclusterr;   fitZcluster_centers_sortZr_Zediff1dlen
bin_edges_n_bins_r#   r   _encoder)r+   Xyr3   Zoutput_dtyper2   
n_featuresr"   	bin_edgesr%   Znnz_weight_maskjjZcol_minZcol_maxZpercentile_levelsZpercentile_kwargsr;   Zuniform_edgesr>   kmZcentersmaskr,   r8   r-   rQ      s   


	




( zKBinsDiscretizer.fitc                 C   s   | j }t|trtj||tdS t|tddd}|jdks$|jd |kr(t	d|dk ||kB }t
|d }|jd dkrQd	d
d |D }t	dtj||S )z0Returns n_bins_, the number of bins per feature.r0   TF)r&   copyZ	ensure_2dr   r   z8n_bins must be a scalar or array of shape (n_features,).r   z, c                 s   s    | ]}t |V  qd S r*   )strr@   r,   r,   r-   	<genexpr>  s    z4KBinsDiscretizer._validate_n_bins.<locals>.<genexpr>zk{} received an invalid number of bins at indices {}. Number of bins must be at least 2, and must be an int.)r"   
isinstancer   r?   fullintr   ndimrE   rJ   wherejoinformatr   __name__)r+   rY   Z	orig_binsr"   Zbad_nbins_valueZviolating_indicesindicesr,   r,   r-   rF     s"   
z!KBinsDiscretizer._validate_n_binsc                 C   s   t |  | jdu rtjtjfn| j}t| |d|dd}| j}t|jd D ]}tj	|| dd |dd|f dd|dd|f< q%| j
d	krI|S d}d
| j
v rY| jj}|j| j_z| j|}W || j_|S || j_w )a  
        Discretize the data.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Data to be discretized.

        Returns
        -------
        Xt : {ndarray, sparse matrix}, dtype={np.float32, np.float64}
            Data in the binned space. Will be a sparse matrix if
            `self.encode='onehot'` and ndarray otherwise.
        NTF)r^   r&   resetr   r<   right)Zsider   r   )r   r&   r?   rC   rD   r   rT   rL   rE   Zsearchsortedr#   rV   	transform)r+   rW   r&   ZXtrZ   r[   Z
dtype_initZXt_encr,   r,   r-   rl     s"   6



zKBinsDiscretizer.transformc                 C   s   t |  d| jv r| j|}t|dtjtjfd}| jj	d }|j	d |kr2t
d||j	d t|D ])}| j| }|dd |dd  d	 }||dd|f tj |dd|f< q6|S )
a  
        Transform discretized data back to original feature space.

        Note that this function does not regenerate the original data
        due to discretization rounding.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Transformed data in the binned space.

        Returns
        -------
        X_original : ndarray, dtype={np.float32, np.float64}
            Data in the original feature space.
        r   T)r^   r&   r   r   z8Incorrect number of features. Expecting {}, received {}.Nr<   r=   )r   r#   rV   inverse_transformr   r?   rC   rD   rU   rE   rJ   rg   rL   rT   ZastypeZint64)r+   rW   ZXinvrY   r[   rZ   Zbin_centersr,   r,   r-   rm     s    


*z"KBinsDiscretizer.inverse_transformc                 C   s.   t | d t| |}t| dr| j|S |S )a  Get output feature names.

        Parameters
        ----------
        input_features : array-like of str or None, default=None
            Input features.

            - If `input_features` is `None`, then `feature_names_in_` is
              used as feature names in. If `feature_names_in_` is not defined,
              then the following input feature names are generated:
              `["x0", "x1", ..., "x(n_features_in_ - 1)"]`.
            - If `input_features` is an array-like, then `input_features` must
              match `feature_names_in_` if `feature_names_in_` is defined.

        Returns
        -------
        feature_names_out : ndarray of str objects
            Transformed feature names.
        Zn_features_in_rV   )r   r   hasattrrV   get_feature_names_out)r+   Zinput_featuresr,   r,   r-   ro   
  s
   


z&KBinsDiscretizer.get_feature_names_out)r)   )NNr*   )rh   
__module____qualname____doc__r   r   r
   r	   typer?   rC   rD   r(   dict__annotations__r.   r   rQ   rF   rl   rm   ro   r,   r,   r,   r-   r      s<   
  ! ?''r   )rH   numbersr   numpyr?   baser   r   r   utilsr   Zutils._param_validationr   r	   r
   Zutils.statsr   r   Zutils.validationr   r   r   r   r   Z	_encodersr   r   r,   r,   r,   r-   <module>   s   