
    h                     :   d Z ddlZddlmZmZ ddlZddlmZm	Z	 ddl
mZ ddlmZmZ ddlmZ dd	lmZmZmZmZmZ dd
lmZmZmZ ddlmZmZ ddlmZm Z  ddl!m"Z" d Z#d Z$ eddgdgdd      d        Z%d Z& eddgdgdd      d        Z' eddgdgdgdgdd      dddd       Z( eddgdgdgdgdd      dddd       Z) G d de"e      Z* G d  d!e*      Z+ G d" d#e*      Z, G d$ d%e*      Z- G d& d'e*      Z. G d( d)e*      Z/ G d* d+e*      Z0y),zUnivariate features selection.    N)IntegralReal)specialstats)issparse   )BaseEstimator_fit_context)LabelBinarizer)as_float_arraycheck_array	check_X_y	safe_masksafe_sqr)Interval
StrOptionsvalidate_params)	row_normssafe_sparse_dot)check_is_fittedvalidate_data   )SelectorMixinc                     t        | d      } t        j                  | j                        j                  | t        j
                  |       <   | S )z
    Fixes Issue #1240: NaNs can't be properly compared, so change them to the
    smallest value of scores's dtype. -inf seems to be unreliable.
    T)copy)r   npfinfodtypeminisnan)scoress    m/var/www/html/eduruby.in/venv/lib/python3.12/site-packages/sklearn/feature_selection/_univariate_selection.py_clean_nansr#      s;     F.F!xx599F288FM    c                     t        |       }| D cg c]  }t        |       } }t        j                  | D cg c]  }|j                  d    c}      }t        j
                  |      }t        d | D              }| D cg c]'  }t        j                  |j                  d            ) }}t        |      dz  }|D cg c]  }|dz  	 }	}||t        |      z  z
  }
d}t        |       D ]  \  }}||	|   ||   z  z  } ||t        |      z  z  }|
|z
  }|dz
  }||z
  }|t        |      z  }|t        |      z  }t        j                  |dk(        d   }t        j                  |      d   j                  |j                  k7  r)|j                  rt        j                  d|z  t               ||z  }t        j                  |      j                         }t!        j"                  |||      }||fS c c}w c c}w c c}w c c}w )a  Perform a 1-way ANOVA.

    The one-way ANOVA tests the null hypothesis that 2 or more groups have
    the same population mean. The test is applied to samples from two or
    more groups, possibly with differing sizes.

    Read more in the :ref:`User Guide <univariate_feature_selection>`.

    Parameters
    ----------
    *args : {array-like, sparse matrix}
        Sample1, sample2... The sample measurements should be given as
        arguments.

    Returns
    -------
    f_statistic : float
        The computed F-value of the test.
    p_value : float
        The associated p-value from the F-distribution.

    Notes
    -----
    The ANOVA test has important assumptions that must be satisfied in order
    for the associated p-value to be valid.

    1. The samples are independent
    2. Each sample is from a normally distributed population
    3. The population standard deviations of the groups are all equal. This
       property is known as homoscedasticity.

    If these assumptions are not true for a given set of data, it may still be
    possible to use the Kruskal-Wallis H-test (`scipy.stats.kruskal`_) although
    with some loss of power.

    The algorithm is from Heiman[2], pp.394-7.

    See ``scipy.stats.f_oneway`` that should give the same results while
    being less efficient.

    References
    ----------
    .. [1] Lowry, Richard.  "Concepts and Applications of Inferential
           Statistics". Chapter 14.
           http://vassarstats.net/textbook

    .. [2] Heiman, G.W.  Research Methods in Statistics. 2002.
    r   c              3   R   K   | ]  }t        |      j                  d        ! yw)r   axisN)r   sum).0as     r"   	<genexpr>zf_oneway.<locals>.<genexpr>^   s     ;Xa[__!_,;s   %'r'   r           r   zFeatures %s are constant.)lenr   r   arrayshaper)   asarrayfloat	enumeratewherenonzerosizewarningswarnUserWarningravelr   fdtrc)args	n_classesr+   n_samples_per_class	n_samples
ss_alldata	sums_argssquare_of_sums_alldatassquare_of_sums_argssstotssbnk_sswndfbndfwnmsbmswconstant_features_idxfprobs                         r"   f_onewayrQ   )   s   b D	I'+,!N1,D,((#=1AGGAJ#=>*+I;d;;J489qAEEqEM*9I9 ^q0)23A1a433/%	2BBBED$ @1#A&)<Q)???@"U9%555D4<Dq=Dy D
t
C
t
CHHSCZ03	zz#q#((*/D/I/I14II;Wc	A


1A==tQ'Dd7N1 -#= :3s   G3G8?,G=?Hz
array-likezsparse matrix)XyTprefer_skip_nested_validationc           	          t        | |g d      \  } }t        j                  |      D cg c]  }| t        | ||k(            }}t	        | S c c}w )a  Compute the ANOVA F-value for the provided sample.

    Read more in the :ref:`User Guide <univariate_feature_selection>`.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features)
        The set of regressors that will be tested sequentially.

    y : array-like of shape (n_samples,)
        The target vector.

    Returns
    -------
    f_statistic : ndarray of shape (n_features,)
        F-statistic for each feature.

    p_values : ndarray of shape (n_features,)
        P-values associated with the F-statistic.

    See Also
    --------
    chi2 : Chi-squared stats of non-negative features for classification tasks.
    f_regression : F-value between label/feature for regression tasks.

    Examples
    --------
    >>> from sklearn.datasets import make_classification
    >>> from sklearn.feature_selection import f_classif
    >>> X, y = make_classification(
    ...     n_samples=100, n_features=10, n_informative=2, n_clusters_per_class=1,
    ...     shuffle=False, random_state=42
    ... )
    >>> f_statistic, p_values = f_classif(X, y)
    >>> f_statistic
    array([2.21e+02, 7.02e-01, 1.70e+00, 9.31e-01,
           5.41e+00, 3.25e-01, 4.71e-02, 5.72e-01,
           7.54e-01, 8.90e-02])
    >>> p_values
    array([7.14e-27, 4.04e-01, 1.96e-01, 3.37e-01,
           2.21e-02, 5.70e-01, 8.29e-01, 4.51e-01,
           3.87e-01, 7.66e-01])
    csrcsccooaccept_sparse)r   r   uniquer   rQ   )rR   rS   rG   r<   s       r"   	f_classifr^   v   sR    f Q)>?DAq-/YYq\:Ai16"#:D:T? ;s   Ac                 4   t        j                  | t         j                        } t        |       }| }||z  }|dz  }t        j                  d      5  ||z  }ddd       |j                  d      }|t        j                  |dz
  |      fS # 1 sw Y   6xY w)	zFast replacement for scipy.stats.chisquare.

    Version from https://github.com/scipy/scipy/pull/2525 with additional
    optimizations.
    r   r   ignore)invalidNr   r'   r   )r   r1   float64r.   errstater)   r   chdtrc)f_obsf_exprG   chisqs       r"   
_chisquareri      s     JJuBJJ/EE
AE	UNE	aKE	X	& II1IE'..Q... s   BBc                    t        | dt        j                  t        j                  f      } t        j                  t        |       r| j                  n| dk        rt        d      t        d      j                  |      }|j                  d   dk(  r+|j                         }t        j                  d|z
  |d      }t        |j                  |       }t        |      r|j                         }| j                  d      j!                  dd	      }|j#                  d      j!                  dd	      }t        j$                  |j                  |      }t'        ||      S )
a  Compute chi-squared stats between each non-negative feature and class.

    This score can be used to select the `n_features` features with the
    highest values for the test chi-squared statistic from X, which must
    contain only **non-negative integer feature values** such as booleans or frequencies
    (e.g., term counts in document classification), relative to the classes.

    If some of your features are continuous, you need to bin them, for
    example by using :class:`~sklearn.preprocessing.KBinsDiscretizer`.

    Recall that the chi-square test measures dependence between stochastic
    variables, so using this function "weeds out" the features that are the
    most likely to be independent of class and therefore irrelevant for
    classification.

    Read more in the :ref:`User Guide <univariate_feature_selection>`.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features)
        Sample vectors.

    y : array-like of shape (n_samples,)
        Target vector (class labels).

    Returns
    -------
    chi2 : ndarray of shape (n_features,)
        Chi2 statistics for each feature.

    p_values : ndarray of shape (n_features,)
        P-values for each feature.

    See Also
    --------
    f_classif : ANOVA F-value between label/feature for classification tasks.
    f_regression : F-value between label/feature for regression tasks.

    Notes
    -----
    Complexity of this algorithm is O(n_classes * n_features).

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.feature_selection import chi2
    >>> X = np.array([[1, 1, 3],
    ...               [0, 1, 5],
    ...               [5, 4, 1],
    ...               [6, 6, 2],
    ...               [1, 4, 0],
    ...               [0, 0, 0]])
    >>> y = np.array([1, 1, 0, 0, 2, 2])
    >>> chi2_stats, p_values = chi2(X, y)
    >>> chi2_stats
    array([15.3,  6.5       ,  8.9])
    >>> p_values
    array([0.000456, 0.0387, 0.0116 ])
    rX   r\   r   r   zInput X must be non-negative.T)sparse_outputr   r'   )r   r   rc   float32anyr   data
ValueErrorr   fit_transformr0   toarrayappendr   Tr)   reshapemeandotri   )rR   rS   Yobservedfeature_count
class_probexpecteds          r"   chi2r~      s   P 	AU2::rzz2JKA	vv!qvv!q01899 	T*88;AwwqzQIIKIIa!eQQ'qssA&H
 ##%EEqEM))!R0MQ''2.JvvjllM2Hh))r$   boolean)rR   rS   centerforce_finiter   r   c                   t        | |g dt        j                        \  } }| j                  d   }|r|t        j                  |      z
  }| j	                  d      }t        |t        j                        r|j                         n|}t        j                  t        | j                  d      ||dz  z  z
        }nt        | j                        }t        ||       }t        j                  dd	      5  ||z  }|t        j                  j                  |      z  }d
d
d
       |r=t        j                  |      j!                         st        j"                  |      }d||<   |S # 1 sw Y   JxY w)a  Compute Pearson's r for each features and the target.

    Pearson's r is also known as the Pearson correlation coefficient.

    Linear model for testing the individual effect of each of many regressors.
    This is a scoring function to be used in a feature selection procedure, not
    a free standing feature selection procedure.

    The cross correlation between each regressor and the target is computed
    as::

        E[(X[:, i] - mean(X[:, i])) * (y - mean(y))] / (std(X[:, i]) * std(y))

    For more on usage see the :ref:`User Guide <univariate_feature_selection>`.

    .. versionadded:: 1.0

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features)
        The data matrix.

    y : array-like of shape (n_samples,)
        The target vector.

    center : bool, default=True
        Whether or not to center the data matrix `X` and the target vector `y`.
        By default, `X` and `y` will be centered.

    force_finite : bool, default=True
        Whether or not to force the Pearson's R correlation to be finite.
        In the particular case where some features in `X` or the target `y`
        are constant, the Pearson's R correlation is not defined. When
        `force_finite=False`, a correlation of `np.nan` is returned to
        acknowledge this case. When `force_finite=True`, this value will be
        forced to a minimal correlation of `0.0`.

        .. versionadded:: 1.1

    Returns
    -------
    correlation_coefficient : ndarray of shape (n_features,)
        Pearson's R correlation coefficients of features.

    See Also
    --------
    f_regression: Univariate linear regression tests returning f-statistic
        and p-values.
    mutual_info_regression: Mutual information for a continuous target.
    f_classif: ANOVA F-value between label/feature for classification tasks.
    chi2: Chi-squared stats of non-negative features for classification tasks.

    Examples
    --------
    >>> from sklearn.datasets import make_regression
    >>> from sklearn.feature_selection import r_regression
    >>> X, y = make_regression(
    ...     n_samples=50, n_features=3, n_informative=1, noise=1e-4, random_state=42
    ... )
    >>> r_regression(X, y)
    array([-0.157,  1.        , -0.229])
    rW   rk   r   r'   T)squaredr   ra   dividerb   Nr-   )r   r   rc   r0   rw   
isinstancematrixgetA1sqrtr   ru   r   rd   linalgnormisfiniteallr    )	rR   rS   r   r   r?   X_meansX_normscorrelation_coefficientnan_masks	            r"   r_regressionr   $  s/   P Q)>bjjQDAq
I
 
N &&a&.%/%C'--/'')ACC6WaZ9OOPACC.-a3	Hh	7 57*299>>!#445 BKK(?@DDF 8834,/)""5 5s   6(E''E0c                   t        | |||      }|j                  |rdndz
  }|dz  }t        j                  dd      5  |d|z
  z  |z  }t        j
                  j                  |d|      }ddd       |rt        j                        j                         s`t        j                  |      }	t        j                  |j                        j                  ||	<   t        j                  |      }
d||
<   d|
<   fS # 1 sw Y   xY w)	aa  Univariate linear regression tests returning F-statistic and p-values.

    Quick linear model for testing the effect of a single regressor,
    sequentially for many regressors.

    This is done in 2 steps:

    1. The cross correlation between each regressor and the target is computed
       using :func:`r_regression` as::

           E[(X[:, i] - mean(X[:, i])) * (y - mean(y))] / (std(X[:, i]) * std(y))

    2. It is converted to an F score and then to a p-value.

    :func:`f_regression` is derived from :func:`r_regression` and will rank
    features in the same order if all the features are positively correlated
    with the target.

    Note however that contrary to :func:`f_regression`, :func:`r_regression`
    values lie in [-1, 1] and can thus be negative. :func:`f_regression` is
    therefore recommended as a feature selection criterion to identify
    potentially predictive feature for a downstream classifier, irrespective of
    the sign of the association with the target variable.

    Furthermore :func:`f_regression` returns p-values while
    :func:`r_regression` does not.

    Read more in the :ref:`User Guide <univariate_feature_selection>`.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features)
        The data matrix.

    y : array-like of shape (n_samples,)
        The target vector.

    center : bool, default=True
        Whether or not to center the data matrix `X` and the target vector `y`.
        By default, `X` and `y` will be centered.

    force_finite : bool, default=True
        Whether or not to force the F-statistics and associated p-values to
        be finite. There are two cases where the F-statistic is expected to not
        be finite:

        - when the target `y` or some features in `X` are constant. In this
          case, the Pearson's R correlation is not defined leading to obtain
          `np.nan` values in the F-statistic and p-value. When
          `force_finite=True`, the F-statistic is set to `0.0` and the
          associated p-value is set to `1.0`.
        - when a feature in `X` is perfectly correlated (or
          anti-correlated) with the target `y`. In this case, the F-statistic
          is expected to be `np.inf`. When `force_finite=True`, the F-statistic
          is set to `np.finfo(dtype).max` and the associated p-value is set to
          `0.0`.

        .. versionadded:: 1.1

    Returns
    -------
    f_statistic : ndarray of shape (n_features,)
        F-statistic for each feature.

    p_values : ndarray of shape (n_features,)
        P-values associated with the F-statistic.

    See Also
    --------
    r_regression: Pearson's R between label/feature for regression tasks.
    f_classif: ANOVA F-value between label/feature for classification tasks.
    chi2: Chi-squared stats of non-negative features for classification tasks.
    SelectKBest: Select features based on the k highest scores.
    SelectFpr: Select features based on a false positive rate test.
    SelectFdr: Select features based on an estimated false discovery rate.
    SelectFwe: Select features based on family-wise error rate.
    SelectPercentile: Select features based on percentile of the highest
        scores.

    Examples
    --------
    >>> from sklearn.datasets import make_regression
    >>> from sklearn.feature_selection import f_regression
    >>> X, y = make_regression(
    ...     n_samples=50, n_features=3, n_informative=1, noise=1e-4, random_state=42
    ... )
    >>> f_statistic, p_values = f_regression(X, y)
    >>> f_statistic
    array([1.21, 2.67e13, 2.66])
    >>> p_values
    array([0.276, 1.54e-283, 0.11])
    r   r   r   ra   r   Nr-   g      ?)r   r6   r   rd   r   rO   sfr   r   isinfr   r   maxr    )rR   rS   r   r   r   deg_of_freedomcorr_coef_squaredf_statisticp_valuesmask_infmask_nans              r"   f_regressionr     s    L +	1V, VVFq2N/2	Hh	7 >'1/@+@ANR77::k1n=> BKK488: 88K( "):): ; ? ?H 88K( #H   > >s   -C==Dc                   d     e Zd ZU dZdegiZeed<   d Z e	d      d
d       Z
d Z fd	Z xZS )_BaseFilterzInitialize the univariate feature selection.

    Parameters
    ----------
    score_func : callable
        Function taking two arrays X and y, and returning a pair of arrays
        (scores, pvalues) or a single array with scores.
    
score_func_parameter_constraintsc                     || _         y Nr   )selfr   s     r"   __init__z_BaseFilter.__init__  s	    $r$   TrT   c                    |t        | |ddg      }nt        | ||ddgd      \  }}| j                  ||       | j                  ||      }t        |t        t
        f      r4|\  | _        | _        t        j                  | j                        | _        n|| _        d| _        t        j                  | j                        | _        | S )a  Run score function on (X, y) and get the appropriate features.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The training input samples.

        y : array-like of shape (n_samples,) or None
            The target values (class labels in classification, real numbers in
            regression). If the selector is unsupervised then `y` can be set to `None`.

        Returns
        -------
        self : object
            Returns the instance itself.
        NrX   rY   r[   T)r\   multi_output)
r   _check_paramsr   r   listtuplescores_pvalues_r   r1   )r   rR   rS   score_func_rets       r"   fitz_BaseFilter.fit  s    $ 9dAeU^DA a5%.tDAq 	1a A.ntUm4*8'DL$-JJt}}5DM)DL DMzz$,,/r$   c                      y r    r   rR   rS   s      r"   r   z_BaseFilter._check_paramsD  s    r$   c                 h    t         |          }d|j                  _        d|j                  _        |S )NT)super__sklearn_tags__target_tagsrequired
input_tagssparser   tags	__class__s     r"   r   z_BaseFilter.__sklearn_tags__G  s/    w')$(!!%r$   r   )__name__
__module____qualname____doc__callabler   dict__annotations__r   r
   r   r   r   __classcell__r   s   @r"   r   r     sJ     %18*#=D=% 5# 6#J r$   r   c                        e Zd ZU dZi ej
                  d eeddd      giZee	d<   e
fdd	 fd
Zd Z fdZ xZS )SelectPercentilea	  Select features according to a percentile of the highest scores.

    Read more in the :ref:`User Guide <univariate_feature_selection>`.

    Parameters
    ----------
    score_func : callable, default=f_classif
        Function taking two arrays X and y, and returning a pair of arrays
        (scores, pvalues) or a single array with scores.
        Default is f_classif (see below "See Also"). The default function only
        works with classification tasks.

        .. versionadded:: 0.18

    percentile : int, default=10
        Percent of features to keep.

    Attributes
    ----------
    scores_ : array-like of shape (n_features,)
        Scores of features.

    pvalues_ : array-like of shape (n_features,)
        p-values of feature scores, None if `score_func` returned only scores.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    See Also
    --------
    f_classif : ANOVA F-value between label/feature for classification tasks.
    mutual_info_classif : Mutual information for a discrete target.
    chi2 : Chi-squared stats of non-negative features for classification tasks.
    f_regression : F-value between label/feature for regression tasks.
    mutual_info_regression : Mutual information for a continuous target.
    SelectKBest : Select features based on the k highest scores.
    SelectFpr : Select features based on a false positive rate test.
    SelectFdr : Select features based on an estimated false discovery rate.
    SelectFwe : Select features based on family-wise error rate.
    GenericUnivariateSelect : Univariate feature selector with configurable
        mode.

    Notes
    -----
    Ties between features with equal scores will be broken in an unspecified
    way.

    This filter supports unsupervised feature selection that only requests `X` for
    computing the scores.

    Examples
    --------
    >>> from sklearn.datasets import load_digits
    >>> from sklearn.feature_selection import SelectPercentile, chi2
    >>> X, y = load_digits(return_X_y=True)
    >>> X.shape
    (1797, 64)
    >>> X_new = SelectPercentile(chi2, percentile=10).fit_transform(X, y)
    >>> X_new.shape
    (1797, 7)
    
percentiler   d   bothclosedr   
   )r   c                4    t         |   |       || _        y Nr   )r   r   r   )r   r   r   r   s      r"   r   zSelectPercentile.__init__  s    J/$r$   c                 T   t        |        | j                  dk(  r.t        j                  t	        | j
                        t              S | j                  dk(  r.t        j                  t	        | j
                        t              S t        | j
                        }t        j                  |d| j                  z
        }||kD  }t        j                  ||k(        d   }t	        |      r?t        t	        |      | j                  z  dz        }|d ||j                         z
   }d||<   |S )Nr   r`   r   T)r   r   r   onesr.   r   boolzerosr#   r4   intr)   )r   r!   	thresholdmaskties	max_feats	kept_tiess          r"   _get_support_maskz"SelectPercentile._get_support_mask  s     ??c!773t||,D99__!88C-T::T\\*MM&#*?@		!xx)+,Q/t9CK$//9C?@I5y488:56I"DOr$   c                 F    t         |          }d|j                  _        |S NFr   r   r   r   r   s     r"   r   z!SelectPercentile.__sklearn_tags__  #    w')$)!r$   )r   r   r   r   r   r   r   r   r   r   r^   r   r   r   r   r   s   @r"   r   r   Q  s_    DL$

,
,$xaV<=$D 
 #, %2 %& r$   r   c            	            e Zd ZU dZi ej
                  d edh       eeddd      giZe	e
d<   efd	d
 fdZd Zd Z fdZ xZS )SelectKBesta	  Select features according to the k highest scores.

    Read more in the :ref:`User Guide <univariate_feature_selection>`.

    Parameters
    ----------
    score_func : callable, default=f_classif
        Function taking two arrays X and y, and returning a pair of arrays
        (scores, pvalues) or a single array with scores.
        Default is f_classif (see below "See Also"). The default function only
        works with classification tasks.

        .. versionadded:: 0.18

    k : int or "all", default=10
        Number of top features to select.
        The "all" option bypasses selection, for use in a parameter search.

    Attributes
    ----------
    scores_ : array-like of shape (n_features,)
        Scores of features.

    pvalues_ : array-like of shape (n_features,)
        p-values of feature scores, None if `score_func` returned only scores.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    See Also
    --------
    f_classif: ANOVA F-value between label/feature for classification tasks.
    mutual_info_classif: Mutual information for a discrete target.
    chi2: Chi-squared stats of non-negative features for classification tasks.
    f_regression: F-value between label/feature for regression tasks.
    mutual_info_regression: Mutual information for a continuous target.
    SelectPercentile: Select features based on percentile of the highest
        scores.
    SelectFpr : Select features based on a false positive rate test.
    SelectFdr : Select features based on an estimated false discovery rate.
    SelectFwe : Select features based on family-wise error rate.
    GenericUnivariateSelect : Univariate feature selector with configurable
        mode.

    Notes
    -----
    Ties between features with equal scores will be broken in an unspecified
    way.

    This filter supports unsupervised feature selection that only requests `X` for
    computing the scores.

    Examples
    --------
    >>> from sklearn.datasets import load_digits
    >>> from sklearn.feature_selection import SelectKBest, chi2
    >>> X, y = load_digits(return_X_y=True)
    >>> X.shape
    (1797, 64)
    >>> X_new = SelectKBest(chi2, k=20).fit_transform(X, y)
    >>> X_new.shape
    (1797, 20)
    rG   r   r   Nleftr   r   r   )rG   c                4    t         |   |       || _        y r   )r   r   rG   )r   r   rG   r   s      r"   r   zSelectKBest.__init__  s    J/r$   c                     t        | j                  t              sQ| j                  |j                  d   kD  r4t	        j
                  d| j                   d|j                  d    d       y y y )Nr   zk=z is greater than n_features=z$. All the features will be returned.)r   rG   strr0   r7   r8   r   s      r"   r   zSelectKBest._check_params  s\    $&&#&466AGGAJ+>MMTVVH8 E5 5 ,?&r$   c                    t        |        | j                  dk(  r/t        j                  | j                  j
                  t              S | j                  dk(  r/t        j                  | j                  j
                  t              S t        | j                        }t        j                  |j
                  t              }d|t        j                  |d      | j                   d  <   |S )Nr   r`   r   r   	mergesort)kind)
r   rG   r   r   r   r0   r   r   r#   argsort)r   r!   r   s      r"   r   zSelectKBest._get_support_mask  s    66U?774<<--T::VVq[88DLL..d;; .F88FLL5D EFDF5tvvgi@AKr$   c                 F    t         |          }d|j                  _        |S r   r   r   s     r"   r   zSelectKBest.__sklearn_tags__#  r   r$   )r   r   r   r   r   r   r   r   r   r   r   r^   r   r   r   r   r   r   s   @r"   r   r     sm    FP$

,
,$j%!8Haf#MN$D 
 #, "   r$   r   c                   x     e Zd ZU dZi ej
                  d eeddd      giZee	d<   e
fdd	 fd
Zd Z xZS )	SelectFpra  Filter: Select the pvalues below alpha based on a FPR test.

    FPR test stands for False Positive Rate test. It controls the total
    amount of false detections.

    Read more in the :ref:`User Guide <univariate_feature_selection>`.

    Parameters
    ----------
    score_func : callable, default=f_classif
        Function taking two arrays X and y, and returning a pair of arrays
        (scores, pvalues).
        Default is f_classif (see below "See Also"). The default function only
        works with classification tasks.

    alpha : float, default=5e-2
        Features with p-values less than `alpha` are selected.

    Attributes
    ----------
    scores_ : array-like of shape (n_features,)
        Scores of features.

    pvalues_ : array-like of shape (n_features,)
        p-values of feature scores.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    See Also
    --------
    f_classif : ANOVA F-value between label/feature for classification tasks.
    chi2 : Chi-squared stats of non-negative features for classification tasks.
    mutual_info_classif: Mutual information for a discrete target.
    f_regression : F-value between label/feature for regression tasks.
    mutual_info_regression : Mutual information for a continuous target.
    SelectPercentile : Select features based on percentile of the highest
        scores.
    SelectKBest : Select features based on the k highest scores.
    SelectFdr : Select features based on an estimated false discovery rate.
    SelectFwe : Select features based on family-wise error rate.
    GenericUnivariateSelect : Univariate feature selector with configurable
        mode.

    Examples
    --------
    >>> from sklearn.datasets import load_breast_cancer
    >>> from sklearn.feature_selection import SelectFpr, chi2
    >>> X, y = load_breast_cancer(return_X_y=True)
    >>> X.shape
    (569, 30)
    >>> X_new = SelectFpr(chi2, alpha=0.01).fit_transform(X, y)
    >>> X_new.shape
    (569, 16)
    alphar   r   r   r   r   皙?r   c                4    t         |   |       || _        y r   r   r   r   r   r   r   r   s      r"   r   zSelectFpr.__init__o      J/
r$   c                 J    t        |        | j                  | j                  k  S r   )r   r   r   r   s    r"   r   zSelectFpr._get_support_masks  s    }}tzz))r$   r   r   r   r   r   r   r   r   r   r   r^   r   r   r   r   s   @r"   r   r   )  sT    >@$

,
,$(4Af56$D 
 #, d *r$   r   c                   x     e Zd ZU dZi ej
                  d eeddd      giZee	d<   e
fdd	 fd
Zd Z xZS )	SelectFdra6	  Filter: Select the p-values for an estimated false discovery rate.

    This uses the Benjamini-Hochberg procedure. ``alpha`` is an upper bound
    on the expected false discovery rate.

    Read more in the :ref:`User Guide <univariate_feature_selection>`.

    Parameters
    ----------
    score_func : callable, default=f_classif
        Function taking two arrays X and y, and returning a pair of arrays
        (scores, pvalues).
        Default is f_classif (see below "See Also"). The default function only
        works with classification tasks.

    alpha : float, default=5e-2
        The highest uncorrected p-value for features to keep.

    Attributes
    ----------
    scores_ : array-like of shape (n_features,)
        Scores of features.

    pvalues_ : array-like of shape (n_features,)
        p-values of feature scores.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    See Also
    --------
    f_classif : ANOVA F-value between label/feature for classification tasks.
    mutual_info_classif : Mutual information for a discrete target.
    chi2 : Chi-squared stats of non-negative features for classification tasks.
    f_regression : F-value between label/feature for regression tasks.
    mutual_info_regression : Mutual information for a continuous target.
    SelectPercentile : Select features based on percentile of the highest
        scores.
    SelectKBest : Select features based on the k highest scores.
    SelectFpr : Select features based on a false positive rate test.
    SelectFwe : Select features based on family-wise error rate.
    GenericUnivariateSelect : Univariate feature selector with configurable
        mode.

    References
    ----------
    https://en.wikipedia.org/wiki/False_discovery_rate

    Examples
    --------
    >>> from sklearn.datasets import load_breast_cancer
    >>> from sklearn.feature_selection import SelectFdr, chi2
    >>> X, y = load_breast_cancer(return_X_y=True)
    >>> X.shape
    (569, 30)
    >>> X_new = SelectFdr(chi2, alpha=0.01).fit_transform(X, y)
    >>> X_new.shape
    (569, 16)
    r   r   r   r   r   r   r   r   c                4    t         |   |       || _        y r   r   r   s      r"   r   zSelectFdr.__init__  r   r$   c                    t        |        t        | j                        }t        j                  | j                        }||t        | j                        |z  t        j                  d|dz         z  k     }|j                  dk(  r%t        j                  | j                  t              S | j                  |j                         k  S )Nr   r   r`   )r   r.   r   r   sortr2   r   aranger6   
zeros_liker   r   )r   
n_featuressvselecteds       r"   r   zSelectFdr._get_support_mask  s    '
WWT]]#%

#j0299Q
Q3OOO
 ==A==d;;}}..r$   r   r   s   @r"   r   r   y  sU    BH$

,
,$(4Af56$D 
 #, d 
/r$   r   c                   x     e Zd ZU dZi ej
                  d eeddd      giZee	d<   e
fdd	 fd
Zd Z xZS )	SelectFwea  Filter: Select the p-values corresponding to Family-wise error rate.

    Read more in the :ref:`User Guide <univariate_feature_selection>`.

    Parameters
    ----------
    score_func : callable, default=f_classif
        Function taking two arrays X and y, and returning a pair of arrays
        (scores, pvalues).
        Default is f_classif (see below "See Also"). The default function only
        works with classification tasks.

    alpha : float, default=5e-2
        The highest uncorrected p-value for features to keep.

    Attributes
    ----------
    scores_ : array-like of shape (n_features,)
        Scores of features.

    pvalues_ : array-like of shape (n_features,)
        p-values of feature scores.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    See Also
    --------
    f_classif : ANOVA F-value between label/feature for classification tasks.
    chi2 : Chi-squared stats of non-negative features for classification tasks.
    f_regression : F-value between label/feature for regression tasks.
    SelectPercentile : Select features based on percentile of the highest
        scores.
    SelectKBest : Select features based on the k highest scores.
    SelectFpr : Select features based on a false positive rate test.
    SelectFdr : Select features based on an estimated false discovery rate.
    GenericUnivariateSelect : Univariate feature selector with configurable
        mode.

    Examples
    --------
    >>> from sklearn.datasets import load_breast_cancer
    >>> from sklearn.feature_selection import SelectFwe, chi2
    >>> X, y = load_breast_cancer(return_X_y=True)
    >>> X.shape
    (569, 30)
    >>> X_new = SelectFwe(chi2, alpha=0.01).fit_transform(X, y)
    >>> X_new.shape
    (569, 15)
    r   r   r   r   r   r   r   r   c                4    t         |   |       || _        y r   r   r   s      r"   r   zSelectFwe.__init__  r   r$   c                 v    t        |        | j                  | j                  t        | j                        z  k  S r   )r   r   r   r.   r   s    r"   r   zSelectFwe._get_support_mask  s+    }}tzzC,>>>>r$   r   r   s   @r"   r  r    sT    9v$

,
,$(4Af56$D 
 #, d ?r$   r  c                        e Zd ZU dZeeeeedZ	e
ed<   i ej                   e ee	j!                                     g eeddd       edh      gd	Ze
ed
<   efddd	 fdZd Z fdZd Zd Z xZS )GenericUnivariateSelecta	  Univariate feature selector with configurable strategy.

    Read more in the :ref:`User Guide <univariate_feature_selection>`.

    Parameters
    ----------
    score_func : callable, default=f_classif
        Function taking two arrays X and y, and returning a pair of arrays
        (scores, pvalues). For modes 'percentile' or 'kbest' it can return
        a single array scores.

    mode : {'percentile', 'k_best', 'fpr', 'fdr', 'fwe'}, default='percentile'
        Feature selection mode. Note that the `'percentile'` and `'kbest'`
        modes are supporting unsupervised feature selection (when `y` is `None`).

    param : "all", float or int, default=1e-5
        Parameter of the corresponding mode.

    Attributes
    ----------
    scores_ : array-like of shape (n_features,)
        Scores of features.

    pvalues_ : array-like of shape (n_features,)
        p-values of feature scores, None if `score_func` returned scores only.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    See Also
    --------
    f_classif : ANOVA F-value between label/feature for classification tasks.
    mutual_info_classif : Mutual information for a discrete target.
    chi2 : Chi-squared stats of non-negative features for classification tasks.
    f_regression : F-value between label/feature for regression tasks.
    mutual_info_regression : Mutual information for a continuous target.
    SelectPercentile : Select features based on percentile of the highest
        scores.
    SelectKBest : Select features based on the k highest scores.
    SelectFpr : Select features based on a false positive rate test.
    SelectFdr : Select features based on an estimated false discovery rate.
    SelectFwe : Select features based on family-wise error rate.

    Examples
    --------
    >>> from sklearn.datasets import load_breast_cancer
    >>> from sklearn.feature_selection import GenericUnivariateSelect, chi2
    >>> X, y = load_breast_cancer(return_X_y=True)
    >>> X.shape
    (569, 30)
    >>> transformer = GenericUnivariateSelect(chi2, mode='k_best', param=20)
    >>> X_new = transformer.fit_transform(X, y)
    >>> X_new.shape
    (569, 20)
    )r   k_bestfprfdrfwe_selection_modesr   Nr   r   r   )modeparamr   r   gh㈵>c                B    t         |   |       || _        || _        y r   )r   r   r  r  )r   r   r  r  r   s       r"   r   z GenericUnivariateSelect.__init__u  s!    J/	
r$   c                      | j                   | j                     | j                        }|j                         }|j	                  d        |j
                  di |d   | j                  i |S )Nr   r   r   r   )r  r  r   _get_param_namesremove
set_paramsr  )r   selectorpossible_paramss      r"   _make_selectorz&GenericUnivariateSelect._make_selectorz  se    34((3tO #335|,?q14::>?r$   c                 J    t         |          }ddg|j                  _        |S )Nrc   rn   )r   r   transformer_tagspreserves_dtyper   s     r"   r   z(GenericUnivariateSelect.__sklearn_tags__  s(    w')1:I0F-r$   c                 D    | j                         j                  ||       y r   )r  r   r   s      r"   r   z%GenericUnivariateSelect._check_params  s    ++Aq1r$   c                     t        |        | j                         }| j                  |_        | j                  |_        |j	                         S r   )r   r  r   r   r   )r   r  s     r"   r   z)GenericUnivariateSelect._get_support_mask  s?    &&( MM<<))++r$   )r   r   r   r   r   r   r   r   r  r  r   r   r   r   r   setkeysr   r   r^   r   r  r   r   r   r   r   s   @r"   r  r  &  s    >B 'd $

,
,$C 0 5 5 789:4D8*eW:MN$D  #, \ 
	
2,r$   r  )1r   r7   numbersr   r   numpyr   scipyr   r   scipy.sparser   baser	   r
   preprocessingr   utilsr   r   r   r   r   utils._param_validationr   r   r   utils.extmathr   r   utils.validationr   r   _baser   r#   rQ   r^   ri   r~   r   r   r   r   r   r   r   r  r  r   r$   r"   <module>r(     s   $
  "    ! . * O O K K 6 =  	&JZ O,^ #'..b/& O,^ #'Y*Y*x O,^+"	 #' "&D ]#]#@ O,^+"	 #' "&D r!r!r<- <Df{ fRl+ l^M* M*`X/ X/vH? H?dm,k m,r$   