o
    i6                    @   s  d dl Z d dlZd dlZd dlZd dlmZ d dlmZ d dl	m
Z
mZ d dlmZ d dlmZmZmZ d dlmZ dd	 Zejd
g ddd Zejd
g ddd Zejdejejejgejdejejejgdd Zejdejejejgdd Zdd Zdd Zdd Z dd Z!dd Z"ejjd g d!g d"ge#g d#g d$gej#g d%g d&ge$d'ej#g d(d)ej%d*gge$d'ej#g d(d)e&d+d*gge$d'ej#g d,g d-ge$d'ej#g d.d)ej%dgge$d'ej#g d.d)e&d+dgge$d'gg d/d0d1d2 Z'ejd
g dejd3d4d5gejd6dd7gd8d9 Z(ejd3d4d5gejd:d;d<gd=d<gd;d<ggg d>g d?g d>gfd@d)gdAd)gdBdCgdAd)ggg dDg dEg dFgfgdGdH Z)dIdJ Z*ejd6g dKejdLg dKdMdN Z+ejdOdPdQgejd d=d;ge#dRdSggdTdU Z,ejdOdPdQgdVdW Z-ejjdXdYd<gdZd<ggdYdZgd<ggej.fe#d=d;gd[d;ggd=d[gd;ggej/fej#d\d*gd]d*gge$d'd\d]gd*ggej.fe#d\d*gd]d*ggd\d]gd*ggej0fe#d=d;gej%d;ggd=ej%gd;ggejfej#d\ej%gdej%gge$d'd\dgej%ggej.fej#d\e&d+gde&d+gge$d'd\dge&d+ggej.fgg d^d0d_d` Z1ejd
g dejjdaej#d)dCgge$d'j2ej#d)dbgge$d'j2g dcgej.fej#d=d;ggddd'j2ej#d=deggddd'j2g dfgej3fej#d)dCgge$d'j2ej#d)dbgge$d'j2e#g dcgej.fej#dd)gge$d'j2ej#ddCgge$d'j2g dgge$fej#d)dCgge$d'j2ej#d)ej%gge$d'j2g dhge$fej#d)dgge$d'j2ej#d)ej%gge$d'j2g dige$fgg djd0dkdl Z4dmdn Z5ejdoe
egdpdq Z6drds Z7dtdu Z8ejjdvd7dwdxgfdyg dzfg d{d|d}gfgg d~d0dd Z9dd Z:ejjd g d"g d!ge#g dg dgej#g d&g d%ge$d'gg dd0dd Z;ejjdaej#d)dCgge$d'j2ej#d)dbgge$d'j2g dcgej.fej#d=d;ggddd'j2ej#d=deggddd'j2g dfgej3fej#d)dCgge$d'j2ej#d)dbgge$d'j2e#g dcgej.fgg dd0dd Z<dd Z=dd Z>ejde&e?gdd Z@dd ZAdd ZBdd ZCdd ZDdd ZEdd ZFejd6dyd7gdd ZGejdej%de&d+gdd ZHejd6dYd[gg dgdd ZIejjdd5d4gddgd0ejjd6d7g dgd7dgd0dd ZJejdoe
egdd ZKejddd;iddiddid;dddeddgejddg dggdd ZLejd6dyd7dCggdd ZMejd6d)gdbggdd ZNejddd[iddiddiddiddid[dddeddgddń ZOejd6d7dCggddǄ ZPejd6d)gdbggddɄ ZQdd˄ ZRejdd[d=dddeigdd̈́ ZSddτ ZTddф ZUddӄ ZVddՄ ZWddׄ ZXejddd=dٜgddۄ ZYejdd;d[dٜgdd݄ ZZejdg dߢejdg ddd Z[dd Z\ejdej%dgdd Z]dd Z^ejd
g dejdddgdd Z_ejd
g ddd Z`ejd
g ddd Zaejd
g ddd Zbdd Zcdd Zdejdej%dgdd Zeejdddgejdej%dgdd Zfejjdaej#d)ej%gge$d'j2ej#d)dCgge$d'j2ej#d)dbej%ge$d'gej.fej#d)ej%gge$d'j2ej#d)dCgge$d'j2ej#d)dbej%ge$d'gej.fej#dej%ggejd'j2ej#dRggejd'j2e#ddSej%ggejfgg d d0dd Zgejdoe
egdd Zhejde#dej%dRggj2e#dej%dggj2e#dSggfe#g dgj2e#g d	gj2e#ej%ggfej#d
ej%dCgge$d'j2e#dej%dggj2ej#dbgge$d'fej#g dge$d'j2e#g dgj2ej#ej%gge$d'fgdd Ziejdedd Zjdd Zkejddd]ggej#dd]ggdd'ej#dd]ggdd'gejdd\d]ggej#d\d]ggdd'ej#d\d]ggdd'gdd Zldd Zmdd Zndd  Zoejd!d5d4gd"d# Zpejd$ej#d)gd%gge$d'd gej%gej%ggejqd%gdgdgge$d'fej#ej%gd%gd)gge$d'd gej%gej%ggejqd%gej%gej%gge$d'fgd&d' Zrd(d) Zsd*d+ Ztd,d- Zud.d/ Zvd0d1 Zwejddd[iddiddiddiddid[dddeddgd2d3 Zxd4d5 Zyd6d7 Zzd8d9 Z{d:d; Z|ejdddidd;igd<d= Z}ejddd=idd>igd?d@ Z~dAdB ZdCdD Zejdoe
egdEdF ZdS (G      N)sparse)NotFittedError)OneHotEncoderOrdinalEncoder)is_scalar_nan)_convert_containerassert_allcloseassert_array_equal)CSR_CONTAINERSc                  C   s   t g dg dg} t }tdd}|| }|| }|jdks$J |jdks+J t|s2J t|r9J t| g dg dg t| | d S )N         r   r   r   Fsparse_outputr      )              ?r   r   r   )r   r   r   r   r   )	nparrayr   fit_transformshaper   issparser	   toarray)XZ
enc_sparseZ	enc_denseX_trans_sparseZX_trans_dense r   x/var/www/html/eduruby.in/lip-sync/lip-sync-env/lib/python3.10/site-packages/sklearn/preprocessing/tests/test_encoders.py!test_one_hot_encoder_sparse_dense   s   


r    handle_unknown)ignoreinfrequent_if_existwarnc                 C   s   t g dg dg dg}t g dg}tdd}|| tjtdd || W d    n1 s7w   Y  t| d}|| | }t	||
 t g d	g t|| d S )
N)r   r   r   )r   r   r   )r   r   r   )   r   r   errorr!   Found unknown categoriesmatch)r   r   r   r   r   r   r   )r   r   r   fitpytestraises
ValueError	transformcopyr	   r   r   r!   r   X2ohZ	X2_passedr   r   r   #test_one_hot_encoder_handle_unknown*   s   



r4   c                 C   sx   t g dd}t ddgd}t| d}|| | }t|| t g dg dg t|| d S )N)Z1111111122Z333Z4444)r   Z55555r5   r'   )r   r   r   r   r   r   r   r   )	r   r   reshaper   r+   r0   r	   r/   r   r1   r   r   r   +test_one_hot_encoder_handle_unknown_stringsB   s   

r9   output_dtypeinput_dtypec                 C   s   t jddgg| dj}t jddgddgg|d}td|d}t|| | t||| | td|dd}t||| t|||| d S )Nr   r   dtypeauto)
categoriesr=   F)r?   r=   r   )	r   asarrayTr   r	   r   r   r+   r/   )r;   r:   r   
X_expectedr3   r   r   r   test_one_hot_encoder_dtypeU   s   rC   c                 C   s   t d}|ddgddgd}tjg dg dg| d	}t| d	}t|| | t|	|
| | t| d
d}t||| t|	|
|| d S )Npandasabr   r   ABr   r   r   r   r   r   r   r   r<   F)r=   r   )r,   importorskip	DataFramer   r   r   r	   r   r   r+   r/   )r:   pdX_dfrB   r3   r   r   r   !test_one_hot_encoder_dtype_pandasd   s   

rP   c                  C   s   t  } g dg dg dg dg}| | |  }tg d| | g d}tg d| tjtdd	 | d
dg W d    d S 1 sIw   Y  d S )N)Maler   girlr   r   )Female)   rR   r   
   )rQ   3   boy   r   )rQ   [   rR         )Z	x0_FemaleZx0_MaleZx1_1Zx1_41Zx1_51Zx1_91Zx2_boyZx2_girlZx3_1Zx3_2Zx3_12Zx3_21Zx4_3Zx4_10Zx4_30)onetwothreefourfive)Z
one_FemaleZone_MaleZtwo_1Ztwo_41Ztwo_51Ztwo_91Z	three_boyZ
three_girlZfour_1Zfour_2Zfour_12Zfour_21Zfive_3Zfive_10Zfive_30z!input_features should have lengthr)   r\   r]   )r   r+   get_feature_names_outr	   r,   r-   r.   )encr   feature_namesZfeature_names2r   r   r   "test_one_hot_encoder_feature_namest   s(   
"rd   c                  C   s\   t  } tjddggtdj}| | |  }tddg| | jdgd}tdd	g| d S )
Nu   c❤t1Zdat2r<   u	   x0_c❤t1Zx0_dat2u   n👍meZinput_featuresu   n👍me_c❤t1u   n👍me_dat2)r   r   r   objectrA   r+   ra   r	   )rb   r   rc   r   r   r   *test_one_hot_encoder_feature_names_unicode   s   
rg   c                  C   s   dd } t | d}tjddggtdj}|| | }tddg| |jd	gd
}tddg| dd }t |d|}d}tj	t
|d |  W d   dS 1 sWw   Y  dS )z=Check the behaviour of `feature_name_combiner` as a callable.c                 S   s   | d t | S )N_)reprfeaturecategoryr   r   r   name_combiner   s   zHtest_one_hot_encoder_custom_feature_name_combiner.<locals>.name_combiner)Zfeature_name_combinerNoneNr<   z	x0_'None'Zx0_NonerE   re   za_'None'Za_Nonec                 S   s   dS )Nr   r   rj   r   r   r   wrong_combiner   s   zItest_one_hot_encoder_custom_feature_name_combiner.<locals>.wrong_combinerzMWhen `feature_name_combiner` is a callable, it should return a Python string.r)   )r   r   r   rf   rA   r+   ra   r	   r,   r-   	TypeError)rm   rb   r   rc   ro   err_msgr   r   r   1test_one_hot_encoder_custom_feature_name_combiner   s   


"rr   c                  C   s   t ddggj} t }|jg dgd | d g dgks"J ||  jdks.J |jg dgd ||  jdksCJ d S )	Nr   r   )r   r   r   r   r?   r?   )r   r%   )r   r   r   r   r%   r   )	r   r   rA   r   
set_params
get_paramsr   r   r   )r   r3   r   r   r   test_one_hot_encoder_set_params   s   rv   c                 C   sX   t dd}|| }t ddd}|| }t| | t|r&|jdks(J | S )Nr>   rs   Fr?   r   Zcsr)r   r   r   r   r   r   format)r   rb   ZXtr1ZXtr2r   r   r   check_categorical_onehot   s   


ry   r   defr   7   abcr   r|   )rU   r   r|   )r   r   r|   )rF   rH   cat)rE   rI   r   r<   )rF   r   r   rE   r   nan)Nr   r   )rE   r   r   )Nr   N)mixednumericrf   z	mixed-nanzmixed-float-nanz
mixed-Nonezmixed-None-nanzmixed-None-float-nan)Zidsc                 C   s   t t| d d dgf }t|ddgddgg t t| d d ddgf }t|g dg dg tdd| }t| g dg dg d S )	Nr   r   )r   r   r   r   r   r   r   r   r>   rs   )r   r   r   r   r   )r   r   r   r   r   )ry   r   r   r   r   r   r   )r   Xtrr   r   r   test_one_hot_encoder   s   r   sparse_FTdropfirstc                 C   s  g dg dg dg}t ||d}||}tj|td}t||| ddgddgd	dgg}t |d
|d}||}t|}t||| |d u rg dg dg dg}t || ddgddgg dgd}||}tj|td}d |d< t||| ddgddgd	dgg}t |ddgddgg| d}||}tj|td}d |d< d |d d df< t||| tg dg dg}td}t	j
t|d || W d    d S 1 sw   Y  d S )Nr}   rz   )r~   r   r|   r   r   r<   r   r|   r   r   r>   )r   r?   r   r~   r{   )6   r|   8   )r   r!   r?   )r   r   r   r   )r   r?   r!   r   r   r   r   r   r   )Shape of the passed X data is not correctr)   )r   r   r   r   rf   r	   inverse_transformreescaper,   r-   r.   )r!   r   r   r   rb   X_trexpmsgr   r   r   test_one_hot_encoder_inverse  sJ   





"r   z
X, X_transr   r|   r   r   r   r   r   r\   r]   r^   rF   r   r   r   r   r   )r   r   r   r   r   )r   r   r   r   r   c                 C   s`   t |d| }d}|rt|d}tjt|d || W d   dS 1 s)w   Y  dS )zCheck that `inverse_transform` raise an error with unknown samples, no
    dropped feature, and `handle_unknow="error`.
    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/14934
    r   zqSamples \[(\d )*\d\] can not be inverted when drop=None and handle_unknown='error' because they contain all zerosr   r)   N)r   r+   r   r,   r-   r.   r   )r   X_transr   rb   r   r   r   r   ?test_one_hot_encoder_inverse_transform_raise_error_with_unknownA  s   
"r   c                  C   sJ   t jddgddgddggtd} tddd	}|| }t|||  d S )
NrQ   r   rS   r   r   r<   	if_binaryFr   r   )r   r   rf   r   r   r	   r   )r   oher   r   r   r   &test_one_hot_encoder_inverse_if_binarya  s    
r   )r   r   N
reset_dropc                 C   s   t jddgddgddggtd}t| dd}|| ||}| }|j|d	 t|	|| t
||| t| | d S )
NrQ   r   rS   r   r   r<   Fr   r   )r   r   rf   r   r+   r/   ra   rt   r	   r   r   )r   r   r   r   r   rc   r   r   r   test_one_hot_encoder_drop_reseth  s    

r   methodr+   r         @      @c                 C   sL   t  }d}tjt|d t|||  W d    d S 1 sw   Y  d S )Nz'Expected 2D array, got 1D array insteadr)   )r   r,   r-   r.   getattr)r   r   r3   r   r   r   r   test_X_is_not_1Dw  s
   "r   c                 C   sp   t d}|g d}t }dt| d}t jt|d t|| | W d    d S 1 s1w   Y  d S )NrD   )   r   r%   r   z+Expected a 2-dimensional container but got z	 instead.r)   )r,   rL   Seriesr   typer-   r.   r   )r   rN   r   r3   r   r   r   r   test_X_is_not_1D_pandas  s   
"r   zX, cat_exp, cat_dtyper~   r{   r   rH   rI   )r   r   rf   stringzmissing-floatzmissing-np.nan-objectzmissing-float-nan-objectc                 C   s   | | d d d fD ]Q}t dd}|| t|jtsJ t|j|D ]6\}}| }t|d rHt|d s9J |d d |d d ksGJ n| |ksPJ t	|j
|sYJ q#q	d S )Nr6   r>   rs   )r   r+   
isinstancecategories_listziptolistr   r   
issubdtyper=   )r   Zcat_exp	cat_dtypeXirb   resr   Zres_listr   r   r   test_one_hot_encoder_categories  s   #

r   zX, X2, cats, cat_dtypedrE   rF   cint64r%   r   r   r   )NrE   z)rE   rF   r   )rE   Nr   )rf   r   zobject-stringzobject-string-nonezobject-string-nanzobject-None-and-nanc                 C   s  t |d}tg dg dg}t||  | t|jd t|d ks)J |jd 	 t|d ks8J |jd j
|ksBJ t |d}tjtdd || W d    n1 s^w   Y  t ||d}tg dg dg}t||| | d S )	Nrs   r   r   r   r   r   r   r   r(   r)   r?   r!   )r   r   r   )r   r   r   r	   r   r   r   r?   r   r   r=   r,   r-   r.   r+   r/   )r   r2   catsr   r!   rb   r   r   r   r   )test_one_hot_encoder_specified_categories  s   
3
r   c                  C   s  t jddggtdj} tg dgd}t g dg dg}t|| |  | t|	|  | |j
d  g dksBJ t |j
d jt jsOJ t d	d
ggj} tg dgd}d}tjt|d |	|  W d    d S 1 szw   Y  d S )NrE   rF   r<   )rF   rE   r   rs   r   r   r   r   r   )r   r   r   z%Unsorted categories are not supportedr)   )r   r   rf   rA   r   r	   r+   r/   r   r   r   r   r   r=   object_r,   r-   r.   )r   rb   r   r   r   r   r   (test_one_hot_encoder_unsorted_categories  s   "r   Encoderc                 C   sr   t dt jdgg}| |d}t jddggtdj}tjtdd || W d   dS 1 s2w   Y  dS )zTest encoder for specified categories that nan is at the end.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/27088
    r   r   rs   r<   zNan should be the last elementr)   N)	r   r   r   rf   rA   r,   r-   r.   r+   r   r   rb   r   r   r   r   ,test_encoder_nan_ending_specified_categories  s   
"r   c                  C   s   t jddgddggtdj} tg dg dgd}t g d	g d
g}t||  | |jd 	 g dks;J t 
|jd jt jsHJ |jd 	 g dksUJ t 
|jd jt jsbJ d S )NrE   rF   r   r   r<   r   )r   r   r   rs   )r   r   r   r   r   r   )r   r   r   r   r   r   r   )r   r   rf   rA   r   r	   r   r   r   r   r   r=   r   r   rb   r   r   r   r   7test_one_hot_encoder_specified_categories_mixed_columns$  s   r   c                  C   sD   t d} | ddgddgd}t|}t|g dg dg d S )	NrD   rE   rF   r   r   rG   rJ   rK   )r,   rL   rM   ry   r   )rN   rO   r   r   r   r   test_one_hot_encoder_pandas1  s   
r   zdrop, expected_namesx0_cx2_br   )r   Zx1_2r   )r   r   rF   x0_bZx2_a)r   binarymanualc                 C   s:   g dg dg}t | d}|| | }t|| d S )N)r   r   rE   )rF   r   rF   r   )r   r+   ra   r	   )r   Zexpected_namesr   r   rc   r   r   r   'test_one_hot_encoder_feature_names_drop:  s
   


r   c                  C   s   ddgddgddgg} t g dg dg dg}t d d	g}td
dd}|| }t|j| t|| ddgddgddgg} t ddgddgddgg}t d	d g}td
dd}|| }t|j| t|| d S )NrU   yes   nor[   )r   r   r   r   r7   )r   r   r   r   r   r   Fr   truerE   falser   r   )r   r   r   r   r	   	drop_idx_r   )r   expectedZexpected_drop_idxr   resultr   r   r   *test_one_hot_encoder_drop_equals_if_binaryL  s    


r   )rU   r   r|   )r   r   r|   )r   r   rf   c                 C   sT   t  }tjg dg dgdd}t|| |d t dd}t|| | d S )Nr   r   r   r   r   r   r   r<   float64)r   r   r   r	   r   Zastyper   r   r   r   test_ordinal_encoderd  s
   

r   )rf   r   zobject-string-catc                 C   s   t |d}tdgdgg}t|| | t|jd t|d ks%J |jd  t|d ks4J |jd j	|ks>J t |d}t
jtdd || W d    d S 1 s[w   Y  d S )Nrs   r   r   r   r(   r)   )r   r   r   r	   r   r   r?   r   r   r=   r,   r-   r.   r+   )r   r2   r   r   rb   r   r   r   r   )test_ordinal_encoder_specified_categoriesu  s   

"r   c                  C   s   g dg dg} t  }|| }tj| td}t||| tg dg dg}td}t	j
t|d || W d    d S 1 sGw   Y  d S )Nr}   rz   r<   )r   r   r   r   rJ   r   r)   )r   r   r   r   rf   r	   r   r   r   r,   r-   r.   )r   rb   r   r   r   r   r   r   test_ordinal_encoder_inverse  s   

"r   c                  C   s   t ddd} tjddgddgdd	ggtd
}tjddgddgddggtd
}| | | |}tjddgddgddggdd
}t|| | |}tjdd gd dgddggtd
}t|| d S )Nuse_encoded_valuer!   unknown_valuerE   xrF   yr   r   r<   ZxyZblar   r   r   r   )r   r   r   rf   r+   r/   r	   r   )rb   X_fitr   X_trans_encr   X_trans_invinv_expr   r   r   +test_ordinal_encoder_handle_unknowns_string  s     

 

 r   r=   c                 C   s   t ddd}tjddgddgdd	gg| d
}tjddgddgddgg| d
}|| ||}tjddgddgddggdd
}t|| ||}tjdd gd dgddggtd
}t|| d S )Nr   r   r      r      r   	   r<   rX      r   r   )r   r   r   r+   r/   r	   r   rf   )r=   rb   r   r   r   r   r   r   r   r   r   ,test_ordinal_encoder_handle_unknowns_numeric  s     

 

 r   c                  C   s`   t dtjd} tdgdgdgg}| | | dgdgdgg}t|dgdgtjgg d S )Nr   r   r   r   r   r%   r   )r   r   r   r   r+   r/   r	   )rb   r   r   r   r   r   (test_ordinal_encoder_handle_unknowns_nan  s
   
r   c                  C   sd   t dtjtd} tdgdgdgg}tjtdd | | W d    d S 1 s+w   Y  d S )Nr   )r!   r   r=   r   r   r   z'dtype parameter should be a float dtyper)   )	r   r   r   intr   r,   r-   r.   r+   )rb   r   r   r   r   8test_ordinal_encoder_handle_unknowns_nan_non_float_dtype  s   "r   c                  C   sj   t jg dgtdj} g d}t|d}d}tjt|d ||  W d    d S 1 s.w   Y  d S )N)LowMediumHighr   r   r<   )r   r   r   rs   z*Shape mismatch: if categories is an array,r)   )	r   r   rf   rA   r   r,   r-   r.   r+   )r   r   rb   r   r   r   r   +test_ordinal_encoder_raise_categories_shape  s   
"r   c                     sx  t ddtjg dg dgdd} tjddgd	d
ggddtjddgd	d
ggddtddgddggtddgddggtjddgd	dggddfD ]!   t fddtdD scJ t  |  qLddgd	d
gg   tfddtdD sJ t  |  ddgd	dgg   tfddtdD sJ t  |  d S )Nr>   rs   )r   r   r   r   )r   r   r   r   r   r<   r   r   r   r%   r   rE   rF   r   r      a   b   c   drf   c                    s   g | ]}j | j jkqS r   r   r=   .0ir   rb   r   r   
<listcomp>  s    z'test_encoder_dtypes.<locals>.<listcomp>c                    s"   g | ]}t  j| jt jqS r   )r   r   r   r=   integerr   rb   r   r   r     s   " c                       g | ]
} j | jd kqS )rf   r   r   r   r   r   r         )	r   r   r   r+   allranger	   r/   r   )r   r   r   r   test_encoder_dtypes  s&   

 

r  c                     s   t d} tdd tjg dg dgdd}| jdd	gd
dgddgddd} | t fddtd	D s<J t	 
| | | dd	gddgddgd}g d | t fddtd
D snJ t	 
| | d S )NrD   r>   rs   )r   r   r   r   r   r   )r   r   r   r   r   r   r   r<   r   r   r   r%   r   r   rH   rI   Cr   c                    r  )r   r   r   r   r   r   r     r  z.test_encoder_dtypes_pandas.<locals>.<listcomp>rE   rF   r   r   )r   rf   r   c                    s    g | ]} j | j| kqS r   r   r   rb   Zexpected_cat_typer   r   r     s     )r,   rL   r   r   r   rM   r+   r  r  r	   r/   r   )rN   r   r   r   r  r   test_encoder_dtypes_pandas  s   

"

 r	  c                  C   sX   t  } ddgddgg}t  td | | W d    d S 1 s%w   Y  d S )NrQ   r   rS   r   r&   )r   warningscatch_warningssimplefilterr   )rb   r   r   r   r   test_one_hot_encoder_warning  s   

"r  c                 C   s   ddgddgddgg}t | ddddgddggd}|| d	dgg}tddgg}d
}tjt|d ||}W d   n1 sDw   Y  t|| dS )z,Check handle_unknown='warn' works correctly.rE   r   rF   r   r   Fr$   r   r   r!   r?   r   qFound unknown categories in columns \[0\] during transform. These unknown categories will be encoded as all zerosr)   N	r   r+   r   r   r,   warnsUserWarningr/   r   )r   r   r   X_testrB   warn_msgr   r   r   r   test_ohe_handle_unknown_warn%  s    

r  missing_valuec           	      C   sn  dddd| g}t |d}g dg ddddd| gg}|| }g dg d	g d
g}t|| |j|u s8J dd t|j|jD }||}t	j
|td}t|d rt|d d |d d  t|d skJ t|d ssJ t|d d d df |d d d df  t|dd df |dd df  t|d sJ t|d sJ d S t|| t|| d S )Nr{   rX   r   r   r   )r~   rX   r   r|   rE   )r{   rX   r   r|   rE   )r   r   r   r   r   )r   r   r   r   r   r   c                 S   s   g | ]\}}|| qS r   r   )r   r   rk   r   r   r   r   M  s    z4test_one_hot_encoder_drop_manual.<locals>.<listcomp>r<   r6   )r6   r6   )r   r   r   r	   r   r   r   r   r   r   r   rf   r   )	r  Zcats_to_droprb   r   Ztransr   Zdropped_catsZX_inv_transZX_arrayr   r   r    test_one_hot_encoder_drop_manual?  s2   


*"
r  )r~   r   rT   rE   c                 C   s^   t | d}d}tjt|d |g dg dg dg W d    d S 1 s(w   Y  d S )Nr   z-`drop` should have length equal to the numberr)   r}   rz   )r{   r   ;   )r   r,   r-   r.   r+   )r   rb   rq   r   r   r   test_invalid_drop_lengthd  s
   
"r  densityr   ZdenserE   r   rF   r   c                 C   s   t | d}t | |d}g dg dg}|| || t|j|j |dkr/t|jd nt||j|jD ]\}}}|t| |ksFJ q7t|jtj	sPJ |jj
tksXJ d S )Nr   r   )r   r   rE   r  r   r   )r   r+   r	   r   r   r   r   r   r   Zndarrayr=   rf   )r  r   Zohe_baseZohe_testr   Zdrop_catZdrop_idxZcat_listr   r   r   test_categoriesl  s   



r  c                 C   s   |    jjs	J d S )N)Z__sklearn_tags__Z
input_tagscategorical)r   r   r   r   "test_encoders_has_categorical_tags  s   r  kwargsmax_categoriesmin_frequency   g(\?r   )r   r!  rX   r?   r>   rE   rF   r   r   c           
      C   s   t dgd dgd  dgd  dgd  gj}td|d	d
d| |}t|jg dg dgdgdgdgdgg}t ddgddgddgddgddgg}||}t|| dd dgdgd  D }|	|}t|| |
 }	tddg|	 dS )zpTest that different parameters for combine 'a', 'c', and 'd' into
    the infrequent category works as expected.rE   r   rF   r   r   rU   r   r   r#   F)r?   r!   r   rE   r   r   er   r   c                 S      g | ]}|gqS r   r   r   colr   r   r   r         z2test_ohe_infrequent_two_levels.<locals>.<listcomp>infrequent_sklearnr%   r   x0_infrequent_sklearnNr   r   r   rA   r   r+   r	   infrequent_categories_r/   r   r   ra   )
r  r?   X_trainr   r  r   r   expected_invX_invrc   r   r   r   test_ohe_infrequent_two_levels  s(   2(



r1  c                 C   s   t dgd dgd  dgd  dgd  gj}td	d
d| d|}|jd |jd  dks2J t dgdgg}||}tdgdgg| |	 }t
dg| ||}t
dgdgg| dS )z3Test two levels and dropping the frequent category.rE   r   rF   r   r   rU   r   r   r#   Fr   r!   r   r   r   r   r   r+  r*  N)r   r   rA   r   r+   r   r   r/   r   ra   r	   r   )r   r.  r   r  r   rc   	X_inverser   r   r   ,test_ohe_infrequent_two_levels_drop_frequent  s"   2

r4  c                 C   s   t dgd dgd  dgd  dgd  gj}td	d
d| d}d| d d}tjt|d || W d   dS 1 sAw   Y  dS )z_Test two levels and dropping any infrequent category removes the
    whole infrequent category.rE   r   rF   r   r   rU   r   r   r#   Fr   r2  Unable to drop category r   ( from feature 0 because it is infrequentr)   Nr   r   rA   r   r,   r-   r.   r+   r   r.  r   r   r   r   r   5test_ohe_infrequent_two_levels_drop_infrequent_errors  s   2"r9  r   gQ?g{Gz?r   c           	      C   s   t dgd dgd  dgd  dgd  gj}tdd	d
d| |}t|jddgg dgdgdgdgdgg}t g dg dg dg dg dg}||}t|| dgdgdgdgdgg}|	|}t|| |
 }tg d| dS )zkTest that different parameters for combing 'a', and 'd' into
    the infrequent category works as expected.rE   r   rF   r   r   rU   r   r   r#   Fr!   r   r%  r   r   r   r   r   r*  )r   r   r+  Nr   r,  )	r  r.  r   r  r   r   r/  r0  rc   r   r   r    test_ohe_infrequent_three_levels  s.   2(



r<  c                 C   s   t dgd dgd  dgd  dgd  gj}td	d
d| d|}t dgdgdgg}tddgddgddgg|| |jdd| d}tj	t
|d |dgdgg}W d   n1 sfw   Y  tddgddgg| dS )z5Test three levels and dropping the frequent category.rE   r   rF   r   r   rU   r   r   r#   Fr2  r   r   r"   r'   r(   r)   r%  N)r   r   rA   r   r+   r   r/   rt   r,   r  r  )r   r.  r   r  r   r   r   r   r   .test_ohe_infrequent_three_levels_drop_frequent  s"   2"r=  c                 C   s   t dgd dgd  dgd  dgd  gj}td	d
d| d}d| d d}tjt|d || W d   dS 1 sAw   Y  dS )z7Test three levels and dropping the infrequent category.rE   r   rF   r   r   rU   r   r   r#   Fr2  r5  r   r6  r)   Nr7  r8  r   r   r   7test_ohe_infrequent_three_levels_drop_infrequent_errors  s   2"r>  c                  C   s   t dgd dgd  dgd  dgd  gj} td	d
dd| }t|jddgg dgdgdgdgg}t g dg dg dg dg}||}t|| dgg}d}t	j
t|d || W d   dS 1 sow   Y  dS )zmTest that different parameters for combining 'a', and 'd' into
    the infrequent category works as expected.rE   r   rF   r   r   rU   r   r   r&   F)r!   r   r   r   r;  r   badz.Found unknown categories \['bad'\] in column 0r)   N)r   r   rA   r   r+   r	   r-  r/   r   r,   r-   r.   )r.  r   r  r   r   r   r   r   r   (test_ohe_infrequent_handle_unknown_error'  s    2"

"r@  c                 C   s   t jdgd dgd  gtdj}tdg dgddd	| |}dgd
gdgdgdgg}t ddgddgddgddgddgg}||}t|| dddgg}dgdgg}|D ]}|j|d| tdgdgg|| qZdS )zG'a' is the only frequent category, all other categories are infrequent.rE   r   r%  r[   r<   r   r   rE   rF   Fr#   r?   r   r!   rF   r   r   r   r   r   r   r   Nr   )	r   r   rf   rA   r   r+   r/   r   rt   )r  r.  r   r  r   r   Zdropsr   r   r   r   5test_ohe_infrequent_two_levels_user_cats_one_frequent?  s(   "(

rC  c                  C   s   t jdgd dgd  dgd  dgd  gtd	j} tg d
gdddd| }t|jg dg dgdgdgdgdgg}t ddgddgddgddgddgg}||}t	|| dd dgdgd  D }|
|}t|| dS )zFTest that the order of the categories provided by a user is respected.rE   r   rF   r   r   rU   r   r   r<   rA  Fr#   r   r?   r   r!   r   )r   r   rE   r%  r   r   c                 S   r&  r   r   r'  r   r   r   r   q  r)  z<test_ohe_infrequent_two_levels_user_cats.<locals>.<listcomp>r*  r%   Nr   r   rf   rA   r   r+   r	   r-  r/   r   r   r.  r   r  r   r   r/  r0  r   r   r   (test_ohe_infrequent_two_levels_user_cats[  s*   *(


rG  c                  C   s   t jdgd dgd  dgd  dgd  gtd	j} tg d
gdddd| }t|jddgg dgdgdgdgdgg}t g dg dg dg dg dg}||}t	|| dgdgdgdgdgg}|
|}t|| dS )zTest that the order of the categories provided by a user is respected.
    In this case 'c' is encoded as the first category and 'b' is encoded
    as the second one.rE   r   rF   r   r   rU   r   r   r<   r   r   rF   rE   Fr#   rD  r%  r   r;  r   r*  NrE  rF  r   r   r   *test_ohe_infrequent_three_levels_user_catsv  s4   *(


rI  c                  C   sb   t jg dg df } tdddd}||  ddgddgg}||}t|g d	g d
g dS )zaTest infrequent categories where feature 0 has infrequent categories,
    and feature 1 does not.	r   r   r   r   r   r   r   r   r   	r   r   r   r   r   r   r   r   r   r   r   F)r   r   r   r   r   r   r   r   r   )r   r   r   r   N)r   c_r   r+   r/   r   )r   r   r  r   r   r   r   test_ohe_infrequent_mixed  s   

rN  c            	   
   C   s  t jg dg dg df } tdddd}||  }t|jd d	d
g t|jd	 d	dg t|jd
 d | }tg d| g dg dg dg dg dg dg dg dg dg	}t|| g dg dg}|	|}g dg dg}t||  |
|}t jg dg dgtd}t|| tdddd| }tjtdd |	| W d   n1 sw   Y  g d g d!g}|	|}g d"g dg}t||  |
|}t jg d#g d$gtd}t|| dS )%z?Test infrequent categories with feature matrix with 3 features.rJ  )	r   r   r   r   r   rU   r   r   r   )	r   r   r   r   r   r   r   r   r   r>   r   r#   r?   r   r!   r   r   r   rU   N)Zx0_0Zx0_3r+  Zx1_0Zx1_5Zx1_infrequent_sklearnZx2_0Zx2_1)r   r   r   r   r   r   r   r   )r   r   r   r   r   r   r   r   )r   r   r   r   r   r   r   r   )r   r   r   r   r   r   r   r   )r   r   r   r   r   r   r   r   )r   r   r   r   r   r   r   r   )r   r   r   r   r   r   r   r   )r   r   r   r   r   r   r   r   )r   r   r   )r%   r   r   )r   r   r   r   r   r   r   r   )r   r   r   r   r   r   r   r   )r   r*  N)r*  r   Nr<   r&   r(   r)   )r   r   r   )r   rU   r   )r   r   r   r   r   r   r   r   )r*  r*  r   )r   r*  r   )r   rM  r   r   r   r	   r-  ra   r   r/   r   r   rf   r+   r,   r-   r.   )	r   r   r   rc   r   r  X_test_transr0  r/  r   r   r   'test_ohe_infrequent_multiple_categories  sp   






rQ  c            	   
   C   s  t d} | jg dg ddddgd}tdd	d
d}|| }t|jd ddg t|jd g d g dg dg dg dg dg dg dg dg dg	}t|| | jddgddgdddgd}g dg dg}|	|}t||  |
|}tjddgddggtd}t|| | jddgddgdddgd}|	| }g dg dg}t|| |
|}tjddgddggtd}t|| dS )zHTest infrequent categories with a pandas dataframe with multiple dtypes.rD   	rE   fr   rS  rS  rE   r   rF   rF   	r   r   r   rU   rU   rX   r   r   r   )strr   rU  r   columnsr>   r   r#   rO  r   rE   rF   r   r   r   rX   )r   r   r   r   r   r   )r   r   r   r   r   r   )r   r   r   r   r   r   )r   r   r   r   r   r   )r   r   r   r   r   r   rS     rX   r*  r<   r   r   N)r,   rL   rM   r   r   r   r	   r-  r   r/   r   r   r   rf   )	rN   r   r   r   r   r  rP  r0  r/  r   r   r   .test_ohe_infrequent_multiple_categories_dtypes  sV   
	
 


 

rZ  rZ   )r!  r   c                 C   sp   t dgd dgd  dgd  dgd  gj}tdd	d
d| }|| |dgg}t|dgg dS ),All user provided categories are infrequent.rE   r   rF   r   r   rU   r   r   r#   Fr:  r   Nr   )r   r   rA   r   r+   r/   r   r  r.  r   r   r   r   r   $test_ohe_infrequent_one_level_errorsH  s   2
r]  c                 C   sb   t jdgd gtdj}tdg dgddd| |}|dgdgg}t|d	gd	gg d
S )r[  r%  r   r<   rA  Fr#   rB  rE   r   Nr   )r   r   rf   rA   r   r+   r/   r   r\  r   r   r   5test_ohe_infrequent_user_cats_unknown_training_errorsV  s   r^  zinput_dtype, category_dtype)ZOOZOUZUOZUUSOZSUZSS
array_type)r   r   Z	dataframec           
      C   s   t jdgdgg| d}t jddg|dg}t|dd|}tdgdgdgdgg|| d}||}t ddgddgddgddgg}t|| t|d|}	|	|}t dgdgdgdgg}t|| d	S )
a"  Check that encoding work with object, unicode, and byte string dtypes.
    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/15616
    https://github.com/scikit-learn/scikit-learn/issues/15726
    https://github.com/scikit-learn/scikit-learn/issues/19677
    rF   rE   r<   Frw   r   r   rs   N)	r   r   r   r+   r   r/   r   r   r	   )
r;   Zcategory_dtyper`  r   r?   r   r  r   r   oer   r   r   test_encoders_string_categoriesg  s   
"

rb  c                  C   s~   t jdgdggdd} t jddgddg}t|dd}td}tjt|d	 ||  W d
   d
S 1 s8w   Y  d
S )zCheck that this mixture of predefined categories and X raises an error.

    Categories defined as bytes can not easily be compared to data that is
    a string.
    rF   rE   Ur<   SFrw   zjIn column 0, the predefined categories have type 'bytes' which is incompatible with values of type 'str_'.r)   N)	r   r   r   r   r   r,   r-   r.   r+   )r   r?   r   r   r   r   r   $test_mixed_string_bytes_categoricals  s   "re  c                 C   sP   t jdd| d| ggtdj}tddd|}| }t|ddd	|  g d S )
NrE   rF   r<   Fr"   r   r!   Zx0_ar   Zx0_)r   r   rf   rA   r   r+   ra   r	   )r  r   r   namesr   r   r   )test_ohe_missing_values_get_feature_names  s   rh  c                  C   sr   t d} | jg dtjdddtjgtdddd	gd
}tg dg dg dg dg}t|}t|| d S )NrD   )dogr   Nr   r   r   r%   r<   )col1col2rj  rk  rV  )r   r   r   r   r   r   r   )r   r   r   r   r   r   r   )r   r   r   r   r   r   r   )r   r   r   r   r   r   r   )	r,   rL   rM   r   r   r   floatry   r   )rN   dfexpected_df_transr   r   r   r   %test_ohe_missing_value_support_pandas  s    
	ro  pd_nan_typepd.NAznp.nanc              	   C   s   t d}| dkr|jntj}|d|jdd|ddgddi}tg d	g d
g dg dg d
g}td|d}|	|}t
|| t|jdksMJ t|jd d d g d t|jd d sgJ d S )NrD   rq  rj  r   rE   rF   rl   r<   )r   r   r   r   )r   r   r   r   )r   r   r   r   rL  Frf  r   r   r6   r   )r,   rL   NAr   r   rM   r   r   r   r   r   lenr   r	   isnan)rp  r!   rN   pd_missing_valuerm  rn  r   df_transr   r   r   1test_ohe_missing_value_support_pandas_categorical  s(   



rw  c                 C   s   ddgddgddgg}t dd| d}||}tg d	g d
g dg}t|| ddgg}tg d	g}d}tjt|d ||}W d   n1 sPw   Y  t|| |	|}t
|tjddggtd dS )zZCheck drop='first' and handle_unknown='ignore'/'infrequent_if_exist'
    during transform.rE   r   rF   r   r   r   Fr   r   r!   r   r   )r   r   r   r   r   tFound unknown categories in columns \[0, 1\] during transform. These unknown categories will be encoded as all zerosr)   Nr<   r   r   r   r   r   r,   r  r  r/   r   r	   rf   r!   r   r   r   rB   r  r  r0  r   r   r   /test_ohe_drop_first_handle_unknown_ignore_warns  s,   




r|  c                 C   s   ddgddgddgg}t dd| d}||}tg d	g d
g dg}t|| ddgg}tg dg}d}tjt|d ||}W d   n1 sPw   Y  t|| |	|}t
|tjddggtd dS )zDCheck drop='if_binary' and handle_unknown='ignore' during transform.rE   r   rF   r   r   r   Frx  rL  r   rJ   r   r   )r   r   r   r   ry  r)   Nr<   rz  r{  r   r   r   3test_ohe_drop_if_binary_handle_unknown_ignore_warns  s,   




r}  c                 C   s   ddgddgddgg}t dd| ddgddggd}|| d	dgg}tddgg}d
}tjt|d ||}W d   n1 sDw   Y  t|| dS )znCheck drop='first' and handle_unknown='ignore'/'infrequent_if_exist'
    during fit with categories passed in.rE   r   rF   r   r   r   Fr  r   r  r)   Nr  )r!   r   r   r  rB   r  r   r   r   r   'test_ohe_drop_first_explicit_categories&  s    

r~  c                  C   s   t d} | jg dg ddddgd}tdd	}|jdd
 d}t jt|d || W d   n1 s9w   Y  || t jt|d |	| W d   dS 1 s[w   Y  dS )zJRaise informative error message when pandas output and sparse_output=True.rD   r   )r   rF   rF   )rE   rF   rE   rF   rV  Tr   r/   zxPandas output does not support sparse data. Set sparse_output=False to output pandas dataframes or disable Pandas outputr)   N)
r,   rL   rM   r   
set_outputr-   r.   r   r+   r/   )rN   rm  r   r   r   r   r   'test_ohe_more_informative_error_messageA  s   
 

"r  c                  C   sn   t t jdddggj} tt jd}dt j }tjt|d |	|  W d   dS 1 s0w   Y  dS )zDTest ordinal encoder with nan passthrough fails when dtype=np.int32.r   r   r<   zdThere are missing values in features \[0\]. For OrdinalEncoder to encode missing values with dtype: r)   N)
r   r   r   rA   r   int32r,   r-   r.   r+   )r   ra  r   r   r   r   Btest_ordinal_encoder_passthrough_missing_values_float_errors_dtypeU  s   "r  encoded_missing_valuer   c                 C   s   t jt jdddggt jdj}t| d|}t|jdks J t	|jd ddt jg |
|}t	|| gdgdgdgg ||}t	|| dS )	z.Test ordinal encoder with nan on float dtypes.r   r   r<   r  r   r   r   N)r   r   r   r   rA   r   r+   rs  r   r   r/   r   )r  r   ra  r   r3  r   r   r   5test_ordinal_encoder_passthrough_missing_values_floatc  s   

r  c              	   C   s   t d}| dkr|jntj}|d|jdd|ddgddi}t|d	|}t	|j
d
ks1J t|j
d dd g d t|j
d d sKJ ||}t|dgdg|gdgdgg ||}|jdkskJ t|dddf ddg t|dddf ddg t|d sJ dS )z0Check ordinal encoder is compatible with pandas.rD   rq  rj  r   rE   rF   rl   r<   r  r   r   Nr   r   r6          @r   r   )r   r   r   r   )r,   rL   rr  r   r   rM   r   r   r+   rs  r   r	   rt  r/   r   r   r   )rp  r  rN   ru  rm  ra  rv  r3  r   r   r   =test_ordinal_encoder_missing_value_support_pandas_categoricalu  s"   


r  r  )zobject-None-missing-valuezobject-nan-missing_valueznumeric-missing-valuec                 C   s   t |d}tdgtjgg}t|| | |jd j|ks!J t |d}tj	t
dd || W d   dS 1 s>w   Y  dS )z.Test ordinal encoder for specified categories.rs   r   r   r(   r)   N)r   r   r   r   r	   r   r   r=   r,   r-   r.   r+   )r   r2   r   r   ra  r   r   r   r   =test_ordinal_encoder_specified_categories_missing_passthrough  s   
&
"r  c                 C   sr   t jg dtdg}| |d}t jddggtdj}tjtdd || W d   dS 1 s2w   Y  dS )	zTest encoder for specified categories have duplicate values.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/27088
    )rE   rF   rE   r<   rs   rE   rF   z5the predefined categories contain duplicate elements.r)   N)r   r   rf   rA   r,   r-   r.   r+   r   r   r   r   +test_encoder_duplicate_specified_categories  s   
"r  zX, expected_X_trans, X_testr   r   )r   r   r   )r   r  r   r   )r   rE   rF   )r  r   r   c                 C   s8   t ddd}|| }t|| t||dgg dS )z>Test the interaction between missing values and handle_unknownr   r6   r   g      N)r   r   r   r/   )r   Zexpected_X_transr  ra  r   r   r   r   /test_ordinal_encoder_handle_missing_and_unknown  s   

r  csr_containerc                 C   s   t g dg dg}| |}t }d}tjt|d || W d   n1 s+w   Y  tjt|d || W d   n1 sGw   Y  ||}| |}tjt|d || W d   dS 1 smw   Y  dS )zCheck that we raise proper error with sparse input in OrdinalEncoder.
    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/19878
    r   r   z2Sparse data was passed, but dense data is requiredr)   N)	r   r   r   r,   r-   rp   r+   r   r   )r  r   ZX_sparseencoderrq   r   r   r   r   r   test_ordinal_encoder_sparse  s   
"r  c                  C   s   t g dddt jf } tg dgddd}||  tg dgdd}tjtd	d
 ||  W d   dS 1 s>w   Y  dS )zCheck OrdinalEncoder.fit works with unseen category when
    `handle_unknown="use_encoded_value"`.
    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/19872
    )r   r   r   r   r   r   N)r6   r   r   r   r   )r?   r!   r   r&   r   r(   r)   )r   r   Znewaxisr   r+   r,   r-   r.   )r   ra  r   r   r   -test_ordinal_encoder_fit_with_unseen_category  s   
"r  r.  ZAAOrc  r  c                 C   s4   t ddd}||  ||}t|ddgg dS )zChecks that `OrdinalEncoder` transforms string dtypes.
    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/19872
    r   ir   r   N)r   r+   r/   r   )r.  r  rb   r   r   r   r   1test_ordinal_encoder_handle_unknown_string_dtypes&  s   

r  c                  C   sb   t g ddd} t | }t|jt j| ddj |	| }t|dgdgdgdgg dS )	zCheck that `OrdinalEncoder` accepts Python integers that are potentially
    larger than 64 bits.
    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/20721
    )l   	HP
1& l   	H]viel   	 :?i}Ga l   IRK2e6kr6   r   r   )Zaxisr   r   N)
r   r   r8   r   r+   r	   r   sortrA   r/   )r   r  r   r   r   r   #test_ordinal_encoder_python_integerB  s   
r  c                  C   sH   t d} g d}| jg dg|d}t |}| }t|| dS )z-Check feature names out is same as the input.rD   )rF   r   rE   r   rV  N)r,   rL   rM   r   r+   ra   r	   )rN   rg  r   rb   Zfeature_names_outr   r   r   .test_ordinal_encoder_features_names_out_pandasV  s   
r  c                  C   s   t jdgdgt jggtd} tdt jdd| }|| }t|dgdgdgg t jd	gt jggtd}||}t|t jgdgg ||}|d d d
u sSJ t 	|d d s^J d
S )zECheck interactions between encode_unknown and missing value encoding.rE   rF   r<   r   r!   r   r  r   r   r   N)
r   r   r   rf   r   r+   r/   r   r   rt  )r   ra  r   r  rP  X_roundtripr   r   r   0test_ordinal_encoder_unknown_missing_interactionb  s    


r  with_pandasc                 C   s   t jddgddgdt jggtd}d}| r(td}|j|d	d
gd}|d }n|d }tdd}tjt	|d |
| W d   dS 1 sIw   Y  dS )zXCheck OrdinalEncoder errors when encoded_missing_value is used by
    an known category.rE   ri  rF   r   r   r<   zTencoded_missing_value \(1\) is already used to encode a known category in features: rD   letterZpetrV  z	\['pet'\]z\[1\]r   r  r)   N)r   r   r   rf   r,   rL   rM   r   r-   r.   r+   )r  r   	error_msgrN   ra  r   r   r   0test_ordinal_encoder_encoded_missing_value_error  s   "


"r  z4X_train, X_test_trans_expected, X_roundtrip_expected1c                 C   s   t dtjtjd| }tdgtjgdgg}||}t|| ||}|jd }t	|D ]+}||df }	||df }
|	du rI|
du sHJ q0t
|	rUt|
sTJ q0|
|	ks[J q0dS )znCheck transform when unknown_value and encoded_missing_value is nan.

    Non-regression test for #24082.
    r   r  r  rF   r   N)r   r   r   r+   r   r/   r   r   r   r  r   rt  )r.  ZX_test_trans_expectedZX_roundtrip_expectedra  r  rP  r  Z	n_samplesr   Zexpected_valvalr   r   r   9test_ordinal_encoder_unknown_missing_interaction_both_nan  s*   



r  c                  C   s   t d} | ddgddgd}t }|jdd d}t jt|d	 || W d
   n1 s3w   Y  tddjdd}tddjdd}||}||}t|	 | t
| |j d
S )z*Check OneHotEncoder works with set_output.rD   rE   rF   r   r   rG   r  zCPandas output does not support sparse data. Set sparse_output=Falser)   NFr   default)r,   rL   rM   r   r  r-   r.   r   r   to_numpyr	   ra   rW  )rN   rO   r   r*   Zohe_defaultZ
ohe_pandas	X_defaultX_pandasr   r   r   test_one_hot_encoder_set_output  s   


r  c                  C   st   t d} | ddgddgd}t jdd}t jdd}||}||}t| | t|	 |j
 d	S )
z+Check OrdinalEncoder works with set_output.rD   rE   rF   r   r   rG   r  r  N)r,   rL   rM   r   r  r   r   r  r	   ra   rW  )rN   rO   Zord_defaultZ
ord_pandasr  r  r   r   r   test_ordinal_set_output  s   


r  c                  C   st   g dddgg} t | d}|ddgg t| t|jks J t|jD ]\}}|jtks0J t| | | q%dS )zjCheck that the categories_ dtype is `object` for string categories

    Regression test for gh-25171.
    )asZmmasZeasZrasZacsr  2rs   r  N)r   r+   rs  r   	enumerater=   rf   r	   )r?   rb   nr   r   r   r    test_predefined_categories_dtype  s   
r  c                  C   s~   t jdgdgt jggtd} tdd| }t|dgdgdgg tddd	| }t d
gg}||}t|dgg dS )zBCheck missing value or unknown encoding can equal the cardinality.ri  r   r<   r   r  r   r   r   r   snakeN)	r   r   r   rf   r   r   r   r+   r/   )r   r   rb   r  r   r   r   1test_ordinal_encoder_missing_unknown_encoding_max  s   
r  c                  C   s  t jdgd dgd  dgd  dgd  dgd  gtdj} tdd	d
d| }t| g d |jd |j	d  dksAJ t jdgd dgd  dgd  gtdj} tdd	dd| }t| dg |jd |j	d  dkswJ t jdgd dgd  dgd  dgd  dgd  gtdj} tdd	dgd| }t| g d |jd |j	d  dksJ tdd	dd| }t| g d |j	du sJ dS )zkCheck drop_idx is defined correctly with infrequent categories.

    Non-regression test for gh-25550.
    rE   r   rF   r%   r   r   r%  r<   Fr   )r!  r   r   )r   x0_dx0_er+  r   rU   r   r+  )r   r   r  r+  N)r   r   r  r  r+  )
r   r   rf   rA   r   r+   r	   ra   r   r   )r   r   r   r   r   #test_drop_idx_infrequent_categories  s<   4,4r  c                 C   s   t dgd dgd  dgd  dgd  gj}tdd	d
d| |}t|jg dg t|jddgg dgdgdgdgdgg}dgdgdgdgd
gg}||}t	|| |
|}dgdgdgdgdgg}t|| dS )zGTest parameters for grouping 'a', and 'd' into the infrequent category.rE   r   rF   r   r   rU   r   r   r   r6   r   r#  r   r   r   r   r*  Nr   )r   r   rA   r   r+   r	   r   r-  r/   r   r   )r  r.  ordinalr  expected_transr   r3  expected_inverser   r   r   ,test_ordinal_encoder_infrequent_three_levels6  s,   2


r  c                  C   s   t jdgd dgd  dgd  dgd  gtd	j} tg d
gdddd| }t|jg d
g t|jddgg dgdgdgdgdgg}dgdgdgdgdgg}|	|}t
|| ||}dgdgdgdgdgg}t|| dS )zTest that the order of the categories provided by a user is respected.

    In this case 'c' is encoded as the first category and 'b' is encoded
    as the second one.
    rE   r   rF   r   r   rU   r   r   r<   rH  r   r6   )r?   r   r!   r   r   r   r   r   r*  N)r   r   rf   rA   r   r+   r	   r   r-  r/   r   r   )r.  r  r  r  r   r3  r  r   r   r   6test_ordinal_encoder_infrequent_three_levels_user_cats]  s6   *


r  c                  C   s   t g dg df} tdd| }t|jd ddg |jd du s&J ddgddgg}ddgddgg}||}t|| ||}t j	ddgd	dggt
d
}t|| dS )zETest when feature 0 has infrequent categories and feature 1 does not.rJ  rK  r   r   r   r   r   Nr*  r<   )r   Zcolumn_stackr   r+   r	   r-  r/   r   r   r   rf   )r   r  r  r  r   r3  r  r   r   r   %test_ordinal_encoder_infrequent_mixed  s   


r  c                  C   s   t d} | g d}| jg dg d| jdgd dgd  d	g d
g |ddg dd}tdd|}t|jd ddg t|jd g d t|jd d
d	g | jg dg d| jdgd	g d
g dg |ddg dd}g dg dg dg dg}|	|}t
|| dS )zHTest infrequent categories with a pandas DataFrame with multiple dtypes.rD   )birdr   ri  r  rR  rT  ri  r%   r   r   r  r  r<   )rU  r   r  rV  r  r   rE   rF   r   rX  r   )rE   rF   rS  r   )rX   r   rU   r   )r   r   r   )r   r   r   )r   r   r   r;  N)r,   rL   ZCategoricalDtyperM   r   r   r+   r	   r-  r/   r   )rN   Zcategorical_dtyper   r  r  r  r   r   r   r   :test_ordinal_encoder_infrequent_multiple_categories_dtypes  s:   


r  c                  C   s   t jdgd dgd  dgd  dgd  t jg gtd	j} td
dddd| }t|jg dg t jdgdgdgdgdgt jggtd	}dgdgdgdgdgdgg}|	|}t
|| dS )zJCheck behavior of unknown_value and encoded_missing_value with infrequent.rE   r   rF   r   r   rU   r   r   r<   r   r   )r!   r   r   r  r$  r%  r   r   N)r   r   r   rf   rA   r   r+   r	   r-  r/   r   )r.  r  r  r  r   r   r   r   .test_ordinal_encoder_infrequent_custom_mapping  s$   2(
r  c                 C   s   t jdgd dgd  dgd  dgd  gtd	j}tdi | d
dd|}td
dd|}dgdgdgdgdgg}t|||| dS )zMAll categories are considered frequent have same encoding as default encoder.rE   r   rF   r   r   rU   r   r   r<   r   r6   r   r%  Nr   r   r   rf   rA   r   r+   r   r/   )r  r.  Zadjusted_encoderZdefault_encoderr  r   r   r   !test_ordinal_encoder_all_frequent  s*   	*r  d   c                 C   s   t jdgd dgd  dgd  dgd  gtd	j}tdi | d
dd|}dgdgdgdgdgg}t||dgdgdgdgdgg dS )zAWhen all categories are infrequent, they are all encoded as zero.rE   r   rF   r   r   rU   r   r   r<   r   r6   r   r%  r   Nr   r  )r  r.  r  r  r   r   r   #test_ordinal_encoder_all_infrequent  s   	*(r  c                  C   s   t jt jgd dgd  dgd  dg dg gtdj} td	d
| }t jdddt jggtdj}||}t|dgdgdgt jgg dS )z5Check behavior when missing value appears frequently.r   ri  rU   r   r   r  deerr<   r   r  r   r   r   N	r   r   r   rf   rA   r   r+   r/   r   r   r  r  r   r   r   r   -test_ordinal_encoder_missing_appears_frequent
	  s   ,
 r  c                  C   s   t jt jgdgd  dgd  dg dg dgd d	gd  gtd
j} tdd| }t jddgdd	gt jd	gdd	gddggtd
}||}t|ddgddgt jdgddgddgg dS )z7Check behavior when missing value appears infrequently.ri  rU   r   r   r  r  redr   greenr<   r%   )r!  r   r   r   Nr  r  r   r   r   /test_ordinal_encoder_missing_appears_infrequent	  s(   &

.r  c                 C   sd   t jdgdgdggtd}| g dgd}tt || W d   dS 1 s+w   Y  dS )a!  Check that we raise a `NotFittedError` by calling transform before fit with
    the encoders.

    One could expect that the passing the `categories` argument to the encoder
    would make it stateless. However, `fit` is making a couple of check, such as the
    position of `np.nan`.
    rH   rI   r  r<   r  rs   N)r   r   rf   r,   r-   r   r/   )r   r   r  r   r   r   test_encoder_not_fitted3	  s
   	"r  )r   r
  numpyr   r,   Zscipyr   Zsklearn.exceptionsr   Zsklearn.preprocessingr   r   Zsklearn.utils._missingr   Zsklearn.utils._testingr   r   r	   Zsklearn.utils.fixesr
   r    markZparametrizer4   r9   r  Zfloat32r   rC   rP   rd   rg   rr   rv   ry   r   rf   r   rl  r   r   r   r   r   r   r   r   r   Zstr_r   rA   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r  r	  r  r  r  r  r  r  r1  r4  r9  r<  r=  r>  r@  rC  rG  rI  rN  rQ  rZ  r]  r^  rb  re  rh  ro  rw  r|  r}  r~  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r@   r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r   r   r   r   <module>   s   


<


/*


 &&* 
!&1
	


		
	




$








$[A



%
$

		"

!$$0