o
    \il                  	   @   s  d dl Z d dlZd dlZd dlmZmZ d dlmZ d dl	m
Z
 d dlmZmZmZmZmZ d dlmZ d dlmZmZmZmZ dd	 Zejd
ejg dejdgdfejddejgejdgdfejg dedgdfdgejdddgejdddgdd Z ejdejg dejdgdfejg dedgddgfgejdeg deg d gejdddgd!d" Z!ejd#ejd gd$ d%gd$  d&g gejdj"g dgfejd'gd$ d(gd$  d)g gedj"g d*gfgejdd+dgd,d- Z#ejd.g d/d0feg d1g dgj"d2fgd3d4 Z$d5d6 Z%ejd7d%d8gd$ d9d:gfg dd; d%d8g g d<fg d=d; d>d?g g d@fgdAdB Z&ejdCdDdEgejdddgejdg dFdGdH Z'ejjdIedJgdK dJfed gdK d fejdLgdK edd fgg dMdNejdg dOdPdQ Z(dRdS Z)dTdU Z*ejdg dVdWdX Z+ejddYdgdZd[ Z,d\d] Z-dS )^    N)assert_allcloseassert_array_equal)RandomForestRegressor)Ridge)KFoldShuffleSplitStratifiedKFoldcross_val_scoretrain_test_split)make_pipeline)KBinsDiscretizerLabelBinarizerLabelEncoderTargetEncoderc                 C   s   t j|t jd}t |}|dkrQt |}t|D ]4}|| |k }|jd }	|	dkr0|||< qt |}
|
| }|	|	|  }|t | d| |  ||< q|S t|D ]}|| |k }t |||  }|jd | }|| ||< qU|S )z0Simple Python implementation of target encoding.dtypeautor      )npzerosfloat64meanvarrangeshapesum)	X_ordinal	y_numericn_categoriessmoothcur_encodingsy_mean
y_variancecy_subsetn_iy_subset_variancemlambda_current_sumcurrent_cnt r+   /var/www/www-root/data/www/176.119.141.140/sports-predictor/venv/lib/python3.10/site-packages/sklearn/preprocessing/tests/test_target_encoder.py_encode_target   s*   



 r-   zcategories, unknown_valuer   r      r            ?      @      @)catdogsnakebear)r      r         @r   target_typebinary
continuousc                 C   s  d}t jdgd dgd  dgd  gt jdj}t jg d	gt jdj}|jd }| d
kr3|}	|}
n| d | }	| d | }
t |
|ggf}
t j|}d}|dkrj|jdd|d}t jddgt	d}|| }n|dkspJ |j
dd|d}|}||}|| }|	| }	|| }|| }|dkrt||dd}nt||dd}t j|t jd}|||D ]"\}}||df || }}t||||}|||df  ||df< qt|| ||d}||	|}|j|ksJ t|| t|jdksJ |dkrt|j| n|jdu s
J t |}t|dddf |||}t|jd | |jt|ks/J t |t |gfdd}||
}t|| dS )zCheck encoding for binary and continuous targets.

    Compare the values returned by `TargetEncoder.fit_transform` against the
    expected encodings for cv splits from a naive reference Python
    implementation in _encode_target.
    r8   r      r      r/   (   r   r.   r   r;   lowhighsizer4   r5   r<   Tn_splitsrandom_stateshuffle)r   
categoriescvrG   N) r   arrayint64Tr   concatenaterandomRandomStaterandintobjectuniformpermutationr   r   
empty_liker   splitr-   r   fit_transformtarget_type_r   len
encodings_r   classes_r   target_mean_pytestapproxreshape	transform)rI   unknown_valueglobal_random_seedr   r:   r   X_train_int_arrayX_test_int_array	n_samplesX_trainX_testdata_rngrF   r   target_namesy_trainshuffled_idxrJ   expected_X_fit_transform	train_idxtest_idxX_y_r    target_encoderX_fit_transformr!   expected_encodingsexpected_X_test_transformX_test_transformr+   r+   r,   test_encoding7   s|   .







rw   zcategories, unknown_valuesrabbittarget_labels)r   r/   r8   )abr#   c           .      C   s  t j| }d}d}t |jdd|d}t |jdd|d}	|d | }
|d |	 }t |
|f}t ||	f}ddgg dg}d}t |jd||d}|| }t |}d}t|| dd	}t j	|j
d |j
d | ft jd
}t|D ]D\}}t|D ];}|||D ]2\}}|dd|f }|||f || }}t||t||}|||  }||||f  |||f< qqqxt||| d} | ||}!| jdksJ t|!| g }"t|D ]'\}}t|D ]}|dd|f }t|dd|f |t||}|"| qqt| j|| ksJ t|| D ]}#t| j|# |"|#  qt| j| t ddgddgddgg}$|dkr@|$}%n3t j|$ddddf td
}%t|$j
d D ]}&|d |$dd|&f  |%dd|&f< qVt |%|f}%t j|dd}'t j	|$j
d |$j
d | ft jd
}(|$j
d })g d}t|)d D ]}*t|"D ]\}#}+|+|$|*||# f  |(|*|#f< qqg d},t|| D ]}#|'|,|#  |(|)d |#f< q| |%}-t|-|( dS )z&Check encoding for multiclass targets.P   r/   r   r@   r8   r   r.   TrE   r   Nr   rJ   rG   
multiclassr0      r   rK   axis)r   r   r   r   r   r   )r   r   r/   r   r   r/   )r   rP   rQ   rL   rR   column_stackr   rX   r   emptyr   r   	enumerater   rW   r-   rZ   r   rY   r   appendr[   r   r\   rV   rS   vstackr   ra   ).rc   rI   unknown_valuesry   r   rngrf   
n_features
feat_1_int
feat_2_intfeat_1feat_2rg   X_train_intcategories_	n_classesy_train_intrk   y_train_encrF   rJ   rm   f_idxcatsc_idxrn   ro   y_classrp   rq   current_encodingexp_idxrr   rs   rt   i
X_test_intrh   
column_idxr!   ru   n_rowsrow_idxencmean_idxrv   r+   r+   r,   test_encoding_multiclass   s   


(
 
r   zX, categories
   r   r8   r4   r5   r6   )r5   r4   cow      @c                 C   s   t jd}|jdd| jd d}t||dd| |}| }|| dd }|d t	
|ks4J t|jd	ks=J |jd d t	
|ksKJ dS )
zHCustom categories with unknown categories that are not in training data.r   rD   r=   r@   )rI   r   rG   rK   N)r   r   r   )r   rP   rQ   rT   r   r   fitr   ra   r^   r_   rZ   r[   )XrI   r   r   yr   r!   X_transr+   r+   r,   test_custom_categories  s    r   zy, msg)r   r/   r   r   z'Found input variables with inconsistent)r   r/   r   z7Target type was inferred to be 'multiclass-multioutput'c                 C   sX   t g dgj}t }tjt|d |||  W d   dS 1 s%w   Y  dS )zCheck invalidate input.)r   r   r   matchN)r   rL   rN   r   r^   raises
ValueErrorrX   )r   msgr   r   r+   r+   r,   test_errors5  s
   "r   c                  C   s   t g dgj} t g d}tdd}tjttdd |	| | W d   n1 s0w   Y  |j
dks<J tdd	d
}|	| | |j
d	ksOJ dS )z@Check inferred and specified `target_type` on regression target.)r   r   r   r   r   r   )r1          @r2   r   r2   r   r/   rJ   zQThe least populated class in y has only 1 members, which is less than n_splits=2.r   Nr~   r<   )rJ   r:   )r   rL   rN   r   r^   warnsUserWarningreescaperX   rY   )r   r   r   r+   r+   r,   test_use_regression_targetH  s   
r   zy, feature_namesr/   AB   )A_1A_2A_3B_1B_2B_3)y1y2y3r   r   )A_y1A_y2A_y3B_y1B_y2B_y3c                 C   s   t d}|ddgd ddgd d}tddd	d
}|jdd tddd	d
}|jdd ||| }||| }t| | t|	 | t|	 |j
 dS )z*Check TargetEncoder works with set_output.pandasrz   r{   r   r   r/   )r   r   r2   r   rJ   r   rG   default)ra   N)r^   importorskip	DataFramer   
set_outputrX   r   to_numpyr   get_feature_names_outcolumns)r   feature_namespdX_dfenc_default
enc_pandas	X_defaultX_pandasr+   r+   r,   !test_feature_names_out_set_output]  s   
 r   	to_pandasTF)binary-ints
binary-strr<   c              
   C   s  t jddgddgddgddgddgddgddgddggt jd}|dkr9t g d}t |}tdddd}n,|d	krRt g d
}t |}tdddd}nt jg dt jd}|}tdddd}t |}g dddgg}t jddgddgddggt jd}	| rt	
d}
|
|dddf t jddgtd|dddf  d}|
|	dddf g dd}	n|}t j|t jd}t|D ]/\}}|||D ]$\}}|||f || }}t||t||}||||f  |||f< qqg }t|D ]\}}t|dd|f |t||}|| qt j|d d |d d g||d d g|d d |ggt jd}t|ddd}|||}t|| t|jdksRJ tdD ]}t|j| ||  qV||	}t|| dS )z,Check target encoder with multiple features.r   r   r/   r   r   )rz   r{   rz   rz   r{   r{   rz   r{   T)rG   rH   r   )r8   r0   r8   r8   r8   r0   r0   r0   )r2   gffffff@g333333@g      @gffffff@g      @皙$@g333333@r.   r8   r   r   Nr4   r5   )feat0feat1)r5   r4   r6   r}   )r   rL   rM   r   rX   r   float32r   r   r^   r   r   rS   rV   r   r   rW   r-   rZ   r   r   r   r[   r   ra   )r   r   r:   r   rk   	y_integerrJ   r!   rI   rh   r   rg   rm   r   r   rn   ro   rp   rq   r   rt   ru   r   rs   r   rv   r+   r+   r,   test_multiple_features_quick{  s|   6
	
 "
	

r   z	y, y_meang333333@r=   rz   )r<   r;   zbinary-string)ids)r   r           c           	      C   s   t dgd gj}|jd }td|dd}||| }t|t j|gg|dd |jd d t	
|ks7J |jt	
|ksAJ t dgdgg}||}t|t j|ggddd dS )z5Check edge case where feature and target is constant.r   r=   r   r/   r   r   N)r   rL   rN   r   r   rX   r   repeatr[   r^   r_   r]   ra   )	r   r!   r   r   rf   r   r   rh   X_test_transr+   r+   r,    test_constant_target_and_feature  s   

r   c                 C   s   d}d}t j| }|j|d}|jd||ddd}| }|| }|| }td| d}|||}td	d
}|||}	t	dd| d}
t
d| d}t|
|||d dk sZJ t|
|||d dk shJ t|
|	||d dksvJ d S )Nr>   i  rC   r   rK   r   T)rH   rG   F)rH   r   r=   )n_estimatorsmin_samples_leafrG   2   )rF   rG   r   皙?      ?)r   rP   rQ   normalrR   r`   argsortr   rX   r   r   r	   r   )rc   cardinalityrf   r   rk   rg   y_sorted_indicesrr   X_encoded_train_shuffledX_encoded_train_no_shuffled	regressorrJ   r+   r+   r,   Ftest_fit_transform_not_associated_with_y_if_ordinal_categorical_is_not  s.   
	r   c                  C   sv   t g dgj} t g d}tdddd}|| |}t|d t |dd	  t|d
 t |d	d  d	S )zECheck edge case with zero smoothing and cv does not contain category.)
r   r   r   r   r   r   r   r   r   r   )
g @g333333@g333333?g@r1   g      "@r   gffffff,@g*@g      .@r   Fr/   )r   rH   rJ   r   r   NrK   )r   rL   rN   r   rX   r   r   )r   r   r   r   r+   r+   r,   test_smooth_zero  s    r   )r   g     @@r   c                 C   s   t j|}|jdd}d}t|ddd|dd}t|||d	\}}}}	||}
|
|	t j
 }|
|	t j
 }t| |d
}|||}||}|||}||}t|| t|| d S )Ni  r   r>   averaged_inverted_cdfordinal)n_binsquantile_methodencoderK   r   rG   r   rG   )r   rP   rQ   r   r   rX   r`   r
   rU   astypeint32r   ra   r   )r   rc   r   r   r   r   rg   rh   rk   y_testpermutated_labelsX_train_permutedX_test_permutedrr   X_train_encodedX_test_encodedX_train_permuted_encodedX_test_permuted_encodedr+   r+   r,   3test_invariance_of_encoding_under_label_permutation)  s*   



r  r   c                 C   s  t dddd}d}tj|}||}d|| }d}t|dd	|d
|| dd}||}	|	|	tj
 }||}
|jtd| |dddd}tj||
|gdd}t||dd\}}}}|||}|||dk suJ |||dk sJ tt| |d|||}|d j}|||dksJ ||||dksJ ||d tjdddksJ t|dd  dk  sJ t| |d||}||}||}|||}|j}|||dksJ ||||dk sJ |t|d t|d k s	J d S )Ngư>lsqrF)alphasolverfit_interceptiP  g?d   r   rT   )r   r   strategyrG   rK   r   g?T)rC   replacer   r   r   r   r   r   g{Gz?)absg?gffffff?r/   )r   r   rP   rQ   randnr   rX   r`   rU   r   r   choiceintrO   r
   r   scorer   r   coef_r^   r_   r  allra   )r   rc   linear_regressionrf   r   r   noiser   X_informativer   
X_shuffledX_near_unique_categoriesr   rg   rh   rk   r   	raw_modelmodel_with_cvcoefrr   X_enc_no_cv_trainX_enc_no_cv_testmodel_no_cvr+   r+   r,   *test_target_encoding_for_linear_regressionM  sd   






"r!  c                  C   st   t jddd} | dd# | g dg dd}td	d
|dg |d  W d   dS 1 s3w   Y  dS )z
    Test target-encoder cython code when y is read-only.

    The numpy array underlying df["y"] is read-only when copy-on-write is enabled.
    Non-regression test for gh-27879.
    r   z2.0)
minversionzmode.copy_on_writeT)rz   r{   r{   )r   r9   r3   )xr   r<   )r:   r#  r   N)r^   r   option_contextr   r   r   )r   dfr+   r+   r,   test_pandas_copy_on_write  s
   "r&  ).r   numpyr   r^   numpy.testingr   r   sklearn.ensembler   sklearn.linear_modelr   sklearn.model_selectionr   r   r   r	   r
   sklearn.pipeliner   sklearn.preprocessingr   r   r   r   r-   markparametrizerL   rM   nanr   rS   rw   r   rN   r   r   r   r   r   r   r   r   r  r!  r&  r+   r+   r+   r,   <module>   s    	`h(


	
R	.
#
r