o
    \i                     @   s<  d dl Z d dlZd dlZd dlZd dlmZ d dlmZ d dlm	Z	 d dl
mZ d dlmZ d dlZd dlZd dlmZmZ d dlmZ d d	lmZ d d
lmZmZmZmZmZmZmZm Z  d dl!m"Z"m#Z#m$Z$ d dl%m&Z& d dl'm(Z( d dl)m*Z*m+Z+m,Z, d dl-m.Z.m/Z/m0Z0 dZ1dZ2e1e2 Z3dd Z4dd Z5dd Z6dd Z7dd Z8dd Z9ej:;deefdd  Z<d!d" Z=d#d$ Z>d%d& Z?d'd( Z@d)d* ZAd+d, ZBd-d. ZCd/d0 ZDd1d2 ZEd3d4 ZFd5d6 ZGd7d8 ZHd9d: ZId;d< ZJd=d> ZKd?d@ ZLdAdB ZMej:jNe.dCdDdEdF ZOdGdH ZPdIdJ ZQdKdL ZRdMdN ZSdOdP ZTej:;deefdQdR ZUdSdT ZVdUdV ZWdWdX ZXdYdZ ZYd[d\ ZZej:;deefd]d^ Z[d_d` Z\dadb Z]dcdd Z^dedf Z_dgdh Z`didj Zaej:;dkejbejcejdgdldm Zedndo Zfdpdq Zgdrds Zhdtdu Zidvdw Zjdxdy Zkdzd{ Zld|d} Zmd~d Zndd Zodd Zpej:;deeefdd Zqej:;dejrejsgdd Ztej:;dee/e0dd Zuej:;dejvejsdfejwejsdfejrejrdfejsejsdfgdd Zxej:;deddeddeddgdd Zydd Zzdd Z{e,ej:;de0dd Z|ej:;deeegdd Z}ej:;deeegej:;dde~dfdedfgdd Zej:;deeeegej:;ddd dd gej:;dddgdd Zej:;deeegdd Zej:;deeegej:;dddgddddddddf	ddd dddddddf	ddd dddddddf	dddd dddd dddf	ddddddd dddf	dgddȄ Zej:;deddd̜dddΜgfee1ffddЄ Zdd҄ ZddԄ Zej:;deeeegddք Zej:;de0dd؄ Zej:;dejrejsgddۄ Zdd݄ Zdd߄ ZdS )    N)defaultdict)Mapping)partial)StringIO)product)assert_array_almost_equalassert_array_equal)sparse)clone)ENGLISH_STOP_WORDSCountVectorizerHashingVectorizerTfidfTransformerTfidfVectorizerstrip_accents_asciistrip_accents_unicode
strip_tags)GridSearchCVcross_val_scoretrain_test_split)Pipeline)	LinearSVC)assert_allclose_dense_sparseassert_almost_equalskip_if_32bit)_IS_WASMCSC_CONTAINERSCSR_CONTAINERS)zthe pizza pizza beer copyrightzthe pizza burger beer copyrightz!the the pizza beer beer copyrightzthe burger beer beer copyrightzthe coke burger coke copyrightzthe coke burger burger)zthe salad celeri copyrightz)the salad salad sparkling water copyrightzthe the celeri celeri copyrightzthe tomato tomato salad waterz the tomato salad water copyrightc                 C   s   t |  S N)r   uppers r"   /var/www/www-root/data/www/176.119.141.140/sports-predictor/venv/lib/python3.10/site-packages/sklearn/feature_extraction/tests/test_text.py	uppercase9      r$   c                 C   s   |  ddS )N   ée)replacer    r"   r"   r#   strip_eacute=   r%   r)   c                 C      |   S r   splitr    r"   r"   r#   split_tokenizeA      r-   c                 C   s   dgS )Nthe_ultimate_featurer"   r    r"   r"   r#   lazy_analyzeE   s   r0   c                  C   s   d} d}t | |ksJ d} d}t | |ksJ d} d}t | |ks$J d} d}t | |ks0J d	} d
}t | |ks<J d} d}t | |ksHJ d} d
}t | |ksTJ d S )N   àáâãäåçèéêëaaaaaaceeee   ìíîïñòóôõöùúûüýiiiinooooouuuuy   إu   ا   this is à testthis is a testu   öou   ̀́̂̃ u   ȫ)r   aexpectedr"   r"   r#   test_strip_accentsI   s*   r=   c                  C   sd   d} d}t | |ksJ d} d}t | |ksJ d} d}t | |ks$J d} d}t | |ks0J d S )	Nr1   r2   r3   r4   r5   r9   r6   r7   )r   r:   r"   r"   r#   test_to_asciim   s   r>   
Vectorizerc                 C   s   | dd  }d}g d}|||ksJ d}g d}|||ks#J | dd  }td	}g d
}|||ks:J | td  }d}g d}|||ksOJ | tdd  }d}g d}|||kseJ d S )Nasciistrip_accents:   J'ai mangé du kangourou  ce midi, c'était pas très bon.)
aimangedu	kangouroucemidietaitpastresbonz0This is a test, really.

 I met Harry yesterday.)thisistestreallymetharry	yesterdayfile)input'This is a test with a file-like object!)rN   rO   rP   withrU   likeobjectpreprocessoru;   J'ai mangé du kangourou  ce midi,  c'était pas très bon.)
AIMANGEDU	KANGOUROUCEMIDIETAITPASTRESBON)	tokenizerrB   )
zj'airE   rF   rG   rH   zmidi,zc'etaitrK   rL   zbon.)build_analyzerr   r$   r-   )r?   watextr<   r"   r"   r#   test_word_analyzer_unigrams   s&   rk   c                  C   s2   t dddd } d}g d}| ||ksJ d S )Nwordunicode      analyzerrB   ngram_rangerC   )rD   rE   rF   rG   rH   rI   rJ   rK   rL   rM   zai mangezmange duzdu kangourouzkangourou cezce midiz
midi etaitz	etait paszpas tresztres bon)r   rh   )ri   rj   r<   r"   r"   r#   'test_word_analyzer_unigrams_and_bigrams   s   rt   c                  C   s   d} |  d}tddd }tt || W d    n1 s#w   Y  tdddd }tt || W d    d S 1 sFw   Y  d S )	NrC   zutf-8rn   r@   )rs   encodingchar      )rr   rs   ru   )encoder   rh   pytestraisesUnicodeDecodeError)rj   
text_bytesri   car"   r"   r#   test_unicode_decode_error   s   


"r   c                  C   s   t dddd } d}g d}| |d d |ksJ g d}| |d	d  |ks+J d
}g d}| |d d |ks=J g d}| |d	d  |ksMJ t dddd } td}g d}| |d d |ksjJ d S )Nrv   rm   rw   rq   u9   J'ai mangé du kangourou  ce midi, c'était pas très bon)zj'az'aizai zi mz ma   )zs tresz tres ztres bzres bozes bon1This 
	is a test, really.

 I met Harry yesterday)thihisis zs iz is)z yesteyesteresterdsterdaterdayrU   rV   rr   rs   rW   r   rh   r   cngarj   r<   r"   r"   r#   test_char_ngram_analyzer   s.   r   c                  C   s   t dddd } d}g d}| |d d |ksJ g d}| |d	d  |ks+J t d
ddd } td}g d}| |d d |ksHJ d S )Nchar_wbrm   rw   rq   r   )z thr   r   r   z thir   )r   r   r   r   zerday r   rU   r   zA test with a file-like object!)z a z tetesestzst z tesry   r   r   r"   r"   r#   test_char_wb_ngram_analyzer  s$   r   c                  C   s   t dddd } d}g d}| |d d |ksJ g d}| |d	d  |ks+J t d
ddd }t|}||| |ksBJ d S )Nrl   rm   rw   rq   r   )zthis is testzis test reallyztest really metrx   )ztest really met harry yesterdayzthis is test really met harryz"is test really met harry yesterdayrU   r   r   )r   rj   r<   	cnga_filerU   r"   r"   r#   test_word_ngram_analyzer  s"   r   c                  C   s   ddd} t |  }ttttttfD ]O}|| }t|d}|	t
 t|tr1|j| ks0J n	t |j|ks:J |t
}|jd t|ksJJ || }t|d}||}t||jd kscJ qd S )Nr   ro   pizzabeer
vocabulary)setkeysdictlistiterr   r   intr   fitJUNK_FOOD_DOCS
isinstancer   vocabulary_	transformshapeleninverse_transform)vocabtermstypvvectXinvr"   r"   r#   &test_countvectorizer_custom_vocabulary6  s    






r   c                  C   sd   ddg} t dt| dfdt fg}|t}t|jd jt| ks%J |jd t	| ks0J d S )Nr   r   countr   tfidfro   )
r   r   r   fit_transformALL_FOOD_DOCSr   named_stepsr   r   r   )what_we_likepiper   r"   r"   r#   /test_countvectorizer_custom_vocabulary_pipelineK  s   
r   c                  C   sX   ddd} d}t jt|d t| d}|dg W d    d S 1 s%w   Y  d S )Nr   r   z$Vocabulary contains repeated indicesmatchr   pasta_sizilianar{   r|   
ValueErrorr   r   )r   msgr   r"   r"   r#   7test_countvectorizer_custom_vocabulary_repeated_indicesX  s   

"r   c                  C   sT   ddd} t jtdd t| d}|dg W d    d S 1 s#w   Y  d S )Nro   rp   r   zdoesn't contain indexr   r   pasta_verdurar   r   r   r"   r"   r#   0test_countvectorizer_custom_vocabulary_gap_index`  s
   

"r   c                  C   s   t  } | jdd |  tksJ | jdd tt |   W d    n1 s+w   Y  | jdd tt |   W d    n1 sJw   Y  g d}| j|d |  t|kscJ d S )Nenglish
stop_words_bad_str_stop__bad_unicode_stop_)someotherwords)r   
set_paramsget_stop_wordsr   r{   r|   r   r   )cvstoplistr"   r"   r#   test_countvectorizer_stop_wordsg  s   

r   c                  C   s   t jtdd tg d} | dg W d    n1 sw   Y  t jtdd tddd}|g d W d    d S 1 sBw   Y  d S )	Nzempty vocabularyr   r   foo      ?r   )max_dfr   )zto be or not to bez
and me toozand so do your   )r   r   r"   r"   r#   %test_countvectorizer_empty_vocabularyv  s   
"r   c                  C   sF   t  } | td d }| tdd  }|jd |jd ks!J d S )Nr   ro   )r   r   r   r   )r   X1X2r"   r"   r#   test_fit_countvectorizer_twice  s   r   c                  C   s>   g d} d}t |d}||  g d}| }t|| dS )zCheck `get_feature_names_out()` when a custom token pattern is passed.
    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/12971
    z&This is the 1st document in my corpus.z This document is the 2nd sample.zAnd this is the 3rd one.zIs this the 4th document?z'[0-9]{1,3}(?:st|nd|rd|th)\s\b(\w{2,})\btoken_pattern)documentonesampleN)r   r   get_feature_names_outr   )corpusr   
vectorizerr<   feature_names_outr"   r"   r#   )test_countvectorizer_custom_token_pattern  s   

r   c                  C   sX   g d} d}d}t |d}tjt|d ||  W d   dS 1 s%w   Y  dS )zCheck that we raise an error if token pattern capture several groups.
    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/12971
    r   z)([0-9]{1,3}(?:st|nd|rd|th))\s\b(\w{2,})\bz,More than 1 capturing group in token patternr   r   Nr   r{   r|   r   r   )r   r   err_msgr   r"   r"   r#   <test_countvectorizer_custom_token_pattern_with_several_group  s   
"r   c                  C   s   g d} d}t d| d}tjt|d ||  W d    n1 s#w   Y  t  tdt ||  W d    d S 1 sCw   Y  d S )N)SampleUpperCase
VocabularyzyUpper case characters found in vocabulary while 'lowercase' is True. These entries will not be matched with any documentsT)	lowercaser   r   error)	r   r{   warnsUserWarningr   warningscatch_warningssimplefilterr   )r   messager   r"   r"   r#   'test_countvectorizer_uppercase_in_vocab  s   
"r   c                  C   sH   g dg dg dg} t ddd| }g d}||}t|| dS )	z0Check get_feature_names_out for TfidfTransformerro   ro   ro   ro   ro   r   ro   r   r   Tl2
smooth_idfnorm)r;   cbN)r   r   r   r   )r   trfeature_names_inr   r"   r"   r#   %test_tf_transformer_feature_names_out  s
   
r   c                  C   s   g dg dg dg} t ddd}||  }|dk s J t|d jd	d
g d g dg dg dg} t ddd}||  }|dk sMJ d S )Nr   r   r   Tr   r   r   rp   ro   axisr   r   r   )r   r   toarrayallr   sumr   r   r   r"   r"   r#   test_tf_idf_smoothing  s   r  zcno floating point exceptions, see https://github.com/numpy/numpy/pull/21895#issuecomment-1311525881)reasonc                  C   s   g dg dg dg} t ddd}||  }|dk s J t|d jd	d
g d g dg dg dg} t ddd}d}tjt|d ||   W d    d S 1 sZw   Y  d S )Nr   r   r   Fr   r   r   rp   ro   r   r  zdivide by zeror   )	r   r   r  r  r   r  r{   r   RuntimeWarning)r   r   r   in_warning_messager"   r"   r#   test_tfidf_no_smoothing  s   "r
  c                  C   s   dgdgdgg} t ddd d}||  }|d dksJ |d |d ks(J |d |d ks2J |d dk s:J |d dk sBJ d S )Nro   rp   rx   TF)sublinear_tfuse_idfr   r   )r   r   r  r  r"   r"   r#   test_sublinear_tf  s   r  c                  C   s  t td d } td g}ttd }tdd}|| }t|dr&| }|d|jd f dks3J t|jd	}||fD ]s}||}t|drM| }|j}|d|d
 f dks\J |d|d f dkshJ |d|d f dkstJ d|vszJ d|vsJ |d|d f dksJ |d|d f dksJ |d|d f dksJ |d|d f dksJ q=t	dd}	|	
|| }
t|	jt|jksJ |
j|t|jfksJ |	| }|jt|t|jfksJ t	ddd}|
|| }t|drJ t	dd}tt || W d    n	1 s w   Y  ttj|dddg|  t td d } tdd}|j|_||  }|jrPJ t|
| || }t|| td d	}tt ||  W d    n	1 s|w   Y  |jddd | }d}t|}||}||ksJ |jdd d tt |  W d    n	1 sw   Y  d |_tt |  W d    d S 1 sw   Y  d S )!Nro         ?r   tocsrr   r   rp   r   saladtomatowaterthe	copyrightcokeburgerr   l1r   F)r   r  idf_Tr  r   r   r@   )rB   r   rC   _gabbledegook_)rB   r\   _invalid_analyzer_type_)r   r   r   r   r   hasattrr  r   r   r   r   r  r  r   r{   r|   r   r   npr  r   r   fixed_vocabulary_r   build_preprocessorr   rh   )
train_data	test_datan_trainv1counts_trainv2r   counts_testr   t1r   
tfidf_testt2tft3tvtfidf2tfidf_test2v3	processorrj   r<   resultr"   r"   r#   test_vectorizer  s~   













$r5  c                  C   s  d\} }}}t | |||d}|t |jj| ksJ |jj|ks#J |jj|ks+J |jj|ks3J d|_d|_d|_d|_|jj| ksGJ |jj|ksOJ |jj|ksWJ |jj|ks_J |t |jj|jksmJ |jj|jksvJ |jj|jksJ |jj|jksJ d S )N)r   FFF)r   r  r   r  r  T)r   r   r   _tfidfr   r  r   r  )r   r  r   r  r/  r"   r"   r#   test_tfidf_vectorizer_settersi  s,   

r7  c                  C   sv  t  } | t}|j}|jtt| jfksJ |j| jksJ t	|j
dks)J t	|j
dk s3J t|j
dks=J t|j
dk sGJ t|jd D ]}ttj|d j
dd qNt ddd} | t}|jtt| jfksuJ |j| jks}J |j}||ksJ |d| k sJ t	|j
dksJ t|j
dk sJ t|jd D ]}ttj|d j
dd qd S )	Nr  r   ro   rp   r   rn   r  )rs   r   )r   r   r   nnzr   r   
n_featuresdtyper   mindatamaxranger   linalgr   )r   r   	token_nnzi
ngrams_nnzr"   r"   r#   test_hashing_vectorizer  s.   

rC  c                  C   s2  t dd} tt |   W d    n1 sw   Y  | jr#J | t}|j\}}t	| j
|ks6J |  }t|tjsBJ |jtksIJ t	||ksQJ tg d| t|D ]\}}|| j
|ksjJ q\g d}t |d} |  }tg d| | jsJ t|D ]\}}|| j
|ksJ qd S )Nr  r  	r   r  celerir  r   r  	sparklingr  r  r   )r   r{   r|   r   r   r!  r   r   r   r   r   r   r   ndarrayr:  rZ   r   	enumerateget)r   r   	n_samplesr9  feature_namesidxnamer   r"   r"   r#   test_feature_names  s:   






rN  c                 C   s4   h d}| ddd}| t t|j|ksJ d S )N>   r   r   r  r  g333333?   )r   max_features)r   r   r   r   )r?   expected_vocabularyr   r"   r"   r#   test_vectorizer_max_features  s   
rR  c            	      C   s   t dd} t dd}t d d}| tjdd}|tjdd}|tjdd}|  }| }| }d| ks>J d| ksFJ d| ksNJ d|t| ksYJ d|t| ksdJ d|t| ksoJ d S )Nro   rP  rx   r   r      r  )r   r   r   r  r   r=  r   argmax)	cv_1cv_3cv_Nonecounts_1counts_3counts_None
features_1
features_3features_Noner"   r"   r#   "test_count_vectorizer_max_features  s   


r_  c                  C   s   g d} t ddd}||  d|j v sJ t|j dks#J d|_||  d|j vs4J t|j dks?J d	|_||  d|j vsPJ t|j dks[J d S )
Nabcdeaeatrv   r   rr   r   r;   ry   r  rO  ro   )r   r   r   r   r   r   r$  r   r"   r"   r#   test_vectorizer_max_df     


rf  c                  C   s   g d} t ddd}||  d|j v sJ t|j dks#J d|_||  d|j vs4J t|j dks?J d	|_||  d|j vsPJ t|j dks[J d S )
Nr`  rv   ro   )rr   min_dfr;   ry   rp   r   g?)r   r   r   r   r   rh  re  r"   r"   r#   test_vectorizer_min_df)  rg  ri  c                  C   s   ddg} t ddd}||  }tg d|  tg dg dg| t ddd	d
}||  }tg dg dg| t ddd	tjd}|| }|jtjksTJ d S )Naaabcabbderv   r   rd  )r;   r   r   dr'   )rx   ro   ro   r   r   )ro   rp   r   ro   ro   T)rr   r   binary)ro   ro   ro   r   r   )ro   ro   r   ro   ro   )rr   r   rm  r:  )r   r   r  r   r   r   float32r:  )r$  r   r   X_sparser"   r"   r#   test_count_binary_occurrences;  s   
rp  c                  C   s   ddg} t ddd d}|| }t|dd jdksJ t|dd	 jd	ks,J |jtjks4J t ddd
d d}|| }t|jdksKJ |jtjksSJ t ddd
d tjd}|| }|jtjksjJ d S )Nrj  rk  Frv   )alternate_signrr   r   r   ro   rx   rp   T)rr   rq  rm  r   )rr   rq  rm  r   r:  )r   r   r   r=  r<  r:  float64)r$  r   r   r"   r"   r#   test_hashed_binary_occurrencesO  s"   


rs  c                 C   s  t }|  }||}||}t|tsJ | }t||D ]\}}tt	||}tt	|}t
|| qt|sBJ |jdksIJ | }	||	}
t||
D ]\}}t
t|t| qW| }||}t||D ]\}}t
t|t| qud S )Ncsr)r   r   r   r   r   rh   zipr   sortuniquer   r	   issparseformatr  tocsc)r?   r<  r   transformed_datainversed_dataanalyzedocinversed_termsr   transformed_data2inversed_data2terms2transformed_data3inversed_data3terms3r"   r"   r#   !test_vectorizer_inverse_transformi  s*   



r  c                  C   s   t t } dgtt  dgtt  }t| |ddd\}}}}tdt fdt fg}dd	gd
d}t||ddd}|||	|}	t
|	| |jdksNJ |jjd }
|
jdks[J d S )Nr  ro   g?r   	test_sizerandom_stater   svcro   ro   rn   hingesquared_hinge)vect__ngram_range	svc__lossrx   )n_jobsr   r   )r   NOTJUNK_FOOD_DOCSr   r   r   r   r   r   r   predictr   best_score_best_estimator_r   rs   r<  targetr#  r$  target_traintarget_testpipeline
parametersgrid_searchpredbest_vectorizerr"   r"   r#   -test_count_vectorizer_pipeline_grid_selection  s   
r  c                  C   s   t t } dgtt  dgtt  }t| |ddd\}}}}tdt fdt fg}dd	gd
dd}t||dd}|||	|}	t
|	| |jdksNJ |jjd }
|
jdks[J |
jdksbJ |
jrgJ d S )Nr  ro   g?r   r  r   r  r  rn   )r  r   r  )r  
vect__normr  )r  r   r   )r   r  r   r   r   r   r   r   r   r  r   r  r  r   rs   r   r!  r  r"   r"   r#   'test_vectorizer_pipeline_grid_selection  s$   
r  c                  C   s^   t t } dgtt  dgtt  }tdt fdt fg}t|| |dd}t|g d d S )Nr  ro   r   r  rx   )r   r  )r   r  r   r   r   r   r   r   )r<  r  r  	cv_scoresr"   r"   r#   )test_vectorizer_pipeline_cross_validation  s
   r  c                  C   sx   d} t  }|| g}|jdksJ td dd}|| g}|jdks%J |j|jks-J tt|j	t|j	 d S )Nu   Машинное обучение — обширный подраздел искусственного интеллекта, изучающий методы построения алгоритмов, способных обучаться.)ro      F)r   rq  )ro   i   )
r   r   r   r   r   r8  r   r   rv  r<  )r   r   	X_countedX_hashedr"   r"   r#   test_vectorizer_unicode  s   r  c                  C   sF   ddg} t | d}|t}|t}t| |  |js!J d S )Nr   rE  r   )r   r   r   r   r   r  r!  )r   r   X_1X_2r"   r"   r#   +test_tfidf_vectorizer_with_fixed_vocabulary  s   


r  c                  C   s   t  t ddt ddt ddt ttdttdttdtttd	tt ttdt tg} | D ]*}t	|}t
|}t||jksJJ | | ksTJ t|t|t q5d S )
Nr  r  T)rm  rn   rs   r[   )rr   rA   )r   r   r   r0   r   r   r)   r   pickledumpsloadstype	__class__
get_paramsr   r   )	instancesorigr!   copyr"   r"   r#   test_pickling_vectorizer  s.   


r  factoryc                 C   sB   t  }| |}d}tt|}||}||}||ksJ dS )z_Tokenizers cannot be pickled
    https://github.com/scikit-learn/scikit-learn/issues/12833
    rC   N)r   r  r  r  )r  vecfunctionrj   roundtripped_functionr<   r4  r"   r"   r#   test_pickling_built_processors  s   r  c                  C   s   t jd} t g d}tddD ],}t| j|ddd}t|d}t	t
|}|t |t t| |  qd S Nr   rD  d   r   F)sizer(   r   )r   randomRandomStatearrayr>  r   choicer   r  r  r  r   r   r   r   )rngvocab_wordsx	vocab_setr   unpickled_cvr"   r"   r#   -test_countvectorizer_vocab_sets_when_pickling3  s   


r  c                  C   s   t jd} t g d}tddD ];}t }| j|ddd}tddD ]}|||| < q$t|d}t	t
|}|t |t t| |  qd S r  )r   r  r  r  r>  r   r  r   r  r  r  r   r   r   r   )r  r  r  
vocab_dictr   yr   r  r"   r"   r#   .test_countvectorizer_vocab_dicts_when_picklingO  s"   


r  c                  C   s`   t  t} t | }t|}t|}t||j	ksJ t
||  ||   d S r   )r   r   r   r   r   r  r  r  r  r  r   r  )r   r  r!   r  r"   r"   r#   test_pickling_transformerl  s   

"r  c                  C   sH   t  t} t | }t }|j|_t||  ||   d S r   )	r   r   r   r   r   r  r   r   r  )r   r  r  r"   r"   r#   test_transformer_idf_setteru  s
   "r  c                  C   s   t dd} | t t | jdd}| j|_t|t | t  t | jdd}d}tj	t
|d | j|_W d    d S 1 sDw   Y  d S )NTr  r   r  Fz+`idf_` cannot be set when `user_idf=False`.r   )r   r   r   r   r  r   r   r  r{   r|   r   )r  r  r   r"   r"   r#   test_tfidf_vectorizer_setter}  s   


"r  c                  C   sv   t dd} | t t | jdd}t| j}dg|d  }tt t	|d| W d    d S 1 s4w   Y  d S )NTr  r  r   ro   r  )
r   r   r   r   r   r  r{   r|   r   setattr)r   r  expected_idf_leninvalid_idfr"   r"   r#   %test_tfidfvectorizer_invalid_idf_attr  s   


"r  c                  C   sL   g d} t | d}tt |g  W d    d S 1 sw   Y  d S )N)r;   r   r   r;   r;   r   r   r   r"   r"   r#   test_non_unique_vocab  s
   
"r  c                  C   sJ   d} t }dd }tj|| d |  W d    d S 1 sw   Y  d S )Nz?np.nan is an invalid document, expected byte or unicode string.c                  S   s   t  } | dtjdg d S )Nhello worldhello hello)r   r   r   nan)hvr"   r"   r#   func  s   z0test_hashingvectorizer_nan_in_docs.<locals>.funcr   )r   r{   r|   )r   	exceptionr  r"   r"   r#   "test_hashingvectorizer_nan_in_docs  s   "r  c                  C   sd   t ddd d} | jsJ | ddg }t| g d | ddg }t| g d d S )NTF)rm  r  r   r  r  )ro   ro   ro   r   )r   rm  r   r  r   ravelr   )r   r   r   r"   r"   r#   test_tfidfvectorizer_binary  s   
r  c                  C   s(   t dd} | t t| j| jj d S )NTr  )r   r   r   r   r  r6  )r   r"   r"   r#   test_tfidfvectorizer_export_idf  s   

r  c                  C   s<   t dgd} t| }| t |t |j| jksJ d S )Nr  r   )r   r
   r   r   r   )
vect_vocabvect_vocab_cloner"   r"   r#   test_vectorizer_vocab_clone  s
   

r  c                 C   s   d}|  }t jt|d |d W d    n1 sw   Y  t jt|d |d W d    n1 s8w   Y  |ddg t jt|d |d W d    d S 1 s\w   Y  d S )NzBIterable over raw text documents expected, string object received.r   zhello world!	some textzsome other text)r{   r|   r   r   r   r   )r?   r   r  r"   r"   r#   &test_vectorizer_string_object_as_input  s   "r  X_dtypec                 C   s2   t jdd| dd}t |}|j|jksJ d S N
    N  *   r:  r  )r	   randr   r   r:  )r  r   X_transr"   r"   r#   test_tfidf_transformer_type  s   r  zcsc_container, csr_containerc                 C   sZ   t jddtjdd}| |}||}t |}t |}t|| |j|jks+J d S r  )r	   r  r   rr  r   r   r   ry  )csc_containercsr_containerr   X_cscX_csrX_trans_cscX_trans_csrr"   r"   r#   test_tfidf_transformer_sparse  s   
r  z0vectorizer_dtype, output_dtype, warning_expectedTFc                 C   s   t g d}t| d}d}|r-tjt|d ||}W d    n1 s'w   Y  nt  t	dt ||}W d    n1 sGw   Y  |j
|ksSJ d S )N)numpyscipysklearnr:  z'dtype' should be used.r   r   )r   r  r   r{   r   r   r   r   r   r   r:  )vectorizer_dtypeoutput_dtypewarning_expectedr   r   warning_msg_matchX_idfr"   r"   r#   test_tfidf_vectorizer_type  s   


r  r  )rp   ro   r  c                 C   s   | j }td| d}tjt|d | dg W d    n1 s$w   Y  tjt|d | dg W d    n1 sAw   Y  t| t	rktjt|d | 
dg W d    d S 1 sdw   Y  d S d S )NzInvalid value for ngram_range=z/ lower boundary larger than the upper boundary.r   zgood news everyone)rs   reescaper{   r|   r   r   r   r   r   r   )r  invalid_ranger   r"   r"   r#   $test_vectorizers_invalid_ngram_range  s   

"r   c                 C   s&   |   }|  }|  }| |||S r   )r   build_tokenizerr"  _check_stop_words_consistency)	estimatorr   tokenize
preprocessr"   r"   r#   r     s   r  c               	   C   s   d} d|  }t  t t fD ]1}|jg dd tjt|d |dg W d    n1 s0w   Y  |`t	|du s?J qt
  t
dt |dg W d    n1 s[w   Y  t	|d u shJ |jg d	d tjt|d |dg W d    d S 1 sw   Y  d S )
Nz\['and', 'll', 've'\]z}Your stop_words may be inconsistent with your preprocessing. Tokenizing the stop words generated tokens %s not in stop_words.)you'veyouyou'llANDr   r   r  Fr   )r  r  r  blahr	  )r   r   r   r   r{   r   r   r   _stop_words_idr  r   r   r   )lstrr   r  r"   r"   r#   'test_vectorizer_stop_words_inconsistent'  s*   
"r  r  c                 C   s^   | dt jd}t j}|j||_|j||_dddd}t ||}||jjks-J dS )z
    Check that CountVectorizer._sort_features preserves the dtype of its sparse
    feature matrix.

    This test is skipped on 32bit platforms, see:
        https://github.com/scikit-learn/scikit-learn/pull/11295
    for more details.
    )r   r   r  r   ro   rp   )zscikit-learnrO   zgreat!N)r   int64indicesastypeindptrr   _sort_featuresr:  )r  r   INDICES_DTYPEr   Xsr"   r"   r#   7test_countvectorizer_sort_features_64bit_sparse_indicesB  s   r  	Estimatorc                 C   s   ddig}|  }t |du sJ | dd dgd}t |dks!J t |d u s)J || G d	d
 d
| }|dgd}t |dksDJ | dd dgd}t |du sUJ d S )Nrj   r  Tc                 S      | d S Nrj   r"   r  r"   r"   r#   <lambda>e      z?test_stop_word_validation_custom_preprocessor.<locals>.<lambda>and)r\   r   r   c                   @   s   e Zd Zdd ZdS )zFtest_stop_word_validation_custom_preprocessor.<locals>.CustomEstimatorc                 S   s   dd S )Nc                 S   r  r  r"   r  r"   r"   r#   r  m  r  zktest_stop_word_validation_custom_preprocessor.<locals>.CustomEstimator.build_preprocessor.<locals>.<lambda>r"   )selfr"   r"   r#   r"  l  r.   zYtest_stop_word_validation_custom_preprocessor.<locals>.CustomEstimator.build_preprocessorN)__name__
__module____qualname__r"  r"   r"   r"   r#   CustomEstimatork  s    r!  r   c                 S   s   t d| S )Nz\w{1,})r  compilefindallr~  r"   r"   r#   r  s  s    )rg   r   )r  r   )r  r<  r  r!  r"   r"   r#   -test_stop_word_validation_custom_preprocessor\  s   


r%  zinput_type, err_type, err_msgfilenamer9   rU   z$'str' object has no attribute 'read'c                 C   sP   dg}t j||d | dd |d| W d    d S 1 s!w   Y  d S )N"this is text, not file or filenamer   c                 S   r*   r   r+   r  r"   r"   r#   r    r  z.test_callable_analyzer_error.<locals>.<lambda>rr   rV   )r{   r|   r   )r  
input_typeerr_typer   r<  r"   r"   r#   test_callable_analyzer_errorx  s   "r+  rr   c                 C   s
   t | dS )Nr)openr$  r"   r"   r#   r    s   
 r  c                 C   r*   r   )readr$  r"   r"   r#   r    r  r)  c                 C   sL   dg}t ttf | ||d| W d    d S 1 sw   Y  d S )Nr'  r(  )r{   r|   FileNotFoundErrorAttributeErrorr   )r  rr   r)  r<  r"   r"   r#   &test_callable_analyzer_change_behavior  s   "r1  c                 C   sd   dd }|  d}|d tjtdd ||dd|g W d    d S 1 s+w   Y  d S )	Nc                 S   s   t d)Ntesting)	Exceptionr$  r"   r"   r#   rr     r.   z6test_callable_analyzer_reraise_error.<locals>.analyzerzfile.txtzsample content
r2  r   rU   r(  )joinwriter{   r|   r3  r   )tmpdirr  rr   fr"   r"   r#   $test_callable_analyzer_reraise_error  s   

"r8  zjstop_words, tokenizer, preprocessor, ngram_range, token_pattern,analyzer, unused_name, ovrd_name, ovrd_msgr  r  r  rv   z'stop_words'
'analyzer'	!= 'word'c                 C   r*   r   r+   r    r"   r"   r#   r    r  z'tokenizer'c                 C   r*   r   r+   r    r"   r"   r#   r    r  \w+rl   'token_pattern'zis not Nonec                 C   r*   r   r   r    r"   r"   r#   r    r  c                 C   r*   r   r=  r    r"   r"   r#   r    r  z'preprocessor'zis callablern   c                 C   r*   r   r=  r    r"   r"   r#   r    r  z'ngram_range')	NNNr  r;  rv   r<  r9  r:  c
                 C   sl   t }
|  }|j||||||d d|||	f }tjt|d ||
 W d    d S 1 s/w   Y  d S )N)r   rg   r\   rs   r   rr   z-The parameter %s will not be used since %s %sr   )r   r   r{   r   r   r   )r?   r   rg   r\   rs   r   rr   unused_name	ovrd_nameovrd_msgr#  r   r   r"   r"   r#   test_unused_parameters_warn  s$   Y"rA  zVectorizer, Xro   rp   )r   barrx   )r   bazc                 C   s0   |  }t |dr
J || t |drJ d S )Nn_features_in_)r  r   )r?   r   r   r"   r"   r#   test_n_features_in  s   	
rE  c                  C   s:   t dd} | ddgj}| ddgj}||ksJ d S )Nro   rS  helloworld)r   r   r   )r  vocab1vocab2r"   r"   r#   )test_tie_breaking_sample_order_invariance%  s   
rJ  c                  C   s.   t ddd} | dgj}|d dksJ d S )Ni@B )rp   rx   )r9  rs   z22pcs efuturer   )r   r   r  )hashingr  r"   r"   r#   2test_nonnegative_hashing_vectorizer_result_indices.  s   rL  c                 C   s   |  }t |dr
J dS )z0Check that vectorizers do not define set_output.
set_outputN)r  )r  r   r"   r"   r#   'test_vectorizers_do_not_have_set_output5  s   rN  c                 C   s   t jddtjdd}| |}| }t |}|j|dd}t|| ||us*J |j|dd}||u s7J t	
t t|| W d   dS 1 sMw   Y  dS )	zJCheck the behaviour of TfidfTransformer.transform with the copy parameter.r  r  r  r  T)r  FN)r	   r  r   rr  r  r   r   r   r   r{   r|   AssertionError)r  r   r  X_csr_originaltransformerX_transformr"   r"   r#   test_tfidf_transformer_copy>  s   
"rS  r:  c                 C   s6   dd t dD }t| d|}|jj| ksJ dS )zCheck that `idf_` has the same dtype as the input data.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/30016
    c                 S   s   g | ]}t t qS r"   )struuiduuid4).0rA  r"   r"   r#   
<listcomp>Z  s    z<test_tfidf_vectorizer_perserve_dtype_idf.<locals>.<listcomp>i r  N)r>  r   r   r  r:  )r:  r   r   r"   r"   r#   (test_tfidf_vectorizer_perserve_dtype_idfS  s   rY  c                  C   s   t  } |  }|jrJ dS )z7Test that HashingVectorizer has requires_fit=False tag.N)r   __sklearn_tags__requires_fit)r   tagsr"   r"   r#   (test_hashing_vectorizer_requires_fit_tag_  s   r]  c                  C   s.   t dd} ddg}| |}|jdksJ dS )z:Test that HashingVectorizer can transform without fitting.r  )r9  zThis is testzAnother test)rp   r  N)r   r   r   )r   r   r4  r"   r"   r#   -test_hashing_vectorizer_transform_without_fitf  s   

r^  )r  r  rU  r   collectionsr   collections.abcr   	functoolsr   ior   	itertoolsr   r  r   r{   numpy.testingr   r   r  r	   sklearn.baser
   sklearn.feature_extraction.textr   r   r   r   r   r   r   r   sklearn.model_selectionr   r   r   sklearn.pipeliner   sklearn.svmr   sklearn.utils._testingr   r   r   sklearn.utils.fixesr   r   r   r   r  r   r$   r)   r-   r0   r=   r>   markparametrizerk   rt   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r  xfailr
  r  r5  r7  rC  rN  rR  r_  rf  ri  rp  rs  r  r  r  r  r  r  r  rh   r"  r  r  r  r  r  r  r  r  r  r  r  r  r  r  rn  rr  r  r  int32r  r  r   r  r  r  r%  r/  r0  r+  paramr1  r8  rA  rE  rJ  rL  rN  rS  rY  r]  r^  r"   r"   r"   r#   <module>   s   (
	$
=

g&G
	
$'

	






	






J 
	


