
    Ug                     T   d dl Z d dlZd dlZd dlmZ d dlmZ d dlmZ d dl	m
Z
 d dlmZ d dlZd dlZd dlmZmZ d dlmZ d d	lmZ d d
lmZmZmZmZmZmZmZmZ d dl m!Z!m"Z"m#Z# d dl$m%Z% d dl&m'Z' d dl(m)Z)m*Z*m+Z+m,Z, d dl-m.Z.m/Z/m0Z0m1Z1 dZ2dZ3e2e3z   Z4d Z5d Z6d Z7d Z8d Z9d Z:ej;        <                    deef          d             Z=d Z>d Z?d Z@d ZAd ZBd ZCd  ZDd! ZEd" ZFd# ZGd$ ZHd% ZId& ZJd' ZKd( ZLd) ZMd* ZNej;        O                    e/d+,          d-             ZPd. ZQd/ ZRd0 ZSe+d1             ZTd2 ZUej;        <                    deef          d3             ZVd4 ZWd5 ZXd6 ZYd7 ZZe+d8             Z[ej;        <                    deef          d9             Z\d: Z]d; Z^d< Z_e+d=             Z`d> Zad? Zbej;        <                    d@ejc        ejd        eje        g          dA             ZfdB ZgdC ZhdD ZidE ZjdF ZkdG ZldH Zme+dI             ZndJ ZodK ZpdL Zqej;        <                    deeef          dM             Zrej;        <                    dNejs        ejt        g          dO             Zuej;        <                    dP ee0e1                    dQ             Zvej;        <                    dRejw        ejt        dSfejx        ejt        dSfejs        ejs        dTfejt        ejt        dTfg          dU             Zyej;        <                    dV edWX           edWX           edWX          g          dY             ZzdZ Z{e+d[             Z|e,ej;        <                    d\e1          d]                         Z}e+ej;        <                    d^eeeg          d_                         Z~ej;        <                    d^eeeg          ej;        <                    d`daedbfdceddfg          de                         Zej;        <                    d^ee ej        ee+f          g          ej;        <                    dgdh di g          ej;        <                    djdcdag          dk                                     Zej;        <                    d^eeeg          dl             Zej;        <                    deeeg          ej;        <                    dmdndogdddpddqdrdsdtf	ddu ddpddqdvdsdtf	ddw ddpdxdydzdvd{f	ddd| dpdxd} d~dsdf	dddddd ddsdf	dg          d                         Zej;        <                    deddddddgfee2ff          d             Zd Ze+d             Zej;        <                    d^eeeeg          d             Zej;        <                    d\e1          d             ZdS )    N)defaultdict)Mapping)partial)StringIO)product)assert_array_almost_equalassert_array_equal)sparse)clone)ENGLISH_STOP_WORDSCountVectorizerHashingVectorizerTfidfTransformerTfidfVectorizerstrip_accents_asciistrip_accents_unicode
strip_tags)GridSearchCVcross_val_scoretrain_test_split)Pipeline)	LinearSVC)assert_allclose_dense_sparseassert_almost_equalfails_if_pypyskip_if_32bit)_IS_PYPY_IS_WASMCSC_CONTAINERSCSR_CONTAINERS)zthe pizza pizza beer copyrightzthe pizza burger beer copyrightz!the the pizza beer beer copyrightzthe burger beer beer copyrightzthe coke burger coke copyrightzthe coke burger burger)zthe salad celeri copyrightz)the salad salad sparkling water copyrightzthe the celeri celeri copyrightzthe tomato tomato salad waterz the tomato salad water copyrightc                 D    t          |                                           S N)r   upperss    i/var/www/surfInsights/venv3-11/lib/python3.11/site-packages/sklearn/feature_extraction/tests/test_text.py	uppercaser'   9   s     ##))+++    c                 .    |                      dd          S )N   ée)replacer$   s    r&   strip_eacuter-   =   s    99T3r(   c                 *    |                                  S r"   splitr$   s    r&   split_tokenizer1   A   s    7799r(   c                     dgS )Nthe_ultimate_feature r$   s    r&   lazy_analyzer5   E   s    "##r(   c                  d   d} d}t          |           |k    sJ d} d}t          |           |k    sJ d} d}t          |           |k    sJ d} d}t          |           |k    sJ d	} d
}t          |           |k    sJ d} d}t          |           |k    sJ d} d
}t          |           |k    sJ d S )N   àáâãäåçèéêëaaaaaaceeee   ìíîïñòóôõöùúûüýiiiinooooouuuuy   إu   ا   this is à testthis is a testu   öou   ̀́̂̃ u   ȫ)r   aexpecteds     r&   test_strip_accentsrC   I   s    AH ##x////(A H ##x//// 	AH ##x//// 	AH ##x//// 	AH ##x//// 	#AH ##x//// 	AH ##x//////r(   c                      d} d}t          |           |k    sJ d} d}t          |           |k    sJ d} d}t          |           |k    sJ d} d}t          |           |k    sJ d S )	Nr7   r8   r9   r:   r;   r?   r<   r=   )r   r@   s     r&   test_to_asciirE   m   s     AHq!!X----(A Hq!!X---- 	AHq!!X---- 	AHq!!X------r(   
Vectorizerc                     | d                                           }d}g d} ||          |k    sJ d}g d} ||          |k    sJ  | d                                           }t          d	          }g d
} ||          |k    sJ  | t                                                     }d}g d} ||          |k    sJ  | t          d                                           }d}g d} ||          |k    sJ d S )Nasciistrip_accents:   J'ai mangé du kangourou  ce midi, c'était pas très bon.)
aimangedu	kangouroucemidietaitpastresbonz0This is a test, really.

 I met Harry yesterday.)thisistestreallymetharry	yesterdayfile)input'This is a test with a file-like object!)rV   rW   rX   withr]   likeobjectpreprocessoru;   J'ai mangé du kangourou  ce midi,  c'était pas très bon.)
AIMANGEDU	KANGOUROUCEMIDIETAITPASTRESBON)	tokenizerrJ   )
zj'airM   rN   rO   rP   zmidi,zc'etaitrS   rT   zbon.)build_analyzerr   r'   r1   )rF   watextrB   s       r&   test_word_analyzer_unigramsrs      st   	'	*	*	*	9	9	;	;BGD  H 2d88x?DLLLH2d88x	&	!	!	!	0	0	2	2B=>>DGGGH2d88x 
	+	+	+	:	:	<	<BHD  H 2d88x 
nG	D	D	D	S	S	U	UBGD  H 2d88xr(   c                  |    t          ddd                                          } d}g d} | |          |k    sJ d S )Nwordunicode      analyzerrJ   ngram_rangerK   )rL   rM   rN   rO   rP   rQ   rR   rS   rT   rU   zai mangezmange duzdu kangourouzkangourou cezce midiz
midi etaitz	etait paszpas tresztres bon)r   rp   )rq   rr   rB   s      r&   'test_word_analyzer_unigrams_and_bigramsr}      se    	yf
 
 
n  HD  H* 2d88xr(   c                     d} |                      d          }t          dd                                          }t          j        t
                    5   ||           d d d            n# 1 swxY w Y   t          ddd                                          }t          j        t
                    5   ||           d d d            d S # 1 swxY w Y   d S )	NrK   zutf-8rw   rH   )r|   encodingchar      )r{   r|   r   )encoder   rp   pytestraisesUnicodeDecodeError)rr   
text_bytesrq   cas       r&   test_unicode_decode_errorr      s]    HDW%%J 
Vg	>	>	>	M	M	O	OB	)	*	*  
:               
Vg
 
 
n  
)	*	*  
:                 s$   A,,A03A04CCCc                     t          ddd                                          } d}g d} | |          d d         |k    sJ g d} | |          d	d          |k    sJ d
}g d} | |          d d         |k    sJ g d} | |          d	d          |k    sJ t          ddd                                          } t          d          }g d} | |          d d         |k    sJ d S )Nr   rv   r   rz   u9   J'ai mangé du kangourou  ce midi, c'était pas très bon)zj'az'aizai zi mz ma   )zs tresz tres ztres bzres bozes bon1This 
	is a test, really.

 I met Harry yesterday)thihisis zs iz is)z yesteyesteresterdsterdaterdayr]   r^   r{   r|   r_   r   rp   r   cngarr   rB   s      r&   test_char_ngram_analyzerr      sS   yf  n 	 GD222H4::bqb>X%%%%AAAH4::bcc?h&&&&BD222H4::bqb>X%%%%AAAH4::bcc?h&&&&v6  n 	 =>>D222H4::bqb>X%%%%%%r(   c                  f   t          ddd                                          } d}g d} | |          d d         |k    sJ g d} | |          d	d          |k    sJ t          d
dd                                          } t          d          }g d} | |          d d         |k    sJ d S )Nchar_wbrv   r   rz   r   )z thr   r   r   z thir   )r   r   r   r   zerday r   r]   r   zA test with a file-like object!)z a z tetesestzst z tesr   r   r   s      r&   test_char_wb_ngram_analyzerr     s    )  n 	 CD333H4::bqb>X%%%%AAAH4::bcc?h&&&&yf  n 	 566D:::H4::bqb>X%%%%%%r(   c                  `   t          ddd                                          } d}g d} | |          d d         |k    sJ g d} | |          d	d          |k    sJ t          d
dd                                          }t          |          } ||           | |          k    sJ d S )Nru   rv   r   rz   r   )zthis is testzis test reallyztest really metr   )ztest really met harry yesterdayzthis is test really met harryz"is test really met harry yesterdayr]   r   r   )r   rr   rB   	cnga_filer]   s        r&   test_word_ngram_analyzerr     s    yf  n 	 CDDDDH4::bqb>X%%%%  H
 4::bcc?h&&&&v6  n  D>>D9T??dd4jj((((((r(   c                     ddd} t          |                                           }t          t          t          t          t          t                    fD ]} ||           }t          |          }|	                    t                     t          |t                    r|j        | k    sJ nt          |j                  |k    sJ |                    t                    }|j        d         t!          |          k    sJ  ||           }t          |          }|                    |          }t!          |          |j        d         k    sJ d S )Nr   rx   pizzabeer
vocabulary)setkeysdictlistiterr   r   intr   fitJUNK_FOOD_DOCS
isinstancer   vocabulary_	transformshapeleninverse_transform)vocabtermstypvvectXinvs          r&   &test_countvectorizer_custom_vocabularyr   6  sD   ##E

E dD'+s";";< & &CJJ!,,,   a!! 	2#u,,,,,t'((E1111NN>**wqzSZZ''''CJJ!,,,$$Q''3xx171:%%%%%& &r(   c                  D   ddg} t          dt          |           fdt                      fg          }|                    t                    }t          |j        d         j                  t          |           k    sJ |j        d         t          |           k    sJ d S )Nr   r   countr   tfidfrx   )
r   r   r   fit_transformALL_FOOD_DOCSr   named_stepsr   r   r   )what_we_likepiper   s      r&   /test_countvectorizer_custom_vocabulary_pipeliner   K  s    V$Lo>>>?&(()	
 D 	=))At(455\9J9JJJJJ71:\********r(   c                      ddd} d}t          j        t          |          5  t          |           }|                    dg           d d d            d S # 1 swxY w Y   d S )Nr   r   z$Vocabulary contains repeated indicesmatchr   pasta_sizilianar   r   
ValueErrorr   r   )r   msgr   s      r&   7test_countvectorizer_custom_vocabulary_repeated_indicesr   X  s    ##E
0C	z	-	-	- & &%000#$%%%& & & & & & & & & & & & & & & & & &s   'AAAc                      ddd} t          j        t          d          5  t          |           }|                    dg           d d d            d S # 1 swxY w Y   d S )Nrx   ry   r   zdoesn't contain indexr   r   pasta_verdurar   r   r   s     r&   0test_countvectorizer_custom_vocabulary_gap_indexr   `  s    ##E	z)@	A	A	A $ $%000/"###$ $ $ $ $ $ $ $ $ $ $ $ $ $ $ $ $ $s   'AAAc                  z   t                      } |                     d           |                                 t          k    sJ |                     d           t	          j        t                    5  |                                  d d d            n# 1 swxY w Y   |                     d           t	          j        t                    5  |                                  d d d            n# 1 swxY w Y   g d}|                     |           |                                 t          |          k    sJ d S )Nenglish
stop_words_bad_str_stop__bad_unicode_stop_)someotherwords)r   
set_paramsget_stop_wordsr   r   r   r   r   )cvstoplists     r&   test_countvectorizer_stop_wordsr   g  s   			BMMYM'''"44444MM-M...	z	"	"  
              MM1M222	z	"	"  
              )))HMMXM&&&#h--//////s$   3BBBC//C36C3c                  p   t          j        t          d          5  t          g           } |                     dg           d d d            n# 1 swxY w Y   t          j        t          d          5  t          dd          }|                    g d           d d d            d S # 1 swxY w Y   d S )	Nzempty vocabularyr   r   foo      ?r   )max_dfr   )zto be or not to bez
and me toozand so do your   )r   r   s     r&   %test_countvectorizer_empty_vocabularyr   v  sL   	z);	<	<	<  "---%               
z);	<	<	< E E39===	CCCDDDE E E E E E E E E E E E E E E E E Es#   'AAA5)B++B/2B/c                      t                      } |                     t          d d                   }|                     t          dd                    }|j        d         |j        d         k    sJ d S )Nr   rx   )r   r   r   r   )r   X1X2s      r&   test_fit_countvectorizer_twicer     sh    			B			-+	,	,B			-+	,	,B8A;"(1+%%%%%%r(   c                      g d} d}t          |          }|                    |            g d}|                                }t          ||           dS )zCheck `get_feature_names_out()` when a custom token pattern is passed.
    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/12971
    z&This is the 1st document in my corpus.z This document is the 2nd sample.zAnd this is the 3rd one.zIs this the 4th document?z'[0-9]{1,3}(?:st|nd|rd|th)\s\b(\w{2,})\btoken_pattern)documentonesampleN)r   r   get_feature_names_outr	   )corpusr   
vectorizerrB   feature_names_outs        r&   )test_countvectorizer_custom_token_patternr     sr    
  F ?M }===JV$$$,,,H"88::((33333r(   c                      g d} d}d}t          |          }t          j        t          |          5  |                    |            ddd           dS # 1 swxY w Y   dS )zCheck that we raise an error if token pattern capture several groups.
    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/12971
    r   z)([0-9]{1,3}(?:st|nd|rd|th))\s\b(\w{2,})\bz,More than 1 capturing group in token patternr   r   Nr   r   r   r   r   )r   r   err_msgr   s       r&   <test_countvectorizer_custom_token_pattern_with_several_groupr     s    
  F AM<G }===J	z	1	1	1  v                 s   AAAc                  z   g d} d}t          d|           }t          j        t          |          5  |                    |            d d d            n# 1 swxY w Y   t          j                    5  t          j        dt                     |                    |            d d d            d S # 1 swxY w Y   d S )N)SampleUpperCase
VocabularyzyUpper case characters found in vocabulary while 'lowercase' is True. These entries will not be matched with any documentsT)	lowercaser   r   error)	r   r   warnsUserWarningr   warningscatch_warningssimplefilterr   )r   messager   s      r&   'test_countvectorizer_uppercase_in_vocabr	    sE    ;::J	)  !4JGGGJ	k	1	1	1 # #z"""# # # # # # # # # # # # # # # 
	 	"	" ) )g{333Z((() ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) )s#   AAA30B00B47B4c                      g dg dg dg} t          dd                              |           }g d}|                    |          }t          ||           dS )	z0Check get_feature_names_out for TfidfTransformerrx   rx   rx   rx   rx   r   rx   r   r   Tl2
smooth_idfnorm)rA   cbN)r   r   r   r	   )r   trfeature_names_inr   s       r&   %test_tf_transformer_feature_names_outr    sq    	IIIyyy)A	T	5	5	5	9	9!	<	<B&001ABB'):;;;;;r(   c                     g dg dg dg} t          dd          }|                    |                                           }|dk                                    sJ t	          |dz                      d	
          g d           g dg dg dg} t          dd          }|                    |                                           }|dk                                    sJ d S )Nr  r  r  Tr  r  r   ry   rx   axisr   r   r   )r   r   toarrayallr   sumr   r  r   s      r&   test_tf_idf_smoothingr    s    	IIIyyy)A	T	5	5	5BQ''))EQJ uaxnn!n44oooFFF 
IIIyyy)A	T	5	5	5BQ''))EQJr(   zcno floating point exceptions, see https://github.com/numpy/numpy/pull/21895#issuecomment-1311525881reasonc                     g dg dg dg} t          dd          }|                    |                                           }|dk                                    sJ t	          |dz                      d	
          g d           g dg dg dg} t          dd          }d}t          j        t          |          5  |                    |                                            d d d            d S # 1 swxY w Y   d S )Nr  r  r  Fr  r  r   ry   rx   r  r  zdivide by zeror   )	r   r   r  r  r   r  r   r  RuntimeWarning)r   r  r   in_warning_messages       r&   test_tfidf_no_smoothingr%    s]    
IIIyyy)A	U	6	6	6BQ''))EQJ uaxnn!n44oooFFF 
IIIyyy)A	U	6	6	6B)	n,>	?	?	? & &
##%%%& & & & & & & & & & & & & & & & & &s    (C55C9<C9c                  ,   dgdgdgg} t          ddd           }|                    |                                           }|d         dk    sJ |d         |d         k    sJ |d         |d         k    sJ |d         dk     sJ |d         dk     sJ d S )Nrx   ry   r   TF)sublinear_tfuse_idfr  r   )r   r   r  r  s      r&   test_sublinear_tfr)    s    
qcA3A	tU	F	F	FBQ''))E8q====8eAh8eAh8a<<<<8a<<<<<<r(   c                  
   t          t          d d                   } t          d         g}t          t                    dz
  }t          d          }|                    |           }t          |d          r|                                }|d|j        d         f         dk    sJ t          |j        	          }||fD ]}|                    |          }t          |d          r|                                }|j        }|d|d
         f         dk    sJ |d|d         f         dk    sJ |d|d         f         dk    sJ d|vsJ d|vsJ |d|d         f         dk    sJ |d|d         f         dk    sJ |d|d         f         dk    sJ |d|d         f         dk    sJ t          d          }	|	
                    |                              |                                          }
t          |	j                  t          |j                  k    sJ |
j        |t          |j                  fk    sJ |	                    |                                          }|j        t          |          t          |j                  fk    sJ t          dd          }|
                    |                              |                                          }t          |d          rJ t          d          }t          j        t                     5  |                    |           d d d            n# 1 swxY w Y   t#          t%          j        |d          dg|z             t          t          d d                   } t)          d          }|j        |_        |                    |                                           }|j        rJ t#          |
|           |                    |                                          }t#          ||           t          d 	          }t          j        t                     5  |                    |            d d d            n# 1 swxY w Y   |                    dd           |                                }d}t3          |          } ||          }||k    sJ |                    dd            t          j        t                     5  |                                 d d d            n# 1 swxY w Y   d |_        t          j        t                     5  |                                 d d d            d S # 1 swxY w Y   d S )!Nrx         ?r   tocsrr   r   ry   r   saladtomatowaterthe	copyrightcokeburgerr   l1r  F)r  r(  idf_Tr(  r  r   rH   )rJ   r  rK   _gabbledegook_)rJ   rd   _invalid_analyzer_type_)r   r   r   r   r   hasattrr.  r   r   r   r   r  r8  r   r   r   r   r   npr  r   r   fixed_vocabulary_r   build_preprocessorr   rp   )
train_data	test_datan_trainv1counts_trainv2r   counts_testr   t1r   
tfidf_testt2tft3tvtfidf2tfidf_test2v3	processorrr   rB   results                          r&   test_vectorizerrR    s~   mCRC())Jr"#I-  1$G 
	$	$	$B##J//L|W%% ,#))++2>'223q8888 
BN	3	3	3B "X 8 8kk),,;(( 	.%++--K]
1j112a77771j223q88881j112a7777 J&&&& *,,,, 1j001Q66661j223q88881j001Q66661j112a77777 
t	$	$	$BFF<  **<88@@BBErw<<3r~......;7C$7$788888 k**2244JIBN0C0CDDDDD 
tU	3	3	3B				'	'	5	5	=	=	?	?Br6""""" 
$	'	'	'B	z	"	" # #
\"""# # # # # # # # # # # # # # # bfRa0003%'/BBB mCRC())J	d	#	#	#B	BIj))1133F####eV,,, ,,y))1133Kj+666 
D	)	)	)B	z	"	" ! !
Z   ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! MM5M999%%''IGD"4((HYt__Fv MM 0tMDDD	z	"	"    
                              .BM	z	"	"  
                 sH   L<<M M Q))Q-0Q-3TTT?U!!U%(U%c                     d\  } }}}t          | |||          }|                    t                     |j        j        | k    sJ |j        j        |k    sJ |j        j        |k    sJ |j        j        |k    sJ d|_        d|_        d|_        d|_        |j        j        | k    sJ |j        j        |k    sJ |j        j        |k    sJ |j        j        |k    sJ |                    t                     |j        j        |j        k    sJ |j        j        |j        k    sJ |j        j        |j        k    sJ |j        j        |j        k    sJ d S )N)r  FFF)r  r(  r  r'  r6  T)r   r   r   _tfidfr  r(  r  r'  )r  r(  r  r'  rL  s        r&   test_tfidf_vectorizer_settersrU  i  s   .G+D':|	7z
 
 
B FF>9>T!!!!9''''9:----9!\1111 BGBJBMBO9>T!!!!9''''9:----9!\1111FF>9>RW$$$$9
****92=00009!R_444444r(   c                     t                      } |                     t                    }|j        }|j        t          t                    | j        fk    sJ |j        | j        k    sJ t          j	        |j
                  dk    sJ t          j	        |j
                  dk     sJ t          j        |j
                  dk    sJ t          j        |j
                  dk     sJ t          |j        d                   D ];}t          t          j                            |d         j
        d          d           <t          dd          } |                     t                    }|j        t          t                    | j        fk    sJ |j        | j        k    sJ |j        }||k    sJ |d|z  k     sJ t          j	        |j
                  dk    sJ t          j        |j
                  dk     sJ t          |j        d                   D ];}t          t          j                            |d         j
        d          d           <d S )	Nr+  r   rx   ry   r   rw   r6  )r|   r  )r   r   r   nnzr   r   
n_featuresdtyper=  mindatamaxranger   linalgr  )r   r   	token_nnzi
ngrams_nnzs        r&   test_hashing_vectorizerrb    s'   A	M""AI7s=))1<888887ag 6!&>>B6!&>>A6!&>>A6!&>>A 171: ? ?BINN1Q49a88#>>>> 	f4888A	M""A7s=))1<888887ag J	!!!!I%%%% 6!&>>B6!&>>A 171: ? ?BINN1Q49a88#>>>>? ?r(   c                  j   t          d          } t          j        t                    5  |                                  d d d            n# 1 swxY w Y   | j        rJ |                     t                    }|j        \  }}t          | j
                  |k    sJ |                                 }t          |t          j                  sJ |j        t          k    sJ t          |          |k    sJ t!          g d|           t#          |          D ]%\  }}|| j
                            |          k    sJ &g d}t          |          } |                                 }t!          g d|           | j        sJ t#          |          D ]%\  }}|| j
                            |          k    sJ &d S )Nr,  r-  	r   r5  celerir4  r   r/  	sparklingr0  r1  r   )r   r   r   r   r   r>  r   r   r   r   r   r   r=  ndarrayrY  rb   r	   	enumerateget)r   r   	n_samplesrX  feature_namesidxnamer   s           r&   test_feature_namesrn    sG   		$	$	$B 
z	"	" # #
  """# # # # # # # # # # # # # # ##### 	''AGIzr~*,,,,,,..MmRZ00000&((((}++++
	
 
	
 
	
 	   }-- / /	Tbn((.......
 
 
E 
E	*	*	*B,,..M
	
 
	
 
	
 	   }-- / /	Tbn((......./ /s   AAAc                     h d} | dd          }|                     t                     t          |j                  |k    sJ d S )N>   r   r   r/  r5  g333333?   )r   max_features)r   r   r   r   )rF   expected_vocabularyr   s      r&   test_vectorizer_max_featuresrs    sY    >>> 3Q777JNN=!!!z%&&*=======r(   c                  N   t          d          } t          d          }t          d           }|                     t                                        d          }|                    t                                        d          }|                    t                                        d          }|                                 }|                                }|                                }d|                                k    sJ d|                                k    sJ d|                                k    sJ d|t          j        |                   k    sJ d|t          j        |                   k    sJ d|t          j        |                   k    sJ d S )Nrx   rq  r   r   r     r2  )r   r   r   r  r   r\  r=  argmax)	cv_1cv_3cv_Nonecounts_1counts_3counts_None
features_1
features_3features_Nones	            r&   "test_count_vectorizer_max_featuresr    s    ***D***D4000G!!.115515==H!!.115515==H''77;;;CCK++--J++--J1133M !!!!!! Jry2233333Jry2233333M")K"8"89999999r(   c                     g d} t          dd          }|                    |            d|j                                        v sJ t	          |j                                                  dk    sJ d|_        |                    |            d|j                                        vsJ t	          |j                                                  dk    sJ d	|_        |                    |            d|j                                        vsJ t	          |j                                                  dk    sJ d S )
Nabcdeaeatr   r   r{   r   rA   r   r,  rp  rx   )r   r   r   r   r   r   rA  r   s     r&   test_vectorizer_max_dfr    sA   %%%IF3777DHHY$"''))))))t$$&&''1,,,,DKHHYd&++------t$$&&''1,,,,DKHHYd&++------t$$&&''1,,,,,,r(   c                     g d} t          dd          }|                    |            d|j                                        v sJ t	          |j                                                  dk    sJ d|_        |                    |            d|j                                        vsJ t	          |j                                                  dk    sJ d	|_        |                    |            d|j                                        vsJ t	          |j                                                  dk    sJ d S )
Nr  r   rx   )r{   min_dfrA   r   ry   r  g?)r   r   r   r   r   r  r  s     r&   test_vectorizer_min_dfr  *  sA   %%%IF1555DHHY$"''))))))t$$&&''1,,,,DKHHYd&++------t$$&&''1,,,,DKHHYd&++------t$$&&''1,,,,,,r(   c                  "   ddg} t          dd          }|                    |                                           }t          g d|                                           t          g dg dg|           t          ddd	
          }|                    |                                           }t          g dg dg|           t          ddd	t
          j                  }|                    |           }|j        t
          j        k    sJ d S )Naaabcabbder   r   r  )rA   r  r  dr+   )r   rx   rx   r   r   )rx   ry   r   rx   rx   T)r{   r   binary)rx   rx   rx   r   r   )rx   rx   r   rx   rx   )r{   r   r  rY  )r   r   r  r	   r   r=  float32rY  )rA  r   r   X_sparses       r&   test_count_binary_occurrencesr  <  s   '"IF3777D9%%--//A000$2L2L2N2NOOO91=== F3tDDDD9%%--//A91=== F3t2:VVVD!!),,H>RZ''''''r(   c                  v   ddg} t          ddd           }|                    |           }t          j        |dd         j                  dk    sJ t          j        |dd	         j                  d	k    sJ |j        t          j        k    sJ t          ddd
d           }|                    |           }t          j        |j                  dk    sJ |j        t          j        k    sJ t          ddd
d t          j                  }|                    |           }|j        t          j        k    sJ d S )Nr  r  Fr   )alternate_signr{   r  r   rx   r   ry   T)r{   r  r  r  )r{   r  r  r  rY  )r   r   r=  r\  r[  rY  float64)rA  r   r   s      r&   test_hashed_binary_occurrencesr  P  sE    '"IEFNNNDy!!A6!AaC&+!####6!AaC&+!####7bj     d  D 	y!!A6!&>>Q7bj     dRZ  D 	y!!A7bj      r(   c                    t           } |             }|                    |          }|                    |          }t          |t                    sJ |                                }t          ||          D ]j\  }}t          j        t          j	         ||                              }t          j        t          j	        |                    }t          ||           kt          j        |          sJ |j        dk    sJ |                                }	|                    |	          }
t          ||
          D ]9\  }}t          t          j        |          t          j        |                     :|                                }|                    |          }t          ||          D ]9\  }}t          t          j        |          t          j        |                     :d S )Ncsr)r   r   r   r   r   rp   zipr=  sortuniquer	   r
   issparseformatr  tocsc)rF   r[  r   transformed_datainversed_dataanalyzedocinversed_termsr   transformed_data2inversed_data2terms2transformed_data3inversed_data3terms3s                  r&   !test_vectorizer_inverse_transformr  k  s    DJ!//55001ABBMmT*****''))G"477 2 2^	''#,,//00>!:!:;;5.1111?+,,,,,"e++++ )0022112CDDN]N;; < <v275>>276??;;;; )..00112CDDN]N;; < <v275>>276??;;;;< <r(   c                     t           t          z   } dgt          t                     z  dgt          t                    z  z   }t          | |dd          \  }}}}t	          dt                      fdt                      fg          }dd	gd
d}t          ||dd          }|                    ||          	                    |          }	t          |	|           |j        dk    sJ |j        j        d         }
|
j        dk    sJ d S )Nr+  rx   g?r   	test_sizerandom_stater   svcrx   rx   rw   hingesquared_hinge)vect__ngram_range	svc__lossr   )n_jobsr   r   )r   NOTJUNK_FOOD_DOCSr   r   r   r   r   r   r   predictr	   best_score_best_estimator_r   r|   r[  targetr@  rA  target_traintarget_testpipeline
parametersgrid_searchpredbest_vectorizers              r&   -test_count_vectorizer_pipeline_grid_selectionr    s-   --D TC'''1#4E0F0F*FFF 8Hf!8 8 84J	< &/"3"34uikk6JKLLH %f-/ J xA!DDDK ??:|44<<YGGDt[)))
 "c))))!1=fEO&&000000r(   c                  :   t           t          z   } dgt          t                     z  dgt          t                    z  z   }t          | |dd          \  }}}}t	          dt                      fdt                      fg          }dd	gd
dd}t          ||d          }|                    ||          	                    |          }	t          |	|           |j        dk    sJ |j        j        d         }
|
j        dk    sJ |
j        dk    sJ |
j        rJ d S )Nr+  rx   g?r   r  r   r  r  rw   )r6  r  r  )r  
vect__normr  )r  r   r  )r   r  r   r   r   r   r   r   r   r  r	   r  r  r   r|   r  r>  r  s              r&   'test_vectorizer_pipeline_grid_selectionr    sQ   --D TC'''1#4E0F0F*FFF 8Hf!8 8 84J	< &/"3"34uikk6JKLLH %f-"/ J xA>>>K ??:|44<<YGGDt[)))
 "c))))!1=fEO&&00004''''000000r(   c                  *   t           t          z   } dgt          t                     z  dgt          t                    z  z   }t          dt	                      fdt                      fg          }t          || |d          }t          |g d           d S )Nr+  rx   r   r  r   )r   r  )r   r  r   r   r   r   r   r	   )r[  r  r  	cv_scoress       r&   )test_vectorizer_pipeline_cross_validationr    s    --D TC'''1#4E0F0F*FFF&/"3"34uikk6JKLLH$1===Iy///22222r(   c                  t   d} t                      }|                    | g          }|j        dk    sJ t          d d          }|                    | g          }|j        dk    sJ |j        |j        k    sJ t          t          j        |j	                  t          j        |j	                             d S )Nu   Машинное обучение — обширный подраздел искусственного интеллекта, изучающий методы построения алгоритмов, способных обучаться.)rx      F)r  r  )rx   i   )
r   r   r   r   r   rW  r	   r=  r  r[  )r   r   	X_countedX_hasheds       r&   test_vectorizer_unicoder    s    	1  D""H:..I?g%%%%$u===D~~xj))H>Z'''' =HL(((( rwy~..0F0FGGGGGr(   c                     ddg} t          |           }|                    t                    }|                    t                    }t	          |                                |                                           |j        sJ d S )Nr   re  r   )r   r   r   r   r   r  r>  )r   r   X_1X_2s       r&   +test_tfidf_vectorizer_with_fixed_vocabularyr    su    8$Jj111D


]
+
+C
..
'
'CckkmmS[[]];;;!!!!!!r(   c                     t                      t          d          t          d          t          d          t                      t          t                    t          t                    t          t                                        t
                    t          t          	                              t
                    t                      t          t                    t                                          t
                    g} | D ]}t          j	        |          }t          j
        |          }t          |          |j        k    sJ |                                |                                k    sJ t          rt          |t                     rt!          |                    t
                    |                    t
                               d S )
Nr6  r7  T)r  rw   r|   rc   )r{   rI   )r   r   r   r5   r   r   r-   r   pickledumpsloadstype	__class__
get_paramsr   r   r   r   )	instancesorigr%   copys       r&   test_pickling_vectorizerr    s   t$$$&&&f---Z000...Z00044^DDl33377GG...n--I   L|ADzzT^++++  DOO$5$55555 	
4):;; 	("">22"">22    r(   factoryc                     t                      } | |          }d}t          j        t          j        |                    } ||          } ||          }||k    sJ dS )z_Tokenizers cannot be pickled
    https://github.com/scikit-learn/scikit-learn/issues/12833
    rK   N)r   r  r  r  )r  vecfunctionrr   roundtripped_functionrB   rQ  s          r&   test_pickling_built_processorsr  $  so     

Cws||HGD"Lh)?)?@@x~~H""4((FXr(   c                     t           j                            d          } t          j        g d          }t	          dd          D ]}t          |                     |dd                    }t          |          }t          j	        t          j
        |                    }|                    t                     |                    t                     t          |                                |                                           d S Nr   rd  d   r   F)sizer,   r   )r=  randomRandomStatearrayr]  r   choicer   r  r  r  r   r   r	   r   )rngvocab_wordsx	vocab_setr   unpickled_cvs         r&   -test_countvectorizer_vocab_sets_when_picklingr  9  s     )


"
"C(
	
 
	
 
	
 K 1c]] 
 


;Q
FFGG		222|FL$4$455
}'''$$&&(J(J(L(L	
 	
 	
 	

 
r(   c                  Z   t           j                            d          } t          j        g d          }t	          dd          D ]}t                      }|                     |dd          }t	          dd          D ]}||||         <   t          |          }t          j	        t          j
        |                    }|                    t                     |                    t                     t          |                                |                                           d S r  )r=  r  r  r  r]  r   r  r   r  r  r  r   r   r	   r   )r  r  r  
vocab_dictr   yr   r  s           r&   .test_countvectorizer_vocab_dicts_when_picklingr  U  s%   
)


"
"C(
	
 
	
 
	
 K 1c]] 
 
VV


;Q
>>q! 	% 	%A#$JuQx  
333|FL$4$455
}'''$$&&(J(J(L(L	
 	
 	
 	

 
r(   c                     t                                          t                    } t                                          |           }t          j        |          }t          j        |          }t          |          |j	        k    sJ t          |                    |                                           |                    |                                                      d S r"   )r   r   r   r   r   r  r  r  r  r  r	   r  )r   r  r%   r  s       r&   test_pickling_transformerr  r  s    ''77A!!!$$DTA<??D::''''t))!,,44668J8J18M8M8U8U8W8WXXXXXr(   c                  |   t                                          t                    } t                                          |           }t                      }|j        |_        t          |                    |                                           |                    |                                                      d S r"   )	r   r   r   r   r   r8  r	   r   r  )r   r  r  s      r&   test_transformer_idf_setterr  {  s    ''77A!!!$$DD	DIt~~a((0022DNN14E4E4M4M4O4OPPPPPr(   c                     t          d          } |                     t                     t          | j        d          }| j        |_        t          |                    t                                                    |                     t                                                               t          | j        d          }d}t          j	        t          |          5  | j        |_        d d d            d S # 1 swxY w Y   d S )NTr9  r   r(  Fz+`idf_` cannot be set when `user_idf=False`.r   )r   r   r   r   r8  r	   r   r  r   r   r   )r  r  r   s      r&   test_tfidf_vectorizer_setterr     s(   4(((DHH^d&6EEED	DI~&&..00~&&..00  
 d&6FFFD;G	z	1	1	1  I	                 s   $C>>DDc                  F   t          d          } |                     t                     t          | j        d          }t	          | j                  }dg|dz   z  }t          j        t                    5  t          |d|           d d d            d S # 1 swxY w Y   d S )NTr9  r  r   rx   r8  )
r   r   r   r   r   r8  r   r   r   setattr)r   r  expected_idf_leninvalid_idfs       r&   %test_tfidfvectorizer_invalid_idf_attrr    s    4(((DHH^d&6EEED49~~%+a/0K	z	"	" + +fk***+ + + + + + + + + + + + + + + + + +s   7BBBc                      g d} t          |           }t          j        t                    5  |                    g            d d d            d S # 1 swxY w Y   d S )N)rA   r  r  rA   rA   r   r   r   s     r&   test_non_unique_vocabr    s    %%%Ee,,,D	z	"	"                   s   AAAc                      d} t           }d }t          j        ||           5   |             d d d            d S # 1 swxY w Y   d S )Nz?np.nan is an invalid document, expected byte or unicode string.c                  f    t                      } |                     dt          j        dg           d S )Nhello worldhello hello)r   r   r=  nan)hvs    r&   funcz0test_hashingvectorizer_nan_in_docs.<locals>.func  s0      
-?@@@@@r(   r   )r   r   r   )r  	exceptionr  s      r&   "test_hashingvectorizer_nan_in_docsr    s     PGIA A A 
y	0	0	0                   s   ;??c                  p   t          ddd           } | j        sJ |                     ddg                                          }t	          |                                g d           |                     ddg                                          }t	          |                                g d           d S )NTF)r  r(  r  r
  r  )rx   rx   rx   r   )r   r  r   r  r	   ravelr   )r   r   r   s      r&   test_tfidfvectorizer_binaryr    s    tU>>>A8OOO	677??AAAqwwyy,,,///	
m]3	4	4	<	<	>	>Brxxzz<<<00000r(   c                      t          d          } |                     t                     t          | j        | j        j                   d S )NTr9  )r   r   r   r   r8  rT  )r   s    r&   test_tfidfvectorizer_export_idfr    sA    4(((DHH^di)9:::::r(   c                      t          dg          } t          |           }|                     t                     |                    t                     |j        | j        k    sJ d S )Nr2  r   )r   r   r   r   r   )
vect_vocabvect_vocab_clones     r&   test_vectorizer_vocab_cloner    se     UG444JZ((NN=!!!'''':+AAAAAAAr(   c                    d} |             }t          j        t          |          5  |                    d           d d d            n# 1 swxY w Y   t          j        t          |          5  |                    d           d d d            n# 1 swxY w Y   |                    ddg           t          j        t          |          5  |                    d           d d d            d S # 1 swxY w Y   d S )NzBIterable over raw text documents expected, string object received.r   zhello world!	some textzsome other text)r   r   r   r   r   r   )rF   r  r  s      r&   &test_vectorizer_string_object_as_inputr    s    SG
*,,C	z	1	1	1 * *.)))* * * * * * * * * * * * * * * 
z	1	1	1                                 GG[+,---	z	1	1	1 & &n%%%& & & & & & & & & & & & & & & & & &s5   A

AA0BBBC22C69C6X_dtypec                     t          j        dd| d          }t                                          |          }|j        |j        k    sJ d S N
    N  *   rY  r  )r
   randr   r   rY  )r  r   X_transs      r&   test_tfidf_transformer_typer&    sN    BW2>>>A  ..q11G=AG######r(   zcsc_container, csr_containerc                 >   t          j        ddt          j        d          } | |          } ||          }t	                                          |          }t	                                          |          }t          ||           |j        |j        k    sJ d S r  )r
   r$  r=  r  r   r   r   r  )csc_containercsr_containerr   X_cscX_csrX_trans_cscX_trans_csrs          r&   test_tfidf_transformer_sparser.    s     	BRZbAAAAM!EM!E"$$22599K"$$22599K k:::!3333333r(   z0vectorizer_dtype, output_dtype, warning_expectedTFc                    t          j        g d          }t          |           }d}|rIt          j        t
          |          5  |                    |          }d d d            n# 1 swxY w Y   nZt          j                    5  t          j	        dt
                     |                    |          }d d d            n# 1 swxY w Y   |j
        |k    sJ d S )N)numpyscipysklearnrY  z'dtype' should be used.r   r  )r=  r  r   r   r  r  r   r  r  r  rY  )vectorizer_dtypeoutput_dtypewarning_expectedr   r   warning_msg_matchX_idfs          r&   test_tfidf_vectorizer_typer9    s_    	...//A '7888J1 0\+->??? 	0 	0,,Q//E	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 $&& 	0 	0!';777,,Q//E	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 ;,&&&&&&s$   A((A,/A,0CC
Cr  )ry   rx   r  c                    | j         }t          j        d| d          }t          | t                    rt
          rt          j        d           t          j        t          |          5  | 
                    dg           d d d            n# 1 swxY w Y   t          j        t          |          5  |                     dg           d d d            n# 1 swxY w Y   t          | t                    rLt          j        t          |          5  |                     dg           d d d            d S # 1 swxY w Y   d S d S )NzInvalid value for ngram_range=z/ lower boundary larger than the upper boundary.*HashingVectorizer is not supported on PyPyr   r   zgood news everyone)r|   reescaper   r   r   r   xfailr   r   r   r   r   )r  invalid_ranger  s      r&   $test_vectorizers_invalid_ngram_ranger@    s    OMi	9 	9 	9 	9 G #()) Jh JHIIII	z	1	1	1 ( (%&'''( ( ( ( ( ( ( ( ( ( ( ( ( ( ( 
z	1	1	1 2 2/01112 2 2 2 2 2 2 2 2 2 2 2 2 2 2 #()) 2]:W555 	2 	2MM/0111	2 	2 	2 	2 	2 	2 	2 	2 	2 	2 	2 	2 	2 	2 	2 	2 	2 	22 2s6   ,BBB5CCCD77D;>D;c                     |                                  }|                                 }|                                 }|                     |||          S r"   )r   build_tokenizerr?  _check_stop_words_consistency)	estimatorr   tokenize
preprocesss       r&   rC  rC  )  sM    ))++J((**H--//J22:z8TTTr(   c                     d} d| z  }t                      t                      t                      fD ]x}|                    g d           t	          j        t          |          5  |                    dg           d d d            n# 1 swxY w Y   |`t          |          du sJ yt          j                    5  t          j        dt                     |                    dg           d d d            n# 1 swxY w Y   t          |          J |                    g d	           t	          j        t          |          5  |                    dg           d d d            d S # 1 swxY w Y   d S )
Nz\['and', 'll', 've'\]z}Your stop_words may be inconsistent with your preprocessing. Tokenizing the stop words generated tokens %s not in stop_words.)you'veyouyou'llANDr   r   r
  Fr  )rH  rI  rJ  blahrK  )r   r   r   r   r   r  r  r   _stop_words_idrC  r  r  r  )lstrr  r  s      r&   'test_vectorizer_stop_words_inconsistentrO  0  sT   #D	')-	. 
  !!?#4#46G6I6IJ ; ;"D"D"DEEE\+W555 	/ 	/}o...	/ 	/ 	/ 	/ 	/ 	/ 	/ 	/ 	/ 	/ 	/ 	/ 	/ 	/ 	/ ,S11U::::: 
	 	"	" + +g{333=/***+ + + + + + + + + + + + + + + )--555 NNHHHNIII	k	1	1	1 + +=/***+ + + + + + + + + + + + + + + + + +s6   &B		B	B	=1C::C>C>	E--E14E1r)  c                 <    | dt           j                  }t           j        }|j                            |          |_        |j                            |          |_        dddd}t                                          ||          }||j        j        k    sJ dS )z
    Check that CountVectorizer._sort_features preserves the dtype of its sparse
    feature matrix.

    This test is skipped on 32bit platforms, see:
        https://github.com/scikit-learn/scikit-learn/pull/11295
    for more details.
    )r   r   r3  r   rx   ry   )zscikit-learnrW   zgreat!N)r=  int64indicesastypeindptrr   _sort_featuresrY  )r)  r   INDICES_DTYPEr   Xss        r&   7test_countvectorizer_sort_features_64bit_sparse_indicesrX  L  s     	fBH---A HM	  //AIx}--AH"#1::J				)	)!Z	8	8BBJ,,,,,,,r(   	Estimatorc                    ddig} |             }t          |          du sJ  | d dg          }t          |          dk    sJ t          |          J |                    |            G d d	|           } |dg
          }t          |          dk    sJ  | d dg          }t          |          du sJ d S )Nrr   r  Tc                     | d         S Nrr   r4   r  s    r&   <lambda>z?test_stop_word_validation_custom_preprocessor.<locals>.<lambda>p  s
    1V9 r(   and)rd   r   r  c                       e Zd Zd ZdS )Ftest_stop_word_validation_custom_preprocessor.<locals>.CustomEstimatorc                     d S )Nc                     | d         S r\  r4   r]  s    r&   r^  zktest_stop_word_validation_custom_preprocessor.<locals>.CustomEstimator.build_preprocessor.<locals>.<lambda>x  s
    QvY r(   r4   )selfs    r&   r?  zYtest_stop_word_validation_custom_preprocessor.<locals>.CustomEstimator.build_preprocessorw  s    &&&r(   N)__name__
__module____qualname__r?  r4   r(   r&   CustomEstimatorra  v  s#        	' 	' 	' 	' 	'r(   rh  r   c                 P    t          j        d                              |           S )Nz\w{1,})r<  compilefindallr  s    r&   r^  z?test_stop_word_validation_custom_preprocessor.<locals>.<lambda>~  s    bj33;;C@@ r(   )ro   r   )rC  r   )rY  r[  r  rh  s       r&   -test_stop_word_validation_custom_preprocessorrm  f  s*   
 [!"D
)++C(--5555
)!4!4%
I
I
IC(--8888(--555d' ' ' ' ') ' ' ' /eW
-
-
-C(--8888
)@@eW  C )--555555r(   zinput_type, err_type, err_msgfilenamer?   r]   z$'str' object has no attribute 'read'c                    t          | t                    rt          rt          j        d           dg}t          j        ||          5   | d |                              |           d d d            d S # 1 swxY w Y   d S )Nr;  "this is text, not file or filenamer   c                 *    |                                  S r"   r/   r]  s    r&   r^  z.test_callable_analyzer_error.<locals>.<lambda>  s    QWWYY r(   r{   r^   )
issubclassr   r   r   r>  r   r   )rY  
input_typeerr_typer   r[  s        r&   test_callable_analyzer_errorrv    s     ).// CH CABBB01D	xw	/	/	/ V V	..jAAAOOPTUUUV V V V V V V V V V V V V V V V V Vs   
"A99A= A=)marksr{   c                 "    t          | d          S )Nr)openrl  s    r&   r^  r^    s    T#s^^ r(   c                 *    |                                  S r"   )readrl  s    r&   r^  r^    s     r(   rt  c                     dg}t          j        t          t          f          5   | ||                              |           d d d            d S # 1 swxY w Y   d S )Nrp  rr  )r   r   FileNotFoundErrorAttributeErrorr   )rY  r{   rt  r[  s       r&   &test_callable_analyzer_change_behaviorr    s     11D	)>:	;	; K K	8:666DDTJJJK K K K K K K K K K K K K K K K K Ks   !AAAc                 j   d }t          |t                    rt          rt          j        d           |                     d          }|                    d           t          j        t          d          5   ||d          	                    |g           d d d            d S # 1 swxY w Y   d S )	Nc                      t          d          )Ntesting)	Exceptionrl  s    r&   r{   z6test_callable_analyzer_reraise_error.<locals>.analyzer  s    	"""r(   r;  zfile.txtzsample content
r  r   r]   rr  )
rs  r   r   r   r>  joinwriter   r  r   )tmpdirrY  r{   fs       r&   $test_callable_analyzer_reraise_errorr    s   
# # # ).// CH CABBBJAGG	y		2	2	2 F F	86222@@!EEEF F F F F F F F F F F F F F F F F Fs   9"B((B,/B,zjstop_words, tokenizer, preprocessor, ngram_range, token_pattern,analyzer, unused_name, ovrd_name, ovrd_msgrH  rJ  r  r   z'stop_words'
'analyzer'	!= 'word'c                 *    |                                  S r"   r/   r$   s    r&   r^  r^        aggii r(   z'tokenizer'c                 *    |                                  S r"   r/   r$   s    r&   r^  r^    r  r(   \w+ru   'token_pattern'zis not Nonec                 *    |                                  S r"   r#   r$   s    r&   r^  r^    r  r(   c                 *    |                                  S r"   r  r$   s    r&   r^  r^    r  r(   z'preprocessor'zis callablerw   c                 *    |                                  S r"   r  r$   s    r&   r^  r^    r  r(   z'ngram_range')	NNNr  r  r   r  r  r  c
                    t           }
 |             }|                    ||||||           d|d|d|	}t          j        t          |          5  |                    |
           d d d            d S # 1 swxY w Y   d S )N)r   ro   rd   r|   r   r{   zThe parameter z will not be used since  r   )r   r   r   r  r  r   )rF   r   ro   rd   r|   r   r{   unused_name	ovrd_nameovrd_msgr@  r   r   s                r&   test_unused_parameters_warnr    s    r  J:<<DOO!#      			C
 
k	-	-	-                   s   A66A:=A:zVectorizer, Xrx   ry   )r   barr   )r   bazc                      |             }t          |d          rJ |                    |           t          |d          rJ d S )Nn_features_in_)r<  r   )rF   r   r   s      r&   test_n_features_inr  &  sU     Jz#344444NN1z#34444444r(   c                      t          d          } |                     ddg          j        }|                     ddg          j        }||k    sJ d S )Nrx   ru  helloworld)r   r   r   )r  vocab1vocab2s      r&   )test_tie_breaking_sample_order_invariancer  5  s]     q
)
)
)CWWgw'((4FWWgw'((4FVr(   c                  z    t          dd          } |                     dg          j        }|d         dk    sJ d S )Ni@B )ry   r   )rX  r|   z22pcs efuturer   )r   r   rR  )hashingrR  s     r&   2test_nonnegative_hashing_vectorizer_result_indicesr  >  sD      7GGGG 122:G1:??????r(   c                 >     |             }t          |d          rJ dS )z0Check that vectorizers do not define set_output.
set_outputN)r<  )rY  r   s     r&   'test_vectorizers_do_not_have_set_outputr  F  s+    
 )++CsL)))))))r(   c                    t          j        ddt          j        d          } | |          }|                                }t                                          |          }|                    |d          }t          ||           ||usJ |                    |d          }||u sJ t          j
        t                    5  t          ||           ddd           dS # 1 swxY w Y   dS )	zJCheck the behaviour of TfidfTransformer.transform with the copy parameter.r   r!  r"  r#  T)r  FN)r
   r$  r=  r  r  r   r   r   r   r   r   AssertionError)r)  r   r+  X_csr_originaltransformerX_transforms         r&   test_tfidf_transformer_copyr  O  s;    	BRZbAAAAM!E ZZ\\N"$$((//K''D'99K 777e####''E'::K%	~	&	& < <$UN;;;< < < < < < < < < < < < < < < < < <s   C$$C(+C()r  r<  r  collectionsr   collections.abcr   	functoolsr   ior   	itertoolsr   r0  r=  r   numpy.testingr   r	   r1  r
   sklearn.baser   sklearn.feature_extraction.textr   r   r   r   r   r   r   r   sklearn.model_selectionr   r   r   sklearn.pipeliner   sklearn.svmr   sklearn.utils._testingr   r   r   r   sklearn.utils.fixesr   r   r   r    r   r  r   r'   r-   r1   r5   rC   rE   markparametrizers   r}   r   r   r   r   r   r   r   r   r   r   r   r   r   r	  r  r  r>  r%  r)  rR  rU  rb  rn  rs  r  r  r  r  r  r  r  r  r  r  r  r  rp   r?  rB  r  r  r  r  r  r   r  r  r  r  r  r  r  r  r  r&  r.  int32rQ  r9  r@  rC  rO  rX  rm  r~  r  rv  paramr  r  r  r  r  r  r  r  r4   r(   r&   <module>r     s,    				  # # # # # # # # # # # #                        G G G G G G G G            	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 T S S S S S S S S S % % % % % % ! ! ! ! ! !            S R R R R R R R R R R R  !22, , ,       $ $ $!0 !0 !0H. . .* 9J'KLL:  :  ML: z     <  && & &4& & &() ) ).& & &*
+ 
+ 
+& & &$ $ $0 0 0E E E& & &4 4 4&  &) ) )*< < <    	M   & & &&  d d dN5 5 5: #? #? #?LD/ D/ D/N 'IJJ> > KJ>: : :4- - -$- - -$( ( (( ! ! !4 'IJJ< < KJ<>!1 !1 !1H$1 $1 $1N
3 
3 
3 H H H0" " "  < &*' 
 
 

 
 
8
 
 
:Y Y YQ Q Q   + + +     1 1 1; ; ;B B B ?O5FG & & & RZ$<==$ $ >=$ "GGNN$K$K 4 4 4 6	2:t$	2:t$	RZ'	RZ'	 ' ' ' 	f---F+++F+++ 2 2 2,U U U + + +6 .99- - :9 -0 /?4EF 6 6  62 /?4EF  #	&+	!GH V V  V &m<<<  ++-C-CD  
';<<K K =<  K /?4EF F F F ?$5G  	5
 x 
	
 
	
 
	
 
	
 
	

	
qCI IT UI I Z@ 	Qq111Q3G3GHI	.) 5 5 5      /?4DFWX * * * .99< < :9< < <r(   