Today we'll look at scikit-learn's Pipeline. Training a bare model on its own rarely yields the best predictions, so in practice we apply techniques such as normalization and cross-validation on top of the model. Pipeline ties these steps together into a single workflow: the result is one object that covers every stage of data preprocessing and classification.
You can find a talk explaining Pipeline at the link below.
Liz Sander - Software Library APIs: Lessons Learned from scikit-learn - PyCon 2018
Here is a basic Pipeline example: a Pipeline() wraps a scaler and a model into a single object. As the code below shows, this makes the code more readable and more concise, and it also improves reproducibility.
from sklearn.pipeline import Pipeline
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
# add your data here
data = load_iris()
X_train, X_test, y_train, y_test = train_test_split(data.data, data.target,
                                                    test_size=0.2, random_state=42)
# Pipeline takes a list of (name, estimator) tuples
pipeline = Pipeline([
    ('scaler', StandardScaler()),                   # standardization
    ('clf', LogisticRegression(random_state=42))    # classifier
])
# use the pipeline object as you would
# a regular classifier
pipeline.fit(X_train, y_train)
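Once fitted, the pipeline is used exactly like a single estimator. As a minimal follow-up sketch (not part of the original example), prediction and scoring on the held-out split might look like this; score here is plain accuracy from LogisticRegression:

# the scaler reuses the means/variances learned during fit,
# so X_test needs no manual preprocessing
y_preds = pipeline.predict(X_test)
print("test accuracy:", pipeline.score(X_test, y_test))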
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer
from sklearn.metrics import f1_score
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.datasets import fetch_20newsgroups
from sklearn.metrics import classification_report
cats = ['alt.atheism', 'sci.space']
newsgroups_train = fetch_20newsgroups(subset='train', categories=cats)
newsgroups_test = fetch_20newsgroups(subset='test', categories=cats)
X_train = newsgroups_train.data
X_test = newsgroups_test.data
y_train = newsgroups_train.target
y_test = newsgroups_test.target
vect = CountVectorizer()    # vector of raw word counts for each document
tfidf = TfidfTransformer()  # reweights the counts with TF-IDF, compensating for CountVectorizer's raw-frequency bias
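# (note: TfidfVectorizer combines CountVectorizer and TfidfTransformer in a single step)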
# this is a linear SVM classifier
clf = LinearSVC()
pipeline = Pipeline([
    ('vect', vect),
    ('tfidf', tfidf),
    ('clf', clf)
])
# call fit as you would on any classifier
pipeline.fit(X_train, y_train)
# predict test instances
y_preds = pipeline.predict(X_test)
# calculate f1
mean_f1 = f1_score(y_test, y_preds, average='micro')
print(classification_report(y_test, y_preds))
print("mean_f1 :", mean_f1)
>>>
              precision    recall  f1-score   support

           0       0.95      0.99      0.97       307
           1       0.99      0.96      0.98       406

    accuracy                           0.97       713
   macro avg       0.97      0.98      0.97       713
weighted avg       0.98      0.97      0.97       713

mean_f1 : 0.9747545582047685
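Since a pipeline behaves like any single estimator, it can also be handed directly to cross-validation utilities. A minimal sketch, reusing the pipeline and training data above (the fold count is an arbitrary choice); each fold re-fits the vectorizer on that fold's training portion, which avoids leaking test vocabulary:

from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of the entire pipeline on the training set
scores = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='f1_micro')
print("cv f1_micro: %.4f (+/- %.4f)" % (scores.mean(), scores.std()))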
Hyperparameter search with GridSearchCV can be combined with a pipeline in the same way, as the code below shows. Personally, I think Pipeline's biggest advantage is this kind of visibility.
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import f1_score, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.datasets import fetch_20newsgroups
cats = ['alt.atheism', 'sci.space']
newsgroups_train = fetch_20newsgroups(subset='train', categories=cats)
newsgroups_test = fetch_20newsgroups(subset='test', categories=cats)
X_train = newsgroups_train.data
X_test = newsgroups_test.data
y_train = newsgroups_train.target
y_test = newsgroups_test.target
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', LinearSVC())
])
# this is where you define the values for GridSearchCV to iterate over;
# LinearSVC's l1 penalty requires dual=False, so it gets its own grid
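# keys use the '<step name>__<parameter>' convention,
# e.g. 'vect__max_df' targets max_df on the CountVectorizer step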
param_grid = [
    {
        'vect__max_df': [0.8, 0.9, 1.0],
        'clf__penalty': ['l2'],
        'clf__dual': [True, False]
    },
    {
        'vect__max_df': [0.8, 0.9, 1.0],
        'clf__penalty': ['l1'],
        'clf__dual': [False]
    }
]
# do 3-fold cross-validation for each of the 9 valid
# combinations of the parameter values above
grid = GridSearchCV(pipeline, cv=3, param_grid=param_grid, scoring='f1_micro')
grid.fit(X_train, y_train)
# summarize results
print("Best: %f using %s" % (grid.best_score_,
grid.best_params_))
# print("result : \n", grid.cv_results_ )
means = grid.cv_results_['mean_test_score']
stds = grid.cv_results_['std_test_score']
params = grid.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
print("%f (%f) with: %r" % (mean, stdev, param))
# now train and predict test instances
# using the best configs
pipeline.set_params(clf__penalty='l2', vect__max_df=0.9, clf__dual=True)
pipeline.fit(X_train, y_train)
y_preds = pipeline.predict(X_test)
# calculate f1
print(classification_report(y_test, y_preds))
print(f1_score(y_test, y_preds, average='micro'))
>>>
Best: 0.992546 using {'clf__dual': True, 'clf__penalty': 'l2', 'vect__max_df': 0.9}
0.990681 (0.001311) with: {'clf__dual': True, 'clf__penalty': 'l2', 'vect__max_df': 0.8}
0.992546 (0.001309) with: {'clf__dual': True, 'clf__penalty': 'l2', 'vect__max_df': 0.9}
0.990681 (0.001311) with: {'clf__dual': True, 'clf__penalty': 'l2', 'vect__max_df': 1.0}
0.990681 (0.001311) with: {'clf__dual': False, 'clf__penalty': 'l2', 'vect__max_df': 0.8}
0.992546 (0.001309) with: {'clf__dual': False, 'clf__penalty': 'l2', 'vect__max_df': 0.9}
0.990681 (0.001311) with: {'clf__dual': False, 'clf__penalty': 'l2', 'vect__max_df': 1.0}
0.971112 (0.003459) with: {'clf__dual': False, 'clf__penalty': 'l1', 'vect__max_df': 0.8}
0.972051 (0.008202) with: {'clf__dual': False, 'clf__penalty': 'l1', 'vect__max_df': 0.9}
0.971115 (0.004719) with: {'clf__dual': False, 'clf__penalty': 'l1', 'vect__max_df': 1.0}
              precision    recall  f1-score   support

           0       0.96      0.99      0.97       308
           1       0.99      0.97      0.98       405

    accuracy                           0.98       713
   macro avg       0.97      0.98      0.98       713
weighted avg       0.98      0.98      0.98       713

0.9761570827489481
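A small note on the refit step above: with refit=True (the default), GridSearchCV already retrains the best configuration on the full training set, so the manual set_params / fit call can be replaced by the fitted grid itself. A minimal sketch:

# grid.best_estimator_ is a pipeline refitted on all of X_train
# with the best parameters found by the search
best_pipeline = grid.best_estimator_
y_preds = best_pipeline.predict(X_test)  # equivalent to grid.predict(X_test)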