작업 2유형 : 2회차

SOOYEON·2022년 6월 24일
0

빅데이터분석기사

목록 보기
35/36
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score,classification_report
from sklearn.ensemble import RandomForestClassifier


# Preprocessing
# 'age' is stored as text with stray '*' characters in train; strip them
# literally (regex=False: a bare '*' is not a valid regular expression, and the
# default of `regex` differs between pandas versions) and cast to int.
# NOTE(review): the same cleaning is applied to test when its 'age' column is
# non-numeric, so both frames encode 'age' the same way -- confirm against the
# actual test data.
for frame in (train, test):
    if not pd.api.types.is_numeric_dtype(frame['age']):
        frame['age'] = frame['age'].str.replace('*', '', regex=False).astype('int')

# Impute missing BMI with the training mean in both frames so that no
# statistic is derived from the test set.
bmi_mean = train['bmi'].mean()
train['bmi'] = train['bmi'].fillna(bmi_mean)
test['bmi'] = test['bmi'].fillna(bmi_mean)

# Feature matrix (one-hot encoded) and target.
x = train.drop(columns=['id', 'stroke'])
xd = pd.get_dummies(x)
y = train['stroke']


# Training
# Hold out a stratified validation split (named x_test / y_test here) so the
# class ratio of 'stroke' is preserved in both parts.
x_train, x_test, y_train, y_test = train_test_split(
    xd, y, stratify=y, random_state=1
)

# Seed the forest as well: the split is already seeded, so without this the
# reported score and the submitted probabilities would change on every run.
rf = RandomForestClassifier(random_state=1)
rf.fit(x_train, y_train)

# Column 1 of predict_proba is the probability of the positive class.
pred = rf.predict_proba(x_test)
print('test roc score : ', roc_auc_score(y_test, pred[:, 1]))


# After one-hot encoding, some dummy columns may exist only in the training
# set. Reindex the encoded test frame onto the training columns: missing
# columns are created and filled with 0, columns unknown to the model are
# dropped, and the column order matches what the model was fitted on.
test_preprocessing = pd.get_dummies(test.drop(columns=['id'])).reindex(
    columns=x_train.columns, fill_value=0
)
test_pred = rf.predict_proba(test_preprocessing)

# Submission file: change the predicted values and the examinee number in the
# file name to your own. 'stroke' holds the positive-class probability.
submission = pd.DataFrame({'id': test['id'], 'stroke': test_pred[:, 1]})
submission.to_csv('003000000.csv', index=False)

0개의 댓글