from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score,classification_report
from sklearn.ensemble import RandomForestClassifier
# --- Feature cleaning --------------------------------------------------------
# `age` in the training frame is text: strip every literal '*' character and
# convert the result to int.
# regex=False is explicit because '*' on its own is not a valid regular
# expression and the default of `regex` differs between pandas versions.
train['age'] = train['age'].str.replace('*', '', regex=False).astype('int')

# Apply the same cleaning to the test frame when its `age` column is not
# already numeric, so both frames feed the model an integer `age` feature.
# NOTE(review): assumes non-numeric test ages use the same '*' marker as the
# training data - confirm against the raw file.
if 'age' in test.columns and not pd.api.types.is_numeric_dtype(test['age']):
    test['age'] = test['age'].str.replace('*', '', regex=False).astype('int')

# Impute missing `bmi` with the *training* mean in both frames: the test set
# must be transformed with the statistics the model was fitted on, not with
# statistics computed from the test set itself.
bmi_mean = train['bmi'].mean()
train['bmi'] = train['bmi'].fillna(bmi_mean)
test['bmi'] = test['bmi'].fillna(bmi_mean)
# --- Model training and hold-out validation ---------------------------------
# Features are every column except the row identifier and the target;
# get_dummies one-hot encodes the non-numeric columns.
x = train.drop(columns=['id', 'stroke'])
xd = pd.get_dummies(x)
y = train['stroke']

# Stratify on the target so both partitions keep the same class proportions.
x_train, x_test, y_train, y_test = train_test_split(
    xd, y, stratify=y, random_state=1
)

# Seed the forest as well as the split; otherwise the printed score and the
# downstream predictions change on every run.
rf = RandomForestClassifier(random_state=1)
rf.fit(x_train, y_train)

# predict_proba returns one column per class; column 1 is used as the
# positive-class score for ROC AUC.
pred = rf.predict_proba(x_test)
print('test roc score : ', roc_auc_score(y_test, pred[:, 1]))
# --- Test-set scoring and submission file -----------------------------------
# One-hot encode the test features (everything except the row identifier).
test_preprocessing = pd.get_dummies(test.drop(columns=['id']))

# Align the test matrix with the training feature layout in one step:
# columns missing from the test encoding are created and filled with 0,
# columns unknown to the model are dropped, and the order matches x_train.
test_preprocessing = test_preprocessing.reindex(
    columns=x_train.columns, fill_value=0
)

# Column 1 of predict_proba is written out as the `stroke` score.
test_pred = rf.predict_proba(test_preprocessing)
pd.DataFrame({'id': test['id'], 'stroke': test_pred[:, 1]}).to_csv(
    '003000000.csv', index=False
)