# Load the CSV file at the hard-coded drive-letter path into a DataFrame.
wbcd = pd.read_csv("c://data//wisc_bc_data.csv")
# Print a summary of the columns (names, dtypes, non-null counts).
wbcd.info()
# (rows, columns) tuple; as a bare expression it is only shown in an
# interactive session and has no effect when run as a plain script.
wbcd.shape
# Descriptive statistics of the numeric columns; likewise only shown
# interactively.
wbcd.describe()
def outlier_value(x, multiplier=5):
    """Print how many extreme values each float64 column of ``x`` contains.

    A value is counted as an outlier when it lies more than ``multiplier``
    times the interquartile range (IQR) above the third quartile or below
    the first quartile of its column.

    Args:
        x: pandas DataFrame to scan. Only columns whose dtype is exactly
            ``float64`` are examined; all other columns are skipped.
        multiplier: Width of the outlier fence in IQR units. Defaults to 5,
            the value that used to be hard-coded.

    Returns:
        None. One line per examined column is printed in the form
        ``<column name> <outlier count>``.
    """
    for col in x.columns[x.dtypes == 'float64']:
        values = x[col]
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        fence = (q3 - q1) * multiplier
        # NaN compares False on both sides, so missing values are never
        # counted as outliers.
        is_outlier = (values > q3 + fence) | (values < q1 - fence)
        print(col, values[is_outlier].count())
# Print, for every float64 column, the number of values lying far outside
# the interquartile range (see outlier_value for the exact fence).
outlier_value(wbcd)
# Per-column count of missing values; as a bare expression it is only shown
# in an interactive session.
wbcd.isnull().sum()
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

# Feature table: every column from the third one onward.
# NOTE(review): presumably the first two columns are an identifier and the
# 'diagnosis' label - confirm against the CSV header.
wbcd2 = wbcd.iloc[:, 2:]
y = wbcd['diagnosis'].to_numpy()

# Split BEFORE scaling. Fitting the scaler on the full table lets the
# test rows' min/max influence the training features (data leakage) and
# makes the measured test accuracy optimistic. The row partition depends only
# on the number of rows and random_state, so the same rows land in each split
# as when the already-scaled array was passed in.
x_train_raw, x_test_raw, y_train, y_test = train_test_split(
    wbcd2, y, test_size=0.1, random_state=1)

# Learn the per-column min/max from the training rows only, then apply the
# same transformation to both splits. Test values may therefore fall slightly
# outside [0, 1]; that is expected.
scaler = MinMaxScaler()
scaler.fit(x_train_raw)
x_train = scaler.transform(x_train_raw)
x_test = scaler.transform(x_test_raw)

# Scaled copy of the whole feature table, kept under its original names for
# inspection; it is no longer the input to the split.
df_scaled = scaler.transform(wbcd2)
wbcd3 = pd.DataFrame(df_scaled, columns=wbcd2.columns)
wbcd3
x = df_scaled

print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)
from sklearn.neighbors import KNeighborsClassifier as KN
# Fit a 5-nearest-neighbours classifier on the training split and predict
# the labels of the test split.
model = KN(n_neighbors=5)
model.fit(x_train, y_train)
result = model.predict(x_test)
# Accuracy in percent. Divide by the actual number of test samples rather
# than a hard-coded 57, so the figure stays correct if the data set or the
# test_size changes.
sum(y_test == result) / len(y_test) * 100
from sklearn.metrics import accuracy_score
# Fraction of test samples whose predicted label matches the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=result)
# Bare expression: only displayed in an interactive session.
accuracy
from sklearn.metrics import confusion_matrix
# Flatten the 2x2 confusion matrix row by row into its four cells.
# NOTE(review): which class counts as "positive" follows scikit-learn's
# sorted label order - confirm it is the class of interest.
tn, fp, fn, tp = confusion_matrix(y_true=y_test, y_pred=result).ravel()
print(tn, fp, fn, tp)
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
# Per-k results collected by the search loop below.
knn_num = []   # neighbour count tried
accu_num = []  # test accuracy, rounded to 2 decimals
fn_num = []    # false negatives on the test split

# Try odd neighbour counts only (step 2; presumably to avoid tied votes).
# The upper bound follows the training-set size, because n_neighbors may not
# exceed the number of fitted samples; with 512 training rows this is the
# same 1..511 range as the previous hard-coded 513.
# NOTE(review): k is being selected on the test split, so the reported
# accuracy is optimistic - consider a validation split or cross-validation.
for i in range(1, len(x_train) + 1, 2):
    model = KN(n_neighbors=i)
    model.fit(x_train, y_train)
    result = model.predict(x_test)
    accuracy = accuracy_score(y_test, result)
    tn, fp, fn, tp = confusion_matrix(y_test, result).ravel()
    knn_num.append(i)
    accu_num.append(round(accuracy, 2))
    fn_num.append(fn)

# After the loop, model/result/accuracy hold the values of the LAST k tried,
# not of the best k selected below.
dt = pd.DataFrame({'knn': knn_num, '정확도': accu_num, 'FN': fn_num})

# Selection order: fewest false negatives first, then the highest accuracy
# among those rows, then the smallest k that achieves both.
min_fn = dt.FN.min()
has_min_fn = dt['FN'] == min_fn
정확도 = dt['정확도'][has_min_fn].max()
knn = dt['knn'][(dt['정확도'] == 정확도) & has_min_fn].min()
print('가장 작은 fn값: ',min_fn, '\n정확도: ',정확도,'\n가장 작은 knn값: ',knn)