[TIL_Carrotww] 33 - 22/10/19

rjgjfl·2022년 10월 19일
0

TIL

목록 보기
40/138
post-thumbnail

๐Ÿ“Carrotww์˜ ์ฝ”๋”ฉ ๊ธฐ๋ก์žฅ

๐Ÿงฒ Titanic Dataset์œผ๋กœ ์ธ๊ณต์ง€๋Šฅ ํ•™์Šต์‹œํ‚ค๊ธฐ

๐Ÿ” sklearn์„ ์‚ฌ์šฉํ•˜์—ฌ ๋ฐ์ดํ„ฐ ํ•™์Šต์‹œ์ผœ๋ณด๊ธฐ

위와 같은 화면에서 사용자의 얼굴 사진을 넣으면 아래의 정보를 입력하고 이미지는 성별과 나이를 탐색하는 다른 이미지 인식 모델을 가져와 사용, 사용자 정보를 입력받아 생존 여부를 알려주는 머신러닝을 적용한 장고 프로젝트를 진행중이다.

from pdb import post_mortem
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import display
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split #training and testing data split
from sklearn import metrics #accuracy measure
import torch

plt.style.use('fivethirtyeight')
import warnings
warnings.filterwarnings('ignore')

# Read the Titanic training and test CSV files into DataFrames.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ('/titanic/train.csv', '/titanic/test.csv')
)

# DataFrame.head() returns only the first 5 rows; uncomment to preview them.
# print(train_data.head())

# Build, for every column of each frame, a message with the percentage of
# missing values. The message text is Korean for
# "column <name>    ratio of missing data : <pct>%"; the literal had been
# corrupted by a wrong character decoding and is restored here.
# The messages are only built, never shown: uncomment the print calls to
# display them.
for col in train_data.columns:
    msg = '항목 {:>10}\t 비어있는 자료의 비율 : {:.2f}%'.format(
        col, 100 * train_data[col].isnull().mean())
    # print(msg)

for col in test_data.columns:
    msg = '항목 {:>10}\t 비어있는 자료의 비율 : {:.2f}%'.format(
        col, 100 * test_data[col].isnull().mean())
    # print(msg)

# Number of missing values in each column of train_data. As a bare
# expression this only displays something in an interactive session; use the
# print below when running as a script.
train_data.isnull().sum()
# print(train_data.isnull().sum())

# Derive an 'Initial' column holding the run of letters that precedes a '.'
# in each Name (the honorific, e.g. "Mr" or "Miss"), then fold the rarer
# values into five groups: Mr, Mrs, Miss, Master (left untouched) and Other.
#
# The pattern is a raw string so that `\.` reaches the regex engine as an
# escaped dot instead of being an invalid Python string escape, and
# expand=False makes str.extract return a Series rather than a one-column
# DataFrame.
# NOTE(review): values not listed in the mapping are kept exactly as
# extracted.
_TITLE_GROUPS = {
    'Mlle': 'Miss',
    'Mme': 'Miss',
    'Ms': 'Miss',
    'Dr': 'Mr',
    'Major': 'Mr',
    'Lady': 'Mrs',
    'Countess': 'Mrs',
    'Jonkheer': 'Other',
    'Col': 'Other',
    'Rev': 'Other',
    'Capt': 'Mr',
    'Sir': 'Mr',
    'Don': 'Mr',
}

train_data['Initial'] = train_data['Name'].str.extract(r'([A-Za-z]+)\.', expand=False)
test_data['Initial'] = test_data['Name'].str.extract(r'([A-Za-z]+)\.', expand=False)

# Assign the result back instead of calling replace(..., inplace=True) on the
# selected column: that form is chained assignment, which newer pandas warns
# about and which does not update the frame under Copy-on-Write.
train_data['Initial'] = train_data['Initial'].replace(_TITLE_GROUPS)
test_data['Initial'] = test_data['Initial'].replace(_TITLE_GROUPS)
# Mean Age per 'Initial' group. As a bare expression this only displays
# something in an interactive session; uncomment the print for script runs.
train_data.groupby('Initial')['Age'].mean()
# print(train_data.groupby('Initial')['Age'].mean())

# Fill every missing Age with a fixed value chosen by the passenger's
# 'Initial' group; the same constants are applied to both frames
# (presumably rounded from the group means above - confirm).
for frame in (train_data, test_data):
    for title, fill_age in (('Mr', 33), ('Mrs', 36), ('Master', 5),
                            ('Miss', 22), ('Other', 46)):
        needs_fill = frame.Age.isnull() & (frame.Initial == title)
        frame.loc[needs_fill, 'Age'] = fill_age

# Check whether any Age is still missing (the results are discarded when
# this runs as a script).
train_data.Age.isnull().any()
test_data.Age.isnull().any()
# Rows with no recorded Embarked value get 'S'. The result is assigned back
# instead of calling fillna(..., inplace=True) on the selected column: that
# form is chained assignment, which newer pandas warns about and which does
# not update the frame under Copy-on-Write.
train_data['Embarked'] = train_data['Embarked'].fillna('S')

# Bucket Age into five ordinal bands:
#   0: Age <= 16, 1: (16, 32], 2: (32, 48], 3: (48, 64], 4: Age > 64
# A row whose Age falls in no band (e.g. a missing Age) keeps the initial 0.
train_data['Age_band'] = 0
age_edges = [float('-inf'), 16, 32, 48, 64, float('inf')]
for band, (lower, upper) in enumerate(zip(age_edges, age_edges[1:])):
    within = (train_data['Age'] > lower) & (train_data['Age'] <= upper)
    train_data.loc[within, 'Age_band'] = band
# Preview of the first rows (only displayed in an interactive session).
train_data.head()

# Family_Size is the sum of the Parch and SibSp columns.
# NOTE(review): the previous comment here said "family size max=4"; nothing
# in this script enforces or checks such a cap.
train_data['Family_Size'] = train_data['Parch'] + train_data['SibSp']

# Alone is 1 when Family_Size is 0 and 0 otherwise.
train_data['Alone'] = (train_data['Family_Size'] == 0).astype('int64')

# Encode the remaining categorical columns as integers. The results are
# assigned back instead of calling replace(..., inplace=True) on a selected
# column: that form is chained assignment, which newer pandas warns about and
# which does not update the frame under Copy-on-Write. Series.map also yields
# an integer column directly instead of relying on object-dtype downcasting.
train_data['Sex'] = train_data['Sex'].map({'male': 0, 'female': 1})
train_data['Embarked'] = train_data['Embarked'].map({'S': 0, 'C': 1, 'Q': 2})

# Drop everything that is not used as a model input. 'Initial' is removed
# here as well, so it is no longer integer-encoded first (that encoding was
# discarded immediately by this drop).
train_data.drop(
    columns=['Name', 'Age', 'Ticket', 'Cabin', 'PassengerId',
             'SibSp', 'Parch', 'Initial'],
    inplace=True,
)

# Hold out 30% of the rows for evaluation, stratified on the Survived label
# so both parts keep the same class balance; random_state fixes the shuffle.
train, test = train_test_split(
    train_data,
    test_size=0.3,
    random_state=0,
    stratify=train_data['Survived'],
)

# Select the label by name instead of by position: this no longer depends on
# 'Survived' being the first column, and the targets are 1-D Series (as Y
# already was) rather than one-column DataFrames, which scikit-learn treats
# as column vectors and warns about.
train_X = train.drop(columns='Survived')
train_Y = train['Survived']
test_X = test.drop(columns='Survived')
test_Y = test['Survived']

# Full feature matrix / label vector over all rows of train_data.
X = train_data.drop(columns='Survived')
Y = train_data['Survived']

# Fit a logistic-regression classifier on the training split and score it on
# the held-out split. max_iter is raised above the default so the solver has
# room to converge; this has no effect if it already converges earlier.
model = LogisticRegression(max_iter=1000)
# .values.ravel() hands the estimator a 1-D label array whether train_Y is a
# Series or a one-column DataFrame.
model.fit(train_X, train_Y.values.ravel())
prediction3 = model.predict(test_X)

# One hand-written sample in the column order of train_X.
# NOTE(review): assuming the usual Kaggle column order, the features are
# (Pclass, Sex, Fare, Embarked, Age_band, Family_Size, Alone); under that
# reading Family_Size=1 together with Alone=1 is contradictory - confirm.
test_x = [[1, 0, 10.0000, 1, 1, 1, 1]]

# accuracy_score takes (y_true, y_pred) in that order.
print('The accuracy of the Logistic Regression is',
      metrics.accuracy_score(test_Y.values.ravel(), prediction3))

# Predict once and reuse the result. Wrapping the sample in a DataFrame with
# the training column names matches the input the model was fitted on.
test_test = model.predict(pd.DataFrame(test_x, columns=train_X.columns))
print(test_test)

https://welcome-to-dewy-world.tistory.com/4?category=913368
위 코드를 참조하여 작성하였다. 모든 줄의 실행 결과와 해당 코드를 왜 사용하였는지는 직관적으로 알 수 있다. 개인적으로 이미지 처리하는 코드는 2차원 배열, 3차원까지 다루며 코드가 직관적이지 않아 이해하기 너무 힘들었지만 이미지 처리가 아닌 데이터 학습을 시키는 것은 간단한 방법이면 나름 이해하기가 쉽다.
케글 추천수 10000개의 학습법은 주말에 시간이 되면 해봐야겠다.
위 코드와 정확도가 4퍼센트정도 차이가 나며 프로젝트를 완성하고 나면 코드를 하나하나 다시 뜯어봐야겠다.

개인적으로 처음에 C++을 공부하듯 엄청 막막했는데 데이터 학습을 시키며 결과가 나오는게 너무 신기했다.

test_x 의 데이터는 프로젝트에서 사용할 데이터 전달 방식이다. 해당 배열을 입력받아 그대로 넣어줄 것이다.
편의상 학습에 영향을 많이 주지만 재미로 해보는 테스트이기에 항구(탑승지)는 한국 항구로 임의로 정하였다.

🧲 Algorithm

🔍 문제가 안풀린다 ㅠㅠㅠ 시간도 넉넉치 않아 못했다 ㅠㅠ 일주일 할당치를 못 풀 것 같다... 금요일 오전까지 프로젝트가 진행되니 금요일과 주말에 많이 풀어야겠다...

0개의 댓글