머신러닝 모델링(Machine Learning Modeling) 과정

예린·2024년 3월 30일

머신러닝

목록 보기

5/7

1. 환경 준비

# 라이브러리 불러오기
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# Suppress all warnings so the output stays uncluttered
warnings.filterwarnings('ignore')

# Load the dataset from the CSV file into a DataFrame
path = 'titanic.csv'
data = pd.read_csv(filepath_or_buffer=path)

2. 데이터 수집 및 이해

# Look at the first few rows
data.head()

# Look at the last few rows
data.tail()

# Inspect the variables (columns)
data.info()

# Basic descriptive statistics
data.describe()

# Correlations between variables
# numeric_only=True: report correlations for the numeric columns only
data.corr(numeric_only=True)

# Visualize the correlations as a heatmap
heatmap_style = {
    'annot': True,              # print the value in each cell
    'fmt': '.2f',               # show values with two decimal places
    'annot_kws': {'size': 9},   # annotation keywords: font size
    'cmap': 'Blues',            # colour map
    'cbar': False,              # hide the colour bar on the right
    'square': True,             # draw square cells
}
sns.heatmap(data.corr(numeric_only=True), **heatmap_style)
plt.show()

3. 데이터 전처리

변수 제거

# Remove variables
# Drop several columns at once. The DataFrame in this script is named
# `data` (there is no `titanic` variable), so the drop must target `data`.
drop_cols = ['Cabin', 'PassengerId', 'Name', 'Ticket']
data.drop(columns=drop_cols, inplace=True)

# Check the result
data.head()

결측치 처리

# Count the NaN values in each variable
# sum() defaults to axis=0, so rows are added up and the result is per column
data.isna().sum()

# Fill the missing values
# Filling with the string '0' would mix text into numeric columns and invent
# a fake '0' category in categorical ones. Instead:
#   - numeric columns     -> column median
#   - remaining columns   -> most frequent value (mode)
data.fillna(data.median(numeric_only=True), inplace=True)
data.fillna(data.mode().iloc[0], inplace=True)

x, y 분리

# Name of the target column
target = 'Survived'

# Separate the target (y) from the features (x)
y = data[target]  # same result as data.loc[:, target]
x = data.drop(columns=[target])

가변수화

# Columns to dummy-encode: Pclass, Sex, Embarked
data_columns = ['Pclass', 'Sex', 'Embarked']

# Dummy-encode the feature frame `x`, not `data`: `data` still contains the
# target column, and encoding it here would put the label back into the
# features (target leakage).
x = pd.get_dummies(x, columns=data_columns, drop_first=True, dtype=int)

# Check the result
x.head()

학습용, 평가용 데이터 분리

# Import the module
from sklearn.model_selection import train_test_split

# Split 7:3 (train : test)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)
# Equivalent: train_size=0.7 (70% of the rows are used for training).
# An integer test_size such as test_size=3 means 3 samples go to the test set;
# with the default shuffle=True they are chosen at random, not the last 3 rows.

4. 모델 학습

분류 문제인지, 회귀 문제인지에 따라 사용할 알고리즘과 평가 방법이 달라짐

# 1) Import
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix

# 2) Declare
# The target is a class label, so use the classifier imported in this block.
# (LinearRegression is never imported here and is a regression model.)
model = KNeighborsClassifier()

# 4) Fit
model.fit(x_train, y_train)

# 5) Predict
y_pred = model.predict(x_test)