Python Cheat Sheet

dpwl·2024년 4월 27일

Data Analysis python zerobase 데이터 분석 제로베이스 파이썬

Data Analysis

목록 보기

67/83

1. 데이터 핸들링

1.1 Data 형태 확인

import pandas as pd

# Data 형태 확인 (모든 행과 열)
df.shape

# 각 컬럼에 대한 data type 확인
df.info()

# 각 컬럼에 대한 Null 값 확인
df.isnull().sum()

1.2 Unique한 Value별 카운팅

len(df['col'].unique())

df['col'].unique()

1.3 DataFrame 특정값 치환

import numpy as np
import pandas as pd

# Replace the sentinel value -200 with NaN. np.nan is the canonical spelling;
# the np.NaN alias was removed in NumPy 2.0.
df.replace(-200, np.nan)

1.4 Null 값 이전 값으로 채워넣기

import numpy as np

# Fill each null with the previous valid value in the same column.
# DataFrame.ffill() replaces the deprecated fillna(method='ffill') form.
df.ffill()

1.5 DataFrame 특정 column만 가져오기

import pandas as pd

df = df[['col1', 'col2']]

1.6 조건에 맞는 DataFrame 출력

import pandas as pd

df[(df['T'] >= 25) & (df['T'] <= 27)]

1.7 오름차순, 내림차순 정렬

import pandas as pd

# Sort a single column: ascending=False is descending, the default is ascending.
df['col'].sort_values(ascending=False)

# Sort the whole DataFrame by the values of one column. `by` is only accepted
# by DataFrame.sort_values; passing it to Series.sort_values raises TypeError.
df.sort_values(by='col')

1.8 특정값이 포함된 Data 찾기

df[df['col'].astype(str).str.contains('text')]

1.9 특정 조건 만족하는 값, 변경하기

import numpy as np

np.where(df['col'] <= 5, 1, 0)

1.10 groupby 활용 카운팅

import numpy as np
import pandas as pd

df['y'].groupby(df['job']).value_counts()

df['y'].value_counts()

1.11 pivot table 활용 데이터 처리

import pandas as pd

df_job = pd.pivot_table(df_job,        # 피벗할 데이터프레임
                     index='index',    # 행 위치에 들어갈 열
                     columns='col',    # 열 위치에 들어갈 열
                     values='value')   # 데이터로 사용할 열

1.12 inf(무한대) 데이터 null 처리

import numpy as np
import pandas as pd

df['col'].replace([np.inf, -np.inf], np.nan)
# np.inf(양의 무한대)와 -np.inf(음의 무한대)는 null 값으로 변환

1.13 lag 데이터 생성

import numpy as np
import pandas as pd

# + n : 순방향, - n : 역방향
df['col'].shift(1)

shift() 메서드는 데이터프레임의 행을 위나 아래로 이동시키는 역할을 한다. 기본적으로는 아래로 이동하며, 인자를 사용하여 이동하는 행의 수를 지정할 수 있다.

shift(1): 이는 데이터프레임의 행을 아래로 한 칸씩 이동시킨다. 즉, 각 행의 데이터가 바로 아래 행으로 이동한다.

shift(-1): 이는 데이터프레임의 행을 위로 한 칸씩 이동시킨다. 즉, 각 행의 데이터가 바로 위 행으로 이동한다.

1.14 중복 데이터 처리

import numpy as np
import pandas as pd

df.drop_duplicates(['col'], keep='first', inplace=True)
# keep='first': 중복된 데이터의 첫번째꺼 남기기
# keep='last': 중복된 데이터의 마지막꺼 남기기

1.15 문자열 데이터 앞 공백 제거

import numpy as np
import pandas as pd

df['col'].str.lstrip()

str.lstrip() 메서드는 문자열의 왼쪽(처음)에서 지정된 문자들을 제거하는 역할을 한다. 이 메서드는 문자열의 왼쪽에서부터 시작하여 지정된 문자들이 나타나지 않을 때까지 문자들을 제거한다.

예를 들어, " Hello "라는 문자열이 있다고 가정해보자. 이 문자열에 str.lstrip()을 적용하면 문자열의 왼쪽에 있는 공백이 모두 제거된다. 즉, "Hello "가 반환된다.

1.16 날짜 데이터 형식 변경

import datetime

df["Date"].dt.strftime("%Y-%m")

dt.strftime("%Y-%m")은 시계열 데이터의 각 날짜를 "년-월" 형식의 문자열로 변환하는 역할을 한다. 이는 날짜나 시간 데이터를 원하는 형식의 문자열로 표현할 때 사용된다.

dt: 이는 datetime 속성(attribute)으로, datetime 시계열 데이터의 메서드들을 제공한다.
strftime(): 이 메서드는 datetime 객체를 지정된 포맷 문자열에 따라 문자열로 변환한다. %Y는 연도를 나타내는 네 자리 숫자로, %m은 월을 나타내는 두 자리 숫자로 표현한다.

1.17 list 중복 없애기

import pandas as pd

all_list = list(df['start']) + list(df['end'])
unique_list = set(all_list)

set() 함수는 파이썬에서 집합(set)을 생성하는 데 사용된다. 집합은 중복된 요소를 허용하지 않고, 순서가 없는 자료구조이다.
# A set literal builds the set directly, without an intermediate list.
my_set = {1, 2, 3, 4, 5}
print(my_set)
# Output: {1, 2, 3, 4, 5}  (a set prints with braces; element order is not guaranteed)

2. 데이터 시각화

2.1 Numeric(연속형) 변수 분포 확인

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
plt.style.use(['dark_background'])

# displot 활용 분포 그리기
sns.displot(df['col']);

# 분포의 평균도 같이 출력
print("col :", df['col'].mean())

displot()은 seaborn 라이브러리에서 제공하는 분포 시각화 함수이다. 기본적으로는 히스토그램만 그리며, kde=True 옵션을 지정하면 커널 밀도 추정(Kernel Density Estimation, KDE) 곡선을 히스토그램과 함께 보여준다.

커널 밀도 추정은 데이터의 분포를 부드러운 곡선으로 나타내는 방법으로, 히스토그램의 막대와 함께 데이터의 밀도를 더 자세하게 표현한다.
# "sepal_length" 열을 추출하여 해당 데이터의 히스토그램과 KDE를 함께 그리기
sns.displot(data['sepal_length'], kde=True)
plt.show()

2.2 Plot size 조절

import matplotlib.pyplot as plt

# (20, 5) → 가로 inch, 세로 inch
plt.gcf().set_size_inches(20, 5)

gcf() 함수는 "Get Current Figure"의 약자로 Matplotlib 라이브러리에서 현재 활성화된 figure(그래프)를 반환한다. 따라서 gcf().set_size_inches()를 사용하면 현재 활성화된 그래프의 크기를 설정할 수 있다.

2.3 산점도(Scatter plot) 그리기

import seaborn as sns

# x(가로), y(세로), hue(구분자)
sns.scatterplot(x=df['x'], y=df['y'], hue=df['hue'], data=df)

2.4 기본 line plot 그리기

import matplotlib.pyplot as plt

plt.plot(df['x'], df['y'], label='label')

2.5 for문 활용 distplot 다중 출력

import matplotlib.pyplot as plt

# Plot the distribution of the columns at positions 1..12 on a 3x4 grid.
for i in range(1, 13):
    plt.subplot(3, 4, i)
    plt.grid(False)
    # histplot(kde=True, stat='density') is the replacement for the deprecated
    # sns.distplot: a density-normalized histogram with a KDE curve on top.
    sns.histplot(df.iloc[:, i], kde=True, stat='density')

plt.gcf().set_size_inches(20, 10)
plt.tight_layout()
plt.show()

2.6 이중 축 그래프 그리기

import matplotlib.pyplot as plt

# 새로운 figure와 서브플롯 생성
fig, ax1 = plt.subplots()
ax1.plot(df['x'], df['y'], color='green', label='label1')

# ax1과 동일한 x축을 공유하면서 새로운 y축을 추가한 서브플롯 생성
ax2 = ax1.twinx()
ax2.plot(df['x'], df['y'], color='deeppink', label='label2')

fig.legend()
plt.gcf().set_size_inches(25, 5)
plt.show()

plt.subplots() 함수는 Matplotlib에서 새로운 그래프(figure)와 하나의 서브플롯(subplot)을 생성한다.

fig 변수는 생성된 figure 객체를 나타낸다.

ax1 변수는 생성된 서브플롯(subplot) 객체를 나타낸다.

예를 들어, plt.subplots(2, 2)와 같이 사용하면 2x2 그리드 형태의 4개의 서브플롯이 있는 figure를 생성할 수 있다.

ax1.twinx()는 Matplotlib에서 하나의 서브플롯(ax1)과 동일한 x축을 공유하면서 새로운 y축을 추가한 서브플롯(ax2)을 생성하는 메서드이다. 즉, 하나의 서브플롯에서 x축은 공유하고, y축은 독립적으로 사용할 수 있게 된다.

2.7 pairplot 상관관계 분석

import seaborn as sns

# 모든 변수 조합에 관한 Scatter plot
df_pair = df[['col1', 'col2', 'col3', 'col4']]
sns.pairplot(df_pair)
plt.show()

2.8 Heat map 상관관계 분석

import seaborn as sns

# 모든 조합, 상관계수 표현
df_pair = df[['col1', 'col2', 'col3', 'col4']]
sns.heatmap(df_pair.corr(), vmin=-1, vmax=+1, annot=True, cmap='coolwarm');

heatmap() 함수는 seaborn 라이브러리에서 제공하는 히트맵(Heatmap)을 생성하는 함수이다. 히트맵은 데이터의 행과 열을 격자 형태로 나타내고, 각 격자 셀의 색상을 데이터의 값에 따라 다르게 표현하여 데이터 패턴을 시각화하는 데 사용된다.

data: 히트맵을 생성할 데이터를 지정한다. 일반적으로는 2차원 데이터(예: 데이터프레임)를 사용한다.

vmin, vmax: 색상 맵의 최소값(minimum)과 최대값(maximum)을 지정한다. 데이터의 값 범위를 설정하여 색상의 대비를 조정할 수 있다.

annot: 각 격자 셀에 데이터 값을 표시할지 여부를 지정한다. 기본값은 False이며, True로 설정하면 데이터 값을 셀 위에 표시한다.

cmap: 사용할 색상 맵(colormap)을 지정한다. 예를 들어, 'coolwarm', 'viridis', 'magma', 'cividis' 등의 다양한 색상 맵을 사용할 수 있다.

2.9 그래프에 수직, 수평선 추가 및 길이 조절

import seaborn as sns
import matplotlib.pyplot as plt

sns.scatterplot(data=df, x='x', y='y', s=50, linewidth=0);

# 수직선 추가
plt.vlines(-2, ymin=-2, ymax=2, color='r', linewidth=2);
plt.vlines(2, ymin=-2, ymax=2, color='r', linewidth=2);

# 수평선 추가
plt.hlines(-2, xmin=-2, xmax=2, color='r', linewidth=2);
plt.hlines(2, xmin=-2, xmax=2, color='r', linewidth=2);

plt.vlines(x, ymin, ymax, color, linewidth): 주어진 x 좌표를 기준으로 수직선을 추가한다.

x: 수직선의 x 좌표를 나타낸다.

ymin: 수직선이 지나가는 y 좌표의 시작점을 나타낸다.

ymax: 수직선이 지나가는 y 좌표의 끝점을 나타낸다.

color: 수직선의 색상을 지정한다.

linewidth: 수직선의 두께를 지정한다.

plt.hlines(y, xmin, xmax, color, linewidth): 주어진 y 좌표를 기준으로 수평선을 추가한다.

y: 수평선의 y 좌표를 나타낸다.

xmin: 수평선이 지나가는 x 좌표의 시작점을 나타낸다.

xmax: 수평선이 지나가는 x 좌표의 끝점을 나타낸다.

color: 수평선의 색상을 지정한다.

linewidth: 수평선의 두께를 지정한다.

2.10 catplot 그리기

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use(['dark_background'])

sns.catplot(x="x", hue="y", kind="count", palette="pastel", edgecolor=".6", data=df);
plt.gcf().set_size_inches(25, 3)

catplot() 함수는 seaborn 라이브러리에서 제공하는 범주형 데이터를 시각화하기 위한 다목적 함수이다. catplot()은 여러 유형의 범주형 플롯을 생성할 수 있는데, 기본적으로 kind 매개변수를 통해 어떤 유형의 플롯을 그릴지 지정할 수 있다. 몇 가지 흔히 사용되는 플롯 유형에는 strip, swarm, box, violin, boxen, point, bar, count 등이 있다.

x, y: 데이터프레임에서 사용할 열의 이름을 지정한다. x는 x 축에 표시될 열을, y는 y 축에 표시될 열을 지정한다.

hue: 데이터를 분리하여 시각화할 때 사용할 열의 이름을 지정한다.

kind: 그래프의 유형을 지정한다. 여러 유형이 있으며, 주로 point, bar, count, box, violin, strip, swarm 등이 사용된다.

data: 사용할 데이터프레임을 지정한다.

palette: 색상 팔레트를 지정한다.

edgecolor: 그래프 요소의 테두리 선의 색상을 지정한다.

2.11 그래프 특정 값에 색상 입히기

import numpy as np
import matplotlib.pyplot as plt

df['vol_color'] = np.where(df['Volume_issue']==1, 'red', 'gray')
colors = list(df['vol_color'])
print(colors)

plt.figure(figsize=(10, 8))

plt.subplot(2,1,1)
plt.plot(df['Date'], df['Close'], 'o-', ms=1, lw=0.5, label='Close')
plt.legend()

plt.subplot(2,1,2)
plt.bar(df['Date'], df['Volume'], label='volume', color=colors)
plt.legend()

plot() 함수는 Matplotlib 라이브러리에서 가장 기본적인 그래프를 그리는 함수 중 하나이다. 이 함수를 사용하여 선 그래프나 마커만 표시한 점 그래프를 그릴 수 있다. 막대 그래프는 위 예시처럼 plt.bar() 함수로 그린다.

x, y: 그래프에 사용할 데이터이다. x는 x 축에 해당하는 데이터를, y는 y 축에 해당하는 데이터를 지정한다.

color: 그래프의 색상을 지정한다. 일반적으로 문자열('red', 'blue' 등) 또는 RGB 튜플((0.1, 0.2, 0.5) 등)로 지정된다.

linestyle: 그래프의 선 스타일을 지정한다. 예를 들어 '-', '--', '-.', ':' 등이 있으며, 각각 실선, 대시선, 대시-닷선, 점선을 나타낸다.

linewidth 또는 lw: 그래프의 선 두께를 지정한다.

marker: 데이터 포인트의 모양을 지정한다. 예를 들어 'o'는 원, '^'는 삼각형, 's'는 사각형 등이 있다.

markersize 또는 ms: 데이터 포인트의 크기를 지정한다.

label: 그래프에 레이블을 추가한다. 이 레이블은 범례에 표시된다.

alpha: 그래프의 투명도를 지정한다. 0에 가까울수록 투명하고, 1에 가까울수록 불투명하다.

2.12 bar plot 그리기

# barplot, order 옵션을 활용하여 가독성 Up
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
plt.style.use(['dark_background'])

sns.barplot(x='arrival_date_month', y='hotel', hue='arrival_date_year', data=df_reservation,
            order=['01.January', '02.February', '03.March', '04.April', '05.May', '06.June', '07.July', '08.August', '09.September', '10.October', '11.November', '12.December']);
plt.gcf().set_size_inches(20, 5);

3. 데이터 분석 및 모델링

3.1 Train/Test set 분할

# 모델링을 학습하기 위한 Feature(X)와 Y데이터를 구분하는 단계
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

X=df_merge.drop(['y'], axis=1)
Y=df_merge['y']

x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, stratify=Y)

print(x_train.shape)
print(y_train.shape)

print(x_test.shape)
print(y_test.shape)

train_test_split(): 이 함수는 데이터를 훈련 세트와 테스트 세트로 나누는 데 사용된다. 일반적으로 지도 학습 모델을 훈련시키고 평가하기 위해 사용된다. 훈련 세트는 모델을 학습시키는 데 사용되고, 테스트 세트는 모델의 성능을 평가하는 데 사용된다.

3.2 모델 학습 및 예측

from sklearn.ensemble import RandomForestClassifier

# 모델 학습
rfc = RandomForestClassifier(random_state=123456)
rfc.fit(x_train, y_train)

# 예측
# 예측은 학습에 사용된 Data와 Test Data 모두 예측하고 평가함(※ 과적합 여부 판별)
y_pred_train = rfc.predict(x_train)
y_pred_test = rfc.predict(x_test)

3.3 이진분류 모델 성능 확인

from sklearn.metrics import classification_report
print(classification_report(y_train, y_pred_train))
print(classification_report(y_test, y_pred_test))

3.4 하이퍼 파라미터 튜닝

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Candidate hyperparameter values explored by the grid search.
params = {
    'n_estimators': [400, 500],
    'max_depth': [6, 8, 10, 12],
}

# Build the RandomForestClassifier and run a 3-fold grid search scored by recall.
rf_clf = RandomForestClassifier(random_state=123456, n_jobs=-1)
grid_cv = GridSearchCV(rf_clf, param_grid=params, cv=3, n_jobs=-1, scoring='recall')
grid_cv.fit(x_train, y_train)

print('최적 하이퍼 파라미터: ', grid_cv.best_params_)
# best_score_ is the mean cross-validated score of the metric passed as
# `scoring` (recall here), so the label must say recall, not accuracy.
print('최고 재현율(Recall): {:.4f}'.format(grid_cv.best_score_))

3.5 중요 변수 파악(Feature Importance)

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
plt.style.use(['dark_background'])

# rfc → 생성한 Model에 name 기재
ftr_importances_values = rfc.feature_importances_
ftr_importances = pd.Series(ftr_importances_values, index = x_train.columns)
ftr_top20 = ftr_importances.sort_values(ascending=False)[:20]

plt.figure(figsize=(8,6))
plt.title('Feature Importances')
sns.barplot(x=ftr_top20, y=ftr_top20.index)
plt.show()

3.6 모델 Save & Read

import pickle

# Serialize the model into an in-memory bytes object. Nothing is written to
# disk here; to persist it, pickle.dump(model, f) with an opened file is needed.
saved_model = pickle.dumps(model)

# Restore the model from the pickled bytes.
# NOTE(review): only unpickle data from a trusted source; unpickling can
# execute arbitrary code.
model_from_pickle = pickle.loads(saved_model)

3.7 상관계수 값 출력

import scipy.stats as stats

stats.pearsonr(x=df['x'], y=df['y'])

3.8 Regressor(회귀) 모델 학습 및 평가

# 모델링을 학습하기 위한 Feature(X)와 Y데이터를 구분하는 단계
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics

X=df.drop(['y'], axis=1)
Y=df['y']

x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3)

print(x_train.shape)
print(y_train.shape)

print(x_test.shape)
print(y_test.shape)

# RandomForestRegressor 모델 학습
rfr = RandomForestRegressor()
rfr.fit(x_train, y_train)

# Predict on both the training data and the test data so the two sets of
# scores can be compared (a large gap between them suggests overfitting).
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score
y_pred_train = rfr.predict(x_train)
y_pred_test = rfr.predict(x_test)

# Evaluate each split with MSE, RMSE (square root of MSE) and R^2.
# mean_squared_error is required here: mean_absolute_error yields MAE, and
# the square root of MAE is not an RMSE.
mse_train = mean_squared_error(y_train, y_pred_train)
print('mse_train(mse): ', mse_train)
rmse_train = np.sqrt(mse_train)
print('rmse_train(rmse): ', rmse_train)
r2_train = r2_score(y_train, y_pred_train)
print('r2_train(r2): ', r2_train)
print('')
mse_test = mean_squared_error(y_test, y_pred_test)
print('mse_test(mse): ', mse_test)
rmse_test = np.sqrt(mse_test)
print('rmse_test(rmse): ', rmse_test)
r2_test = r2_score(y_test, y_pred_test)
print('r2_test(r2): ', r2_test)

3.9 표준화 및 PCA 차원축소

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

x = StandardScaler().fit_transform(x)

pca = PCA(n_components = 2)
principalComponents = pca.fit_transform(x)
principalDf = pd.DataFrame(data = principalComponents
             , columns = ['principal component 1', 'principal component 2'])

3.10 선형회귀활용 모델링

# 모델링을 학습하기 위한 Feature(X)와 Y데이터를 구분하는 단계
from sklearn.model_selection import train_test_split
from sklearn import metrics

X=df.drop(['y'], axis=1)
Y=df['y']

x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3)

print(x_train.shape)
print(y_train.shape)

print(x_test.shape)
print(y_test.shape)

# LR(선형회귀) 모델 활용
from sklearn.linear_model import LinearRegression
mlr = LinearRegression()
mlr.fit(x_train, y_train)

# Predict on both the training data and the test data so the two sets of
# scores can be compared (a large gap between them suggests overfitting).
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score
y_pred_train = mlr.predict(x_train)
y_pred_test = mlr.predict(x_test)

# Evaluate each split with MSE, RMSE (square root of MSE) and R^2.
# mean_squared_error is required here: mean_absolute_error yields MAE, and
# the square root of MAE is not an RMSE.
mse_train = mean_squared_error(y_train, y_pred_train)
print('mse_train(mse): ', mse_train)
rmse_train = np.sqrt(mse_train)
print('rmse_train(rmse): ', rmse_train)
r2_train = r2_score(y_train, y_pred_train)
print('r2_train(r2): ', r2_train)
print('')
mse_test = mean_squared_error(y_test, y_pred_test)
print('mse_test(mse): ', mse_test)
rmse_test = np.sqrt(mse_test)
print('rmse_test(rmse): ', rmse_test)
r2_test = r2_score(y_test, y_pred_test)
print('r2_test(r2): ', r2_test)

3.11 선형회귀 회귀계수 확인

df_coef = pd.DataFrame({'col':X.columns, 'coef':mlr.coef_}).reset_index(drop=True)
df_coef

3.12 light gbm 활용 모델링

# ▶ 모델링을 학습하기 위한 Feature(X)와 Y데이터를 구분하는 단계
import lightgbm as lgb
import numpy as np
from sklearn.model_selection import train_test_split

# 데이터 세트로드
X = df.drop(['y'], axis=1)
Y = df['y']

# train/test split
x_train, x_test, y_train, y_test = train_test_split (X, Y, test_size = 0.3)

# 데이터 세트를 적절한 LGB 형식으로 변환
d_train = lgb.Dataset (x_train, label = y_train)

# LightGBM training parameters, declared as a single dict literal.
params = {
    'learning_rate': 0.02,
    'boosting_type': 'gbdt',  # gradient boosting decision tree
    'objective': 'binary',
    'metric': 'binary_logloss',  # log-loss metric for the binary objective
    'max_depth': 5,
    'num_leaves': 32,
    'seed': 23456,
}

# 모델 학습
clf = lgb.train (params, d_train, 1000) # epocs에서 모델 훈련

from sklearn.metrics import classification_report

# Turn the predicted scores into class labels with a 0.5 threshold.
# np.where applies the threshold to the whole array at once, replacing the
# per-element index loop.
y_pred_train = np.where(clf.predict(x_train) >= .5, 1, 0)
y_pred_test = np.where(clf.predict(x_test) >= .5, 1, 0)

# Report on both splits so train and test performance can be compared.
print(classification_report(y_train, y_pred_train))
print(classification_report(y_test, y_pred_test))

3.13 연속형, 범주형 변수 list 나누기

import numpy as np
import pandas as pd
# Split the column names by dtype: object-dtype ('O') columns go to the
# categorical list, every other column goes to the numeric list.
categoical_list = [name for name in df.columns if df[name].dtypes == 'O']
numeric_list = [name for name in df.columns if not df[name].dtypes == 'O']

3.14 AUROC score 출력하기

from sklearn.metrics import roc_auc_score

y_pred_train_proba = rfc.predict_proba(x_train)[:, 1]
y_pred_test_proba = rfc.predict_proba(x_test)[:, 1]


roc_score_train = roc_auc_score(y_train, y_pred_train_proba)
roc_score_test = roc_auc_score(y_test, y_pred_test_proba)

print("roc_score_train :", roc_score_train)
print("roc_score_test :", roc_score_test)

3.15 Label encoder 활용 범주형 데이터 처리

from sklearn.preprocessing import LabelEncoder

for col in categoical_list:
    print(col)
    le = LabelEncoder()
    le.fit(list(x_train[col].values) + list(x_test[col].values))
    x_train[col] = le.transform(x_train[col])
    x_test[col] = le.transform(x_test[col])

3.16 ROC 커브 그리기

from sklearn.metrics import roc_curve
def roc_curve_plot(y_test , pred_proba_c1):
    """Plot the ROC curve for a binary classifier and show the figure.

    Args:
        y_test: True binary labels.
        pred_proba_c1: Predicted scores or probabilities of the positive class.
    """
    # roc_curve returns FPR and TPR for each decision threshold.
    # FPR: fraction of actual negatives predicted as positive (1 - specificity).
    # TPR: recall (sensitivity).
    fprs, tprs, _ = roc_curve(y_test, pred_proba_c1)

    # The ROC curve itself.
    plt.plot(fprs, tprs, label='ROC')
    # Diagonal reference line of a random classifier. The colour is given only
    # via `color`; a 'k--' format string would conflict with color='red'.
    plt.plot([0, 1], [0, 1], linestyle='--', color='red', label='Random')

    # Fix the axes to [0, 1] and place x ticks exactly every 0.1. Deriving the
    # ticks from the auto-padded limits would shift them off the 0.1 grid.
    plt.xticks(np.round(np.linspace(0, 1, 11), 2))
    plt.xlim(0, 1)
    plt.ylim(0, 1)
    plt.xlabel('FPR( 1 - Specificity )')
    plt.ylabel('TPR( Recall )')
    plt.legend()
    plt.show()

roc_curve_plot(y_test, y_pred_test_proba)

3.17 min-max scale 활용 정규화

from sklearn.preprocessing import minmax_scale

rfm['Recency'] = minmax_scale(rfm['Recency'], axis=0, copy=True)

dpwl

거북선통통통통

이전 포스트

Python Deleting

다음 포스트