한국형 우울증 딥러닝 예측 모델 및 진단 프로그램 "Kor-DEEPression" 개발 과정 정리 및 회고.
(3) EDA & Dashboard (시각화 분석 & 대시보드)
(Part3) Trend Analysis 트렌드 분석
탐색적 데이터 분석
대시보드
Plotly
# Library Import
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as plex
import plotly.graph_objects as go
from plotly.subplots import make_subplots as plsp
# Matplotlib settings for VSCode: font family and minus-sign handling are
# applied together in a single rcParams update
plt.rcParams.update({
    'font.family': 'Malgun Gothic',
    'axes.unicode_minus': False,
})
# Plotly renderer setting for VSCode & GitHub
import plotly.io as pio
pio.renderers.default = 'vscode+png'
# Read the Depression CSV file into a DataFrame
depr_csv_path = 'downloads/EDA_depr.csv'
df_depr = pd.read_csv(depr_csv_path)
print("Depression(정상vs우울증)")
df_depr.head()
# Read the MDD CSV file into a DataFrame
mdd_csv_path = 'downloads/EDA_mdd.csv'
df_mdd = pd.read_csv(mdd_csv_path)
print("MDD(경도우울vs주요우울)")
df_mdd.head()
# Both datasets go through the same checks, so pair each with its print label
eda_frames = (('Depression', df_depr), ('MDD', df_mdd))
# Shape check
for frame_label, frame in eda_frames:
    print(f"Data Shape ({frame_label}) : {frame.shape}")
'''
Data Shape (Depression) : (16570, 21)
Data Shape (MDD) : (3359, 21)
'''
# Missing-value check (total number of null cells per dataset)
for frame_label, frame in eda_frames:
    print(f"Sum of Null Values ({frame_label}) : {frame.isnull().sum().sum()}")
# Duplicate check (only the unique identifier column "id" is inspected)
for frame_label, frame in eda_frames:
    print(f"Sum of Duplicated Data ({frame_label}) : {frame['id'].duplicated().sum()}")
'''
Sum of Null Values (Depression) : 0
Sum of Null Values (MDD) : 0
Sum of Duplicated Data (Depression) : 0
Sum of Duplicated Data (MDD) : 0
'''
# Check that the column dtypes of the two datasets match position by position
df_depr.dtypes.tolist() == df_mdd.dtypes.tolist()
'''
True
'''
# Count the numerical and categorical variables, leaving out the identifier
# column "id" (the first column, dropped with iloc[:, 1:]).
# select_dtypes() filters columns by dtype directly ('number' matches both int
# and float), so no summary statistics are computed just to count columns and
# an empty selection yields 0 instead of raising.
features_depr = df_depr.iloc[:, 1:]
features_mdd = df_mdd.iloc[:, 1:]
print('(Depression)')
print(f"Sum of Numerical Data : {features_depr.select_dtypes(include=['number']).shape[1]}")
print(f"Sum of Categorical Data : {features_depr.select_dtypes(include=['object']).shape[1]}")
print('(MDD)')
print(f"Sum of Numerical Data : {features_mdd.select_dtypes(include=['number']).shape[1]}")
print(f"Sum of Categorical Data : {features_mdd.select_dtypes(include=['object']).shape[1]}")
'''
(Depression)
Sum of Numerical Data : 3
Sum of Categorical Data : 17
(MDD)
Sum of Numerical Data : 3
Sum of Categorical Data : 17
'''
# Independent-variable lists: skip the first numerical column and the last
# object-dtype column.
# NOTE(review): the skipped object column is presumably the target (the last
# column of the frame) and the skipped numerical column presumably "year" --
# confirm against the CSV schema.
num_cols = list(features_depr.select_dtypes(include=['number']).columns[1:])
cat_cols = list(features_depr.select_dtypes(include=['object']).columns[:-1])
print("독립변수(Independent Variable)")
print(f"\t수치형 변수(Numerical Data) : 총 {len(num_cols)} 개")
print(f"\t범주형 변수(Categorical Data) : 총 {len(cat_cols)} 개")
print("종속변수(Dependent Variable)")
print(f"\tDepression(정상vs우울증) : '{df_depr.columns[-1]}'")
print(f"\tMDD(경도우울vs주요우울) : '{df_mdd.columns[-1]}'")
'''
독립변수(Independent Variable)
수치형 변수(Numerical Data) : 총 2 개
범주형 변수(Categorical Data) : 총 16 개
종속변수(Dependent Variable)
Depression(정상vs우울증) : 'depression'
MDD(경도우울vs주요우울) : 'MDD'
'''
# Row counts per target class
group_depr = df_depr.groupby(['depression'], as_index=False)['id'].count()
display(group_depr)
# Swap the raw values for display labels
count_depr = group_depr.replace({'No': 'Normal (정상)', 'Yes': 'Depression (우울증)'})
display(count_depr)
# Depression (normal vs. depression) counts: bar chart next to a donut chart
depr_palette = ['pink', 'darkviolet']
depr_count_bar = go.Bar(
    x=count_depr['depression'],
    y=count_depr['id'],
    marker_color=depr_palette,
    texttemplate="%{y:,}",
    showlegend=False,
    hoverinfo='x+y',
)
depr_count_pie = go.Pie(
    labels=count_depr['depression'],
    values=count_depr['id'],
    hole=0.3,
    showlegend=True,
    marker_colors=depr_palette,
    hoverinfo="label+value+percent",
    textinfo='value+percent',
    rotation=180,
)
# The pie trace needs a 'domain' cell; the bar trace a regular 'xy' cell
plot_depr = plsp(
    rows=1,
    cols=2,
    subplot_titles=('Bar_plot', 'Pie_plot'),
    specs=[[{'type': 'xy'}, {'type': 'domain'}]],
)
plot_depr.add_trace(depr_count_bar, row=1, col=1)
plot_depr.add_trace(depr_count_pie, row=1, col=2)
plot_depr.update_layout(
    title_text='Counts of "Depression"',
    width=720,
    height=540,
    legend=dict(
        orientation='h',
        yanchor='bottom', y=0,
        xanchor='right', x=1.05,
    ),
)
plot_depr.update_yaxes(tickformat=',')
plot_depr.show()
# Row counts per target class and survey year
group_year_depr = df_depr.groupby(['depression', 'year'], as_index=False)['id'].count()
count_year_depr = group_year_depr.replace({'No': 'Normal (정상)', 'Yes': 'Depression (우울증)'})
display(count_year_depr)
# Depression (normal vs. depression) by year: grouped bar chart
bar_year_depr = plex.bar(
    count_year_depr,
    x='year',
    y='id',
    color='depression',
    barmode='group',
    text_auto=True,
    color_discrete_sequence=['pink', 'darkviolet'],
    title='Counts of "Depression" by "year"',
)
year_layout = dict(
    width=720,
    height=540,
    legend_title_text="Depression",
    xaxis_title_text="Year",
    yaxis_title_text="",
)
bar_year_depr.update_layout(**year_layout)
bar_year_depr.update_yaxes(tickformat=',')
bar_year_depr.show()
# Age histogram with a KDE curve (Depression dataset)
age_axes = sns.histplot(data=df_depr, x='age', kde=True, bins=12, color='darkviolet')
age_axes.set_title('Histogram of "Age" (Depression)')
age_axes.set_xlabel('Age')
age_axes.set_ylabel('')
plt.show()
# Age histogram split by target class (Depression dataset)
query_age_depr_no = df_depr.query('depression == "No"')
query_age_depr_yes = df_depr.query('depression == "Yes"')
hist_age_depr = go.Figure()
# One overlaid histogram trace per class: (rows, legend name, colour)
for age_rows, age_trace_name, age_trace_color in (
    (query_age_depr_no, 'Normal (정상)', 'pink'),
    (query_age_depr_yes, 'Depression (우울증)', 'darkviolet'),
):
    hist_age_depr.add_trace(
        go.Histogram(
            x=age_rows['age'],
            nbinsx=12,
            name=age_trace_name,
            marker_color=age_trace_color,
            texttemplate="%{y}",
            hoverinfo='x+y',
        )
    )
hist_age_depr.update_layout(
    barmode='overlay',
    title_text='Histogram of "Age" & "Depression"',
    xaxis_title_text="Age",
    yaxis_title_text="",
    width=720,
    height=540,
    legend=dict(
        orientation='h',
        yanchor='bottom', y=1,
        xanchor='right', x=1,
    ),
)
hist_age_depr.update_xaxes(dtick=10)
hist_age_depr.update_yaxes(tickformat=',')
# Semi-transparent bars so the overlaid distributions stay visible
hist_age_depr.update_traces(opacity=0.75)
hist_age_depr.show()
# BMI histogram with a KDE curve (Depression dataset)
bmi_axes = sns.histplot(data=df_depr, x='BMI', kde=True, color='darkviolet')
bmi_axes.set_title('Histogram of "BMI" (Depression)')
bmi_axes.set_xlabel('BMI')
bmi_axes.set_ylabel('')
plt.show()
# BMI histogram split by target class (Depression dataset)
query_BMI_depr_no = df_depr.query('depression == "No"')
query_BMI_depr_yes = df_depr.query('depression == "Yes"')
# Both traces share the same upper bin bound (xbins end=50)
bmi_trace_normal = go.Histogram(
    x=query_BMI_depr_no['BMI'],
    xbins=dict(end=50),
    name='Normal (정상)',
    marker_color='pink',
    hoverinfo='x+y',
)
bmi_trace_depressed = go.Histogram(
    x=query_BMI_depr_yes['BMI'],
    xbins=dict(end=50),
    name='Depression (우울증)',
    marker_color='darkviolet',
    hoverinfo='x+y',
)
hist_BMI_depr = go.Figure(data=[bmi_trace_normal, bmi_trace_depressed])
hist_BMI_depr.update_layout(
    barmode='overlay',
    title_text='Histogram of "BMI" & "Depression"',
    xaxis_title_text="BMI",
    yaxis_title_text="",
    width=720,
    height=540,
    legend=dict(
        orientation='h',
        yanchor='bottom', y=1,
        xanchor='right', x=1,
    ),
)
hist_BMI_depr.update_xaxes(dtick=5)
hist_BMI_depr.update_yaxes(tickformat=',')
# Semi-transparent bars so the overlaid distributions stay visible
hist_BMI_depr.update_traces(opacity=0.75)
hist_BMI_depr.show()
# Bar-plot helper for categorical variables
def bar_category(target, col, order_list=None):
    """Show a grouped bar chart of row counts of ``col`` split by ``target``.

    Rows are counted through the ``id`` column, the raw 'No'/'Yes' target
    values are replaced by display labels, and the figure is shown
    immediately; nothing is returned.

    Args:
        target: 'depression' (reads the module-level ``df_depr``) or
            'MDD' (reads the module-level ``df_mdd``).
        col: Name of the categorical column placed on the x-axis.
        order_list: Optional explicit x-axis category order. When falsy,
            no category order is set on the axis.

    Raises:
        ValueError: If ``target`` is neither 'depression' nor 'MDD'.
    """
    # Per-target display labels, bar colours and the name used in the title
    target_settings = {
        'depression': {
            'labels': {'No': 'Normal (정상)', 'Yes': 'Depression (우울증)'},
            'colors': ['pink', 'darkviolet'],
            'title_name': 'Depression',
        },
        'MDD': {
            'labels': {'No': 'Minor D. (경도우울)', 'Yes': 'Major D. (주요우울)'},
            'colors': ['darkviolet', 'purple'],
            'title_name': 'MDD',
        },
    }
    # Validate first so a bad target fails before any data is touched.
    # ValueError is still caught by callers using `except Exception`.
    if target not in target_settings:
        raise ValueError('Error : target must be "depression" or "MDD"')
    settings = target_settings[target]
    title_name = settings['title_name']
    source_df = df_depr if target == 'depression' else df_mdd

    # Group by target & column, then relabel only the target column
    count_df = source_df.groupby([target, col], as_index=False)['id'].count()
    count_df[target] = count_df[target].replace(settings['labels'])

    bar_plot = plex.bar(data_frame=count_df, barmode='group',
                        x=col, y='id', color=target,
                        text_auto=True,
                        color_discrete_sequence=settings['colors'],
                        title=f'Bar plot of "{title_name}" by "{col.capitalize()}"')

    # Layout shared by both targets
    bar_plot.update_layout(width=720, height=540,
                           legend_title_text="",
                           xaxis_title_text=f"{col.capitalize()}",
                           yaxis_title_text="",
                           legend=dict(orientation="h",
                                       yanchor='bottom', y=1,
                                       xanchor='right', x=1))
    if order_list:
        bar_plot.update_layout(xaxis={'categoryorder': 'array', 'categoryarray': order_list})
    bar_plot.update_yaxes(tickformat=',')
    bar_plot.show()
# Pie-plot (sunburst) helper for categorical variables
def pie_category(target, col):
    """Show a sunburst chart of row counts: Total -> ``target`` -> ``col``.

    Rows are counted through the ``id`` column, the raw 'No'/'Yes' target
    values are replaced by two-line display labels, and the figure is shown
    immediately; nothing is returned.

    Args:
        target: 'depression' (reads the module-level ``df_depr``) or
            'MDD' (reads the module-level ``df_mdd``).
        col: Name of the categorical column used for the outer ring.

    Raises:
        ValueError: If ``target`` is neither 'depression' nor 'MDD'.
    """
    # Per-target display labels (<br> breaks the label onto two lines),
    # colour sequence and the name used in the title
    target_settings = {
        'depression': {
            'labels': {'No': 'Normal<br>(정상)', 'Yes': 'Depression<br>(우울증)'},
            'colors': ['pink', 'darkviolet'],
            'title_name': 'Depression',
        },
        'MDD': {
            'labels': {'No': 'Minor D.<br>(경도우울)', 'Yes': 'Major D.<br>(주요우울)'},
            'colors': ['darkviolet', 'purple'],
            'title_name': 'MDD',
        },
    }
    # Validate first so a bad target fails before any data is touched.
    # ValueError is still caught by callers using `except Exception`.
    if target not in target_settings:
        raise ValueError('Error : target must be "depression" or "MDD"')
    settings = target_settings[target]
    title_name = settings['title_name']
    source_df = df_depr if target == 'depression' else df_mdd

    # Count only the 'id' column instead of counting every column and
    # discarding all but one afterwards
    count_df = (source_df.groupby([target, col], as_index=False)['id'].count()
                .rename(columns={'id': 'Count'}))
    # Constant column that serves as the single root of the sunburst path
    count_df['Total'] = 'Total'
    count_df[target] = count_df[target].replace(settings['labels'])

    pie_plot = plex.sunburst(data_frame=count_df,
                             path=['Total', target, col],
                             values='Count',
                             color_discrete_sequence=settings['colors'],
                             title=f'Pie plot of "{title_name}" by "{col.capitalize()}"')

    # Settings shared by both targets
    pie_plot.update_traces(textinfo='label+percent parent')
    pie_plot.update_layout(width=540, height=540)
    pie_plot.show()
# Explicit x-axis orders for the ordinal categorical columns
household_order = ['1인 가구', '1세대', '2세대', '3세대 이상']
marital_order = ['기혼(유배우자)', '사별/이혼', '미혼']
subj_health_order = ['매우 나쁨', '나쁨', '보통', '좋음', '매우 좋음']
stress_order = ['거의 느끼지 않음', '적게 느끼는 편임', '많이 느끼는 편임', '대단히 많이 느낌']
# (column, bar-chart category order); None leaves the axis order unset
category_plan = (
    ('sex', None),
    ('household', household_order),
    ('marital', marital_order),
    ('subj_health', subj_health_order),
    ('stress', stress_order),
)
# For every column: bar then pie for 'depression', followed by the same for 'MDD'
for category_col, category_order in category_plan:
    for plot_target in ('depression', 'MDD'):
        bar_category(target=plot_target, col=category_col, order_list=category_order)
        pie_category(target=plot_target, col=category_col)
<대시보드 스크린샷>
다음 글에서는 머신러닝·딥러닝 모델링과 성능 비교 분석 과정을 다루도록 하겠음.