서울시 월별 기상정보로 서울시 식중독 발생 환자 수를 예측하는 프로그램 Seoul FP-Weather 개발 프로젝트 진행과정 정리 및 회고. (2) 회귀모델(LightGBM) 모델링 과정
(2023 수정사항)
서울시 월간 기상정보를 기반으로 서울시 월간 식중독 발생 환자 수를 예측하는 AI 머신 러닝 프로그램
Tech Stack
자유주제
로 데이터 파이프라인 구축
및 API 서비스 개발
Pull
& Store
)Machine Learning
& Front-end
)대시보드
개발 및 배포
서론
식중독
프로그램소개
데이터베이스
파이프라인
DB구축
모델링
데이터 불러오기
회귀 모델
객체 부호화
배포
평년값 예측
대시보드
웹 배포
결론
활용방안
한계점
핵심, 느낀점
# 라이브러리 import
import os
import sys
import csv
import psycopg2
from dotenv import load_dotenv
# Store the PostgreSQL connection settings in module-level variables.
# load_dotenv() is called first so the os.getenv() lookups below can see
# values provided through a .env file.
load_dotenv(verbose=True)
HOST = os.getenv('postgre_host')  # None when 'postgre_host' is not set
PASSWORD = os.getenv('postgre_password')  # None when 'postgre_password' is not set
DATABASE = 'postgredb'
USERNAME = 'kjcheong'
PORT = 5432
#파일 실행시 작동하는 함수안에 과정을 모두 포함
def main():
    """Export the weather / food-poisoning join to a CSV file.

    Connects to PostgreSQL with the module-level ``HOST``, ``PORT``,
    ``DATABASE``, ``USERNAME`` and ``PASSWORD`` settings, joins the
    ``weather`` and ``fp_seoul`` tables on ``month_id`` and streams the
    result through ``COPY ... TO STDOUT`` into
    ``./data/csv/fp-weather.csv``. The COPY statement does not request a
    header row, so the file contains data rows only.

    Exits the process with status 1 when the connection cannot be
    established. The connection is always closed, even if the export fails.
    """
    # Connect to PostgreSQL. Only database errors are handled here so that
    # KeyboardInterrupt / SystemExit are not swallowed.
    try:
        conn = psycopg2.connect(
            host=HOST,
            port=PORT,
            database=DATABASE,
            user=USERNAME,
            password=PASSWORD)
    except psycopg2.Error as err:
        print('connection failure to DB')
        # Surface the underlying reason and report failure to the shell.
        print(err, file=sys.stderr)
        sys.exit(1)
    print('connection success to DB')

    # JOIN QUERY: one row per month_id present in both tables.
    sql_query_join = """
    SELECT w."year" , w."month" , w.avgta , w.maxta , w.minta , w.sumrn , w.avgws , w.avgrhm , w.sumsshr , w.avgps , fs2.patient_count
    FROM weather w
    JOIN fp_seoul fs2
    ON w.month_id = fs2.month_id
    """

    # Wrap the query in COPY so the server streams CSV straight to the file.
    sql_csv = f"""COPY ({sql_query_join}) TO STDOUT WITH CSV DELIMITER ',';"""
    try:
        with conn.cursor() as cur, \
                open("./data/csv/fp-weather.csv", "w", encoding="utf-8") as cf:
            cur.copy_expert(sql_csv, cf)
    finally:
        # Release the connection whether or not the export succeeded.
        conn.close()
    print('"fp-weather.csv" file created')
# Run the export only when this file is executed directly as a script.
if __name__ == '__main__':
    main()
# $ python data/postgresql-2-join-to-csv.py
'''
connection success to DB
"fp-weather.csv" file created
'''
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.rcParams['axes.unicode_minus'] = False
import seaborn as sns
#머신러닝
import lightgbm as lgbm
from lightgbm import early_stopping, log_evaluation
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
#객체 부호화
import pickle
# The exported CSV has no header row, so the column names are supplied here.
columns0 = [
    'year', 'month',
    'avgTa', 'maxTa', 'minTa', 'sumRn', 'avgWs', 'avgRhm', 'sumSsHr', 'avgPs',
    'fp_patients',
]
df0 = pd.read_csv('csv/fp-weather.csv', names=columns0)
df0.head()

# Keep a copy of the data that carries the header row.
df0.to_csv('csv/fp-weather-add-column.csv', index=False)

# Modelling frame: every column except 'year' and 'month'.
columns1 = [
    'avgTa', 'maxTa', 'minTa', 'sumRn', 'avgWs', 'avgRhm', 'sumSsHr', 'avgPs',
    'fp_patients',
]
df1 = df0[columns1]
df1.head()

# Quick data-quality check: duplicated rows and missing values.
print('Sum of Duplicated Data : {}'.format(df1.duplicated().sum()))
print('Sum of Null Data : {}'.format(df1.isnull().sum().sum()))
'''
Sum of Duplicated Data : 0
Sum of Null Data : 0
'''
# Hold out 25% of the rows as the test set; the seed is fixed so the split
# is reproducible.
train, test = train_test_split(df1, test_size=0.25, random_state=42)

print('분리전 데이터 : {}\n'.format(df1.shape))
print('훈련 데이터 : {}'.format(train.shape))
print('테스트 데이터 : {}'.format(test.shape))
'''
분리전 데이터 : (208, 9)
훈련 데이터 : (156, 9)
테스트 데이터 : (52, 9)
'''

# Separate the target column from the feature columns.
target = 'fp_patients'
features = df1.columns.drop(target)
X_train, y_train = train[features], train[target]
X_test, y_test = test[features], test[target]

print('X y 분리 후 shape\n')
print('X_train : {}'.format(X_train.shape))
print('y_train : {}'.format(y_train.shape))
print('\nX_test : {}'.format(X_test.shape))
print('y_test : {}'.format(y_test.shape))
'''
X y 분리 후 shape
X_train : (156, 8)
y_train : (156,)
X_test : (52, 8)
y_test : (52,)
'''
# Baseline model: predict the training-set mean for every training row and
# score that constant prediction against the training labels.
baseline = [y_train.mean()] * len(y_train)

baseline_mse = mean_squared_error(y_train, baseline)
baseline_rmse = np.sqrt(baseline_mse)  # RMSE is the square root of the MSE
baseline_mae = mean_absolute_error(y_train, baseline)

print('Baseline mean : {}'.format(y_train.mean()))
print('Baseline MSE : {}'.format(baseline_mse))
print('Baseline RMSE : {}'.format(baseline_rmse))
print('Baseline MAE : {}'.format(baseline_mae))
'''
Baseline mean : 103.72435897435898
Baseline MSE : 46863.85350920447
Baseline RMSE : 216.48060769779002
Baseline MAE : 113.04100920447075
'''
# Wrap the train / test frames in LightGBM datasets.
train_ds = lgbm.Dataset(data=X_train, label=y_train)
test_ds = lgbm.Dataset(data=X_test, label=y_test)

# Hyper-parameters for a gradient-boosted regression model scored by RMSE.
param = {
    'boosting' : 'gbdt',
    'objective' : 'regression',
    'metric' : 'rmse',
    'seed' : 42,
    'is_training_metric' : True,
    'max_depth' : 16,
    'learning_rate' : 0.05,
    'feature_fraction' : 0.75,
    'bagging_fraction' : 0.8,
    'bagging_freq' : 16,
    'num_leaves' : 128,
    'verbosity' : -1
}

# Train for at most 1000 rounds; stop once the validation RMSE has not
# improved for 75 rounds, logging the score every 75 rounds.
# NOTE(review): test_ds is the validation set that drives early stopping.
# Any metric later computed on the same rows is optimistically biased;
# consider a separate validation split.
_callbacks = [early_stopping(75), log_evaluation(75)]
model_lgbm = lgbm.train(
    params=param,
    train_set=train_ds,
    num_boost_round=1000,
    valid_sets=[test_ds],
    callbacks=_callbacks,
)
#output
'''
Training until validation scores don't improve for 75 rounds
[75] valid_0's rmse: 120.443
[150] valid_0's rmse: 123.299
Early stopping, best iteration is:
[93] valid_0's rmse: 119.486
'''
# Predictions of the trained model on both splits.
predict_train = model_lgbm.predict(X_train)
predict_test = model_lgbm.predict(X_test)


def _score_column(y_true, y_pred):
    """Return [mean prediction, MSE, RMSE, MAE] for one column of the table."""
    mse = mean_squared_error(y_true, y_pred)
    return [y_pred.mean(), mse, np.sqrt(mse), mean_absolute_error(y_true, y_pred)]


# Side-by-side comparison of the baseline and the model on train / test.
df_metrics = pd.DataFrame(
    {
        'Baseline': [y_train.mean(), baseline_mse, baseline_rmse, baseline_mae],
        'Train': _score_column(y_train, predict_train),
        'Test': _score_column(y_test, predict_test),
    },
    index=['Mean', 'MSE', 'RMSE', 'MAE'],
)
df_metrics

# Only the RMSE and MAE rows are plotted.
df_metrics_plot = df_metrics.iloc[2:]
df_metrics_plot
# Grouped bar chart: one group per plotted metric, one bar per column
# (Baseline / Train / Test) placed side by side.
x = np.arange(2)
width = 0.3

fig, ax = plt.subplots()
rect1 = ax.bar(x - width, df_metrics_plot['Baseline'].round(3), width=width,
               label='Baseline', color='black', alpha=0.5)
rect2 = ax.bar(x, df_metrics_plot['Train'].round(3), width=width,
               label='Train', color='purple', alpha=0.5)
rect3 = ax.bar(x + width, df_metrics_plot['Test'].round(3), width=width,
               label='Test', color='purple', alpha=0.75)

ax.set_title('Metrics of LightGBM model')
ax.set_xticks(x, df_metrics_plot.index)

# Print the rounded value above every bar.
for _bars in (rect1, rect2, rect3):
    ax.bar_label(_bars, padding=3)

fig.tight_layout()
plt.legend(ncol=3)
plt.show()
# Put the true test labels next to the model predictions; the label index is
# reset so both pieces line up row by row.
_labels = y_test.reset_index(drop=True)
_predictions = pd.DataFrame(predict_test)
result = pd.concat([_labels, _predictions], axis=1)
result.columns = ['label','predict']

# Scatter plot with a fitted regression line: label vs. prediction.
sns.regplot(data=result, x='label', y='predict')
plt.title('Regplot of LightGBM model')
plt.show()
# Serialize the trained model to disk so it can be loaded later.
with open('model.pkl', mode='wb') as model_file:
    pickle.dump(model_lgbm, model_file)
import pickle
import numpy as np
# Load the pickled model. The file is closed as soon as loading finishes.
# NOTE: pickle executes code on load; only load model files you trust.
with open("./data/model.pkl", "rb") as model_file:
    model = pickle.load(model_file)

# One row of eight values per month, each shaped (1, 8).
# NOTE(review): presumably ordered like the training features
# (avgTa, maxTa, minTa, sumRn, avgWs, avgRhm, sumSsHr, avgPs) - confirm
# against the training script.

# January climatological normal (1991-2020)
array1 = np.array([[-1.9, 2.1, -5.5, 16.8, 2.3, 56.2, 169.6, 1024.9]])
# February climatological normal (1991-2020)
array2 = np.array([[0.7, 5.1, -3.2, 28.2, 2.5, 54.6, 170.8, 1023.2]])
# March climatological normal (1991-2020)
array3 = np.array([[6.1, 11.0, 1.9, 36.9, 2.7, 54.6, 198.2, 1019.4]])
# April climatological normal (1991-2020)
array4 = np.array([[12.6, 17.9, 8.0, 72.9, 2.7, 54.8, 206.3, 1014.8]])
# May climatological normal (1991-2020)
array5 = np.array([[18.2, 23.6, 13.5, 103.6, 2.5, 59.7, 223.0, 1010.9]])
# June climatological normal (1991-2020)
array6 = np.array([[22.7, 27.6, 18.7, 129.5, 2.2, 65.7, 189.1, 1007.3]])
# July climatological normal (1991-2020)
array7 = np.array([[25.3, 29.0, 22.3, 414.4, 2.2, 76.2, 123.6, 1006.4]])
# August climatological normal (1991-2020)
array8 = np.array([[26.1, 30.0, 22.9, 348.2, 2.1, 73.5, 156.1, 1008.2]])
# September climatological normal (1991-2020)
array9 = np.array([[21.6, 26.2, 17.7, 141.5, 1.9, 66.4, 179.7, 1013.5]])
# October climatological normal (1991-2020)
array10 = np.array([[15.0, 20.2, 10.6, 52.2, 2.0, 61.8, 206.5, 1019.2]])
# November climatological normal (1991-2020)
array11 = np.array([[7.5, 11.9, 3.5, 51.1, 2.2, 60.4, 157.3, 1022.6]])
# December climatological normal (1991-2020)
array12 = np.array([[0.2, 4.2, -3.4, 22.6, 2.3, 57.8, 162.9, 1025.1]])

array_list = [array1, array2, array3, array4, array5, array6,
              array7, array8, array9, array10, array11, array12]

# Print the predicted patient count for each month, rounded to an integer.
for array in array_list:
    # predict() on a single row yields a one-element array. Take element 0
    # before int(): implicit conversion of a non-0-d array to a scalar is
    # deprecated since NumPy 1.25.
    pred = int(model.predict(array).round(0)[0])
    print(pred)