AWS 서버 MySQL DB에 접속 및 연결
import mysql.connector
# AWS 서버 DB에 접속/연결
remote = mysql.connector.connect(
host = host,
port = 3306,
user = "admin",
password = password,
database = "sql_project"
)
업종 목적지별 배달 주문건수 데이터 테이블 생성 및 입력
# AWS DB에 테이블 생성
cur = remote.cursor()
cur.execute("CREATE TABLE delivery_count (date date, hour_time int, deliver_type varchar(16), dosi varchar(8), sigu varchar(8), count int, dayweek varchar(16))")
# 생성한 테이블에 데이터 입력
sql = """INSERT INTO delivery_count VALUES (%s, %s, %s, %s, %s, %s, %s)"""
cursor = remote.cursor(buffered=True)
for i, row in seoul_deliver_count.iterrows():
cursor.execute(sql, tuple(row))
#print(tuple(row))
remote.commit()
주문지역 기상 정보 데이터 테이블 생성 및 입력
# AWS DB에 테이블 생성
cur = remote.cursor()
cur.execute("CREATE TABLE delivery_weather (dosi varchar(8), sigu varchar(8), date date, hour_time int, rain_type varchar(8), humidity int, precipitation float, temperatur float, wind_speed float, wind_strength varchar(8))")
# 생성한 테이블에 데이터 입력
sql = """INSERT INTO delivery_weather VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"""
cursor = remote.cursor(buffered=True)
for i, row in seoul_deliver_weather.iterrows():
cursor.execute(sql, tuple(row))
#print(tuple(row))
remote.commit()
이상치 조회
-- hmd, pcp, temp, wind_spd
select hmd from delivery group by hmd order by hmd limit 10;
select pcp from delivery group by pcp order by pcp limit 10;
select temp from delivery group by temp order by temp limit 10;
select wind_spd from delivery group by wind_spd order by wind_spd limit 10;
이상치 조회 결과:
이상치 제거
-- hmd, wind_spd는 그대로 두고, pcp, temp의 이상치만 제거
delete from delivery
where CAST(pcp AS DECIMAL) = CAST(-998.9 AS DECIMAL);
delete from delivery
where temp = -999;
두 데이터 테이블 병합 및 1차 가공
create table delivery select
c.date, mid(c.date, 6,2) as month, c.hour, (case
when c.dayweek = 'Monday' then '월요일'
when c.dayweek = 'Tuesday' then '화요일'
when c.dayweek = 'Wednesday' then '수요일'
when c.dayweek = 'Thursday' then '목요일'
when c.dayweek = 'Friday' then '금요일'
when c.dayweek = 'Saturday' then '토요일'
when c.dayweek = 'Sunday' then '일요일' end) as dayweek,
c.dlvr_type, c.sigu, c.count, w.rain_type, w.hmd, w.pcp, w.temp, w.wind_spd, w.wind_str
from delivery_count c, delivery_weather w
where c.sigu=w.sigu and c.date=w.date and c.hour=w.hour;
데이터 2차 가공
-- 강우량 범주화하여 새 컬럼으로 추가
alter table delivery
add column pcp_g varchar(10);
update delivery
set pcp_g = case
when pcp > 0 and pcp < 1.0 then '0.1-1.0'
when pcp >= 1.0 and pcp < 2.5 then '1.0-2.5'
when pcp >= 2.5 and pcp < 5.0 then '2.5-5.0'
when pcp >= 5.0 and pcp < 10.0 then '5.0-10.0'
when pcp >= 10.0 and pcp < 15.0 then '10.0-15.0'
when pcp >= 15.0 and pcp < 20.0 then '15.0-20.0'
when pcp >= 20.0 and pcp < 30.0 then '20.0-30.0'
when pcp >= 30.0 and pcp < 40.0 then '30.0-40.0'
when pcp >= 40.0 and pcp < 50.0 then '40.0-50.0'
when pcp >= 50.0 and pcp < 70.0 then '50.0-70.0'
when pcp >= 70.0 and pcp < 110.0 then '70.0-110.0'
when pcp >= 110.0 then '110.0 이상' else '0' end;
-- 기온 범주화하여 새 컬럼으로 추가
alter table delivery
add column temp_g varchar(10);
update delivery
set temp_g = case
when temp >= 0 and temp < 10 then '0-10'
when temp >= 10 and temp < 20 then '10-20'
when temp >= 20 and temp < 30 then '20-30'
when temp >= 30 and temp < 40 then '30-40'
when temp >= 40 then '40 이상'
when temp >= -10 and temp < 0 then '-10-0'
when temp < -10 then '-10 미만' end;
import pandas as pd
cursor = remote.cursor(buffered=True)
cursor.execute("select * from delivery")
result = cursor.fetchall()
delivery = pd.DataFrame(result, columns=['date','month','hour','dayweek','dlvr_type','gu','count','rain_type','hmd','pcp','temp','wind_spd','wind_str','pcp_g','temp_g'])
변수간 상관관계
# 변수간 상관관계
colormap = 'vlag_r' #plt.cm.PuBu
plt.figure(figsize=(6,6))
plt.title("Features Correlation")
mask = np.triu(np.ones_like(delivery.drop(columns='hour').corr(), dtype=np.bool))
sns.heatmap(round(delivery.drop(columns='hour').corr(),4),
linewidths=0.1, vmax=1.0, vmin=-1.0, square=True, # square: 정사각형 모양
mask=mask, cmap=colormap, linecolor='white', annot=True)
plt.yticks(rotation=0)
plt.show();
전체 강우량, 기온, 배달건수 조회
# 전체 강우량, 기온, 배달건수 데이터 시각화하여 조회
temp = delivery[['date','pcp','temp','count']]
plt.figure(figsize=(20,10))
sns.lineplot(x='date', y='pcp', data=temp, label='pcp')
sns.lineplot(x='date', y='temp', data=temp, label='temp')
sns.lineplot(x='date', y='count', data=temp, label='count')
plt.legend()
plt.show()
월별 총 배달건수
# 월별 총 배달건수
cursor = remote.cursor(buffered=True)
cursor.execute(
"select month, sum(count) \
from delivery \
group by month")
df = pd.DataFrame(cursor.fetchall(), columns=['month','count'])
sns.barplot(data=df, x='count', y='month', palette='Blues_r');
기온별 시간당 평균 배달건수
# 기온에 따른 시간당 평균 배달건수
cursor = remote.cursor(buffered=True)
cursor.execute(
"select temp_g, sum(count), avg(count) \
from delivery \
group by temp_g")
df = pd.DataFrame(cursor.fetchall(), columns=['temp','count_total' ,'count_avg'])
sns.barplot(data=df.sort_values(by='count_avg', ascending=False), x='count_avg', y='temp', palette='Blues_r');
강우량별 시간당 평균 배달건수
# 강우량에 따른 시간당 평균 배달건수
cursor = remote.cursor(buffered=True)
cursor.execute(
"select pcp_g, sum(count), avg(count) \
from delivery \
group by pcp_g")
df = pd.DataFrame(cursor.fetchall(), columns=['pcp','count_total' ,'count_avg'])
sns.barplot(data=df.sort_values(by='count_avg', ascending=False), x='count_avg', y='pcp', palette='Blues_r');
날씨별 시간당 평균 배달건수
# 날씨별 시간당 평균 배달건수
cursor = remote.cursor(buffered=True)
cursor.execute(
"select rain_type, avg(count) \
from delivery \
group by rain_type \
order by avg(count) desc")
df = pd.DataFrame(cursor.fetchall(), columns=['rain_type','count_avg'])
sns.barplot(data=df.sort_values(by='count_avg', ascending=False), x='count_avg', y='rain_type', palette='Blues_r');
배달유형별 총 배달건수
# 배달유형별 총 배달건수
cursor = remote.cursor(buffered=True)
cursor.execute(
"select dlvr_type, sum(count) \
from delivery \
group by dlvr_type \
order by sum(count) desc")
df = pd.DataFrame(cursor.fetchall(), columns=['dlvr_type','count'])
sns.barplot(data=df, x='count', y='dlvr_type', palette='Blues_r');
요일별 총 배달건수
# 요일별 총 배달건수
cursor = remote.cursor(buffered=True)
cursor.execute(
"select dayweek, sum(count), avg(count)\
from delivery\
group by dayweek \
order by sum(count) desc")
df = pd.DataFrame(cursor.fetchall(), columns=['dlvr_type','total_count','avg_count'])
sns.barplot(data=df, x='total_count', y='dlvr_type', palette='Blues_r');