[공모전 수상작 리뷰] Reactjs+Nodejs+python+scikit-learn{ PCA(주성분 분석), VAR(다변량시계열분석)}으로 공연 예매 추이 시나리오 별 예측하는 서비스 만들어보기 - 데이터 분석 편(1)

Design.C·2022년 1월 6일
1

데이터 분석을 하며 공부한 점

  • 다양한 소스에서 데이터 수집
  • 수집한 데이터를 목적에 맞게 전처리
  • 데이터 모델링 및 모델 간 교차검증
  • 다변량 시계열 분석 최종 모델 개발

다양한 소스에서 데이터 수집

  • KOPIS에서 제공해준 데이터
  • KOPIS 홈페이지에 공개해 둔 데이터
  • KDI 에서 허용한 데이터
  • 공공데이터포털
  • 서울시 열린 데이터 광장
  • 그 외 필요한 경우 크롤링

수집한 데이터를 목적에 맞게 전처리

Covid19 확진자 수 추이를 시계열 데이터로 전처리하는 과정

# 필요한 라이브러리
import requests
from bs4 import BeautifulSoup as bs
from urllib import parse
import pandas as pd
serviceKey = '본인의 key' # 공공데이터포털에서 발급받은 서비스키
params = {'ServiceKey':parse.unquote(serviceKey), # 서비스키(필수)
          'startCreateDt':20200101, # 데이터 생성일 시작범위(선택)
          'endCreateDt':20210831} # 데이터 생성일 종료범위(선택)
# 서비스URL
url = 'http://openapi.data.go.kr/openapi/service/rest/Covid19/getCovid19InfStateJson?'

res = requests.get(url, params=params)
soup = bs(res.text, 'lxml')
items = soup.find_all('item')
lst = []
for y in items:
    l = {}
    for x in y:
        l[x.name]=x.text
    lst.append(l)
# list를 dataframe으로 만들기
df = pd.DataFrame(lst)
df.columns = df.columns.map(response)
df
누적확진률 누적검사 누적검사완료 치료중 격리해제 등록일시 사망자 확진자 검사진행 음성 고유번호 기준일 기준시간 수정
0 1.5018316920 4213867 4043529 17557 42270 2020-12-31 09:35:07.493 900 60727 170338 3982802 371 20201231 00:00 2021-04-20 16:12:20.079
1 1.4985939397 4159509 3987738 17446 41435 2020-12-30 09:35:46.266 879 59760 171771 3927978 370 20201230 00:00 2021-04-20 16:13:02.935
2 1.4949567172 4098166 3927204 17148 40703 2020-12-29 09:37:33.024 859 58710 170962 3868494 369 20201229 00:00 2021-04-20 16:13:28.79
3 1.4906876613 4038294 3868416 17579 39268 2020-12-28 09:40:11.19 819 57666 169878 3810750 368 20201228 00:00 2021-04-20 16:14:02.198
4 1.4809513093 4006400 3839424 17012 39040 2020-12-27 09:32:40.326 808 56860 166976 3782564 367 20201227 00:00 2021-04-20 16:14:27.518
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
331 NaN NaN NaN NaN 0 2020-02-03 21:26:59.59 0 0 0 NaN 4 20200204 00:00 2020-02-03 21:26:59.59
332 NaN NaN NaN NaN 0 2020-02-03 14:41:17.17 0 15 0 NaN 3 20200203 09:00 2020-02-03 14:41:17.17
333 NaN NaN NaN NaN 2 2020-02-03 12:22:49.49 2 2 2 NaN 2 20200202 09:00 2020-02-03 12:22:49.49
334 NaN NaN NaN NaN 1 2020-01-31 17:47:33.33 0 0 1 NaN 1 20200201 18:00 2020-01-31 17:47:33.33
335 NaN NaN NaN 1 2020-01-31 17:47:33.33 0 0 1 NaN 1 20200101 18:00 2020-02-03 12:21:56.56

336 rows × 14 columns

df['확진자']=df['확진자'].astype(int)
decidecnt=[]
for index, val in enumerate(df['확진자']):
    if(index < len(df['확진자'])-1):
        decidecnt.append(df['확진자'][index]-df['확진자'][index+1])
decidecnt = pd.Series(decidecnt)
df['확진자'] = decidecnt
type(df['확진자'][0])
df['확진자'][0]-df['확진자'][1]
967
df['등록일시']=df['등록일시'].str.split(" ").str[0]
df['등록일시'] = df['등록일시'].apply(pd.to_datetime)
df['등록일시'].value_counts()
2020-02-03    3
2020-01-31    2
2020-08-18    2
2020-12-19    2
2020-12-09    2
             ..
2020-05-29    1
2020-04-28    1
2020-03-28    1
2020-02-26    1
2020-02-11    1
Name: 등록일시, Length: 330, dtype: int64
df_refined = df[['등록일시','확진자']].sort_values('등록일시',ascending=True).reset_index()
df_refined.drop('index',axis=1, inplace=True)
df_refined.iloc[0,:]
등록일시    2020-01-31 00:00:00
확진자                       0
Name: 0, dtype: object
df_refined.to_csv('코로나19일별확진자수.csv')
covid19 = pd.read_excel('결과물파일.xlsx')
covid19['날짜'] = covid19['날짜'].astype(str)
covid19['날짜'] = covid19['날짜'].str.split("-").str[0]+\
covid19['날짜'].str.split("-").str[1]+\
covid19['날짜'].str.split("-").str[2]
covid19['날짜']
0      20200101
1      20200102
2      20200103
3      20200104
4      20200105
         ...   
361    20201227
362    20201228
363    20201229
364    20201230
365    20201231
Name: 날짜, Length: 366, dtype: object
covid19 = pd.read_csv('코로나19일별확진자수.csv', encoding='cp949')

다른 데이터들 수집 및 전처리 과정

#OTT데이터 전처리
ott = ott[:-1]
ott_user_cnt = []
ott_user_install = []
ott_user_time = []
ott
for index, j in enumerate(ott):
    block = j.split(",")
    ott_user_cnt.append(block[1].split(":")[1])
    ott_user_install.append(block[2].split(":")[1])
    ott_user_time.append(block[3].split(":")[1][:-1])
dict_data = {'ott이용자수':ott_user_cnt,
            'ott신규설치': ott_user_install,
            'ott사용시간': ott_user_time}
df = pd.DataFrame(dict_data)
df.to_csv("201901_202008_OTT이용.csv")
#배달앱 데이터 전처리
delivery=read_txt("배달앱이용.txt")
delivery = delivery[:-1]
del_user_cnt = []
del_user_install = []
del_user_time = []
ott
for index, j in enumerate(delivery):
    block = j.split(",")
    del_user_cnt.append(block[1].split(":")[1])
    del_user_install.append(block[2].split(":")[1])
    del_user_time.append(block[3].split(":")[1][:-1])
len(del_user_cnt)
973
dict_data = {'배달앱이용자수':del_user_cnt,
            '배달앱신규설치': del_user_install,
            '배달앱사용시간': del_user_time}
df = pd.DataFrame(dict_data)
df.to_csv("201901_202008_배달앱이용.csv")
#화상회의앱 데이터 전처리
meeting = read_txt("화상회의앱.txt")
meeting = meeting[:-1]
user_cnt = []
user_install = []
user_time = []
ott
for index, j in enumerate(meeting):
    block = j.split(",")
    user_cnt.append(block[1].split(":")[1])
    user_install.append(block[2].split(":")[1])
    user_time.append(block[3].split(":")[1][:-1])
print(user_cnt)
print(user_install)
print(user_time)
dict_data = {'화상회의앱이용자수':user_cnt,
            '화상회의앱신규설치': user_install,
            '화상회의앱사용시간': user_time}
df = pd.DataFrame(dict_data)
df.to_csv("201901_202008_화상회의앱이용.csv")
#중고거래앱 데이터 전처리
used = read_txt("중고거래앱.txt")
used = used[:-1]
user_cnt = []
user_install = []
user_time = []
for index, j in enumerate(used):
    block = j.split(",")
    user_cnt.append(block[1].split(":")[1])
    user_install.append(block[2].split(":")[1])
    user_time.append(block[3].split(":")[1][:-1])
print(user_cnt)
print(user_install)
print(user_time)
dict_data = {'중고거래앱이용자수':user_cnt,
            '중고거래앱신규설치': user_install,
            '중고거래앱사용시간': user_time}
df = pd.DataFrame(dict_data)
df.to_csv("201901_202008_중고거래앱이용.csv")
#코로나 누락된 일별 데이터가 포함된 세 데이터 전처리
covid = read_txt("코로나.txt")
covid[0].split(",")[0].split(":")[2].split("T")[0].split("\"")[1]
'2020-01-20'
covid = read_txt("코로나.txt")
covid = covid[:-1]
date = []
user_cnt = []
for index, j in enumerate(covid):
    block = j.split(",")
    date.append(block[0].split(":")[2].split("T")[0].split("\"")[1])
    user_cnt.append(block[3].split(":")[1])
print(date)
print(user_cnt)
dict_data = {'날짜':date,'신규확진자수':user_cnt}
df = pd.DataFrame(dict_data)
df.to_csv("201901_202007_코로나확진자수.csv")
['2020-01-20', '2020-01-21', '2020-01-22', '2020-01-23', '2020-01-24', '2020-01-25', '2020-01-26', '2020-01-27', '2020-01-28', '2020-01-29', '2020-01-30', '2020-01-31', '2020-02-01', '2020-02-02', '2020-02-03', '2020-02-04', '2020-02-05', '2020-02-06', '2020-02-07', '2020-02-08', '2020-02-09', '2020-02-10', '2020-02-11', '2020-02-12', '2020-02-13', '2020-02-14', '2020-02-15', '2020-02-16', '2020-02-17', '2020-02-18', '2020-02-19', '2020-02-20', '2020-02-21', '2020-02-22', '2020-02-23', '2020-02-24', '2020-02-25', '2020-02-26', '2020-02-27', '2020-02-28', '2020-02-29', '2020-03-01', '2020-03-02', '2020-03-03', '2020-03-04', '2020-03-05', '2020-03-06', '2020-03-07', '2020-03-08', '2020-03-09', '2020-03-10', '2020-03-11', '2020-03-12', '2020-03-13', '2020-03-14', '2020-03-15', '2020-03-16', '2020-03-17', '2020-03-18', '2020-03-19', '2020-03-20', '2020-03-21', '2020-03-22', '2020-03-23', '2020-03-24', '2020-03-25', '2020-03-26', '2020-03-27', '2020-03-28', '2020-03-29', '2020-03-30', '2020-03-31', '2020-04-01', '2020-04-02', '2020-04-03', '2020-04-04', '2020-04-05', '2020-04-06', '2020-04-07', '2020-04-08', '2020-04-09', '2020-04-10', '2020-04-11', '2020-04-12', '2020-04-13', '2020-04-14', '2020-04-15', '2020-04-16', '2020-04-17', '2020-04-18', '2020-04-19', '2020-04-20', '2020-04-21', '2020-04-22', '2020-04-23', '2020-04-24', '2020-04-25', '2020-04-26', '2020-04-27', '2020-04-28', '2020-04-29', '2020-04-30', '2020-05-01', '2020-05-02', '2020-05-03', '2020-05-04', '2020-05-05', '2020-05-06', '2020-05-07', '2020-05-08', '2020-05-09', '2020-05-10', '2020-05-11', '2020-05-12', '2020-05-13', '2020-05-14', '2020-05-15', '2020-05-16', '2020-05-17', '2020-05-18', '2020-05-19', '2020-05-20', '2020-05-21', '2020-05-22', '2020-05-23', '2020-05-24', '2020-05-25', '2020-05-26', '2020-05-27', '2020-05-28', '2020-05-29', '2020-05-30', '2020-05-31', '2020-06-01', '2020-06-02', '2020-06-03', '2020-06-04', '2020-06-05', '2020-06-06', '2020-06-07', '2020-06-08', '2020-06-09', '2020-06-10', '2020-06-11', '2020-06-12', '2020-06-13', '2020-06-14', '2020-06-15', '2020-06-16', '2020-06-17', '2020-06-18', '2020-06-19', '2020-06-20', '2020-06-21', '2020-06-22', '2020-06-23', '2020-06-24', '2020-06-25', '2020-06-26', '2020-06-27', '2020-06-28', '2020-06-29', '2020-06-30', '2020-07-01', '2020-07-02', '2020-07-03', '2020-07-04', '2020-07-05', '2020-07-06', '2020-07-07', '2020-07-08', '2020-07-09', '2020-07-10', '2020-07-11', '2020-07-12', '2020-07-13', '2020-07-14', '2020-07-15', '2020-07-16', '2020-07-17', '2020-07-18', '2020-07-19', '2020-07-20', '2020-07-21', '2020-07-22', '2020-07-23', '2020-07-24', '2020-07-25', '2020-07-26', '2020-07-27', '2020-07-28', '2020-07-29', '2020-07-30', '2020-07-31', '2020-08-01', '2020-08-02', '2020-08-03', '2020-08-04', '2020-08-05', '2020-08-06', '2020-08-07', '2020-08-08', '2020-08-09', '2020-08-10', '2020-08-11', '2020-08-12', '2020-08-13', '2020-08-14', '2020-08-15', '2020-08-16', '2020-08-17', '2020-08-18', '2020-08-19', '2020-08-20', '2020-08-21', '2020-08-22', '2020-08-23', '2020-08-24', '2020-08-25', '2020-08-26', '2020-08-27', '2020-08-28', '2020-08-29', '2020-08-30', '2020-08-31', '2020-09-01', '2020-09-02', '2020-09-03', '2020-09-04', '2020-09-05', '2020-09-06', '2020-09-07', '2020-09-08', '2020-09-09', '2020-09-10', '2020-09-11', '2020-09-12', '2020-09-13', '2020-09-14', '2020-09-15', '2020-09-16', '2020-09-17', '2020-09-18', '2020-09-19', '2020-09-20', '2020-09-21', '2020-09-22', '2020-09-23', '2020-09-24', '2020-09-25', '2020-09-26', '2020-09-27', '2020-09-28', '2020-09-29', '2020-09-30', '2020-10-01', '2020-10-02', '2020-10-03', '2020-10-04', '2020-10-05', '2020-10-06', '2020-10-07', '2020-10-08', '2020-10-09', '2020-10-10', '2020-10-11', '2020-10-12', '2020-10-13', '2020-10-14', '2020-10-15', '2020-10-16', '2020-10-17', '2020-10-18', '2020-10-19', '2020-10-20', '2020-10-21', '2020-10-22', '2020-10-23', '2020-10-24', '2020-10-25', '2020-10-26', '2020-10-27', '2020-10-28', '2020-10-29', '2020-10-30', '2020-10-31', '2020-11-01', '2020-11-02', '2020-11-03', '2020-11-04', '2020-11-05', '2020-11-06', '2020-11-07', '2020-11-08', '2020-11-09', '2020-11-10', '2020-11-11', '2020-11-12', '2020-11-13', '2020-11-14', '2020-11-15', '2020-11-16', '2020-11-17', '2020-11-18', '2020-11-19', '2020-11-20', '2020-11-21', '2020-11-22', '2020-11-23', '2020-11-24', '2020-11-25', '2020-11-26', '2020-11-27', '2020-11-28', '2020-11-29', '2020-11-30', '2020-12-01', '2020-12-02', '2020-12-03', '2020-12-04', '2020-12-05', '2020-12-06', '2020-12-07', '2020-12-08', '2020-12-09', '2020-12-10', '2020-12-11', '2020-12-12', '2020-12-13', '2020-12-14', '2020-12-15', '2020-12-16', '2020-12-17', '2020-12-18', '2020-12-19', '2020-12-20', '2020-12-21', '2020-12-22', '2020-12-23', '2020-12-24', '2020-12-25', '2020-12-26', '2020-12-27', '2020-12-28', '2020-12-29', '2020-12-30', '2020-12-31', '2021-01-01', '2021-01-02', '2021-01-03', '2021-01-04', '2021-01-05', '2021-01-06', '2021-01-07', '2021-01-08', '2021-01-09', '2021-01-10', '2021-01-11', '2021-01-12', '2021-01-13', '2021-01-14', '2021-01-15', '2021-01-16', '2021-01-17', '2021-01-18', '2021-01-19', '2021-01-20', '2021-01-21', '2021-01-22', '2021-01-23', '2021-01-24', '2021-01-25', '2021-01-26', '2021-01-27', '2021-01-28', '2021-01-29', '2021-01-30', '2021-01-31', '2021-02-01', '2021-02-02', '2021-02-03', '2021-02-04', '2021-02-05', '2021-02-06', '2021-02-07', '2021-02-08', '2021-02-09', '2021-02-10', '2021-02-11', '2021-02-12', '2021-02-13', '2021-02-14', '2021-02-15', '2021-02-16', '2021-02-17', '2021-02-18', '2021-02-19', '2021-02-20', '2021-02-21', '2021-02-22', '2021-02-23', '2021-02-24', '2021-02-25', '2021-02-26', '2021-02-27', '2021-02-28', '2021-03-01', '2021-03-02', '2021-03-03', '2021-03-04', '2021-03-05', '2021-03-06', '2021-03-07', '2021-03-08', '2021-03-09', '2021-03-10', '2021-03-11', '2021-03-12', '2021-03-13', '2021-03-14', '2021-03-15', '2021-03-22', '2021-03-21', '2021-03-20', '2021-03-19', '2021-03-18', '2021-03-17', '2021-03-16', '2021-03-23', '2021-03-29', '2021-03-24', '2021-03-25', '2021-03-26', '2021-03-27', '2021-03-28', '2021-04-04', '2021-04-03', '2021-04-02', '2021-04-01', '2021-03-31', '2021-03-30', '2021-04-11', '2021-04-10', '2021-04-09', '2021-04-08', '2021-04-07', '2021-04-06', '2021-04-05', '2021-04-18', '2021-04-17', '2021-04-16', '2021-04-15', '2021-04-14', '2021-04-13', '2021-04-12', '2021-04-25', '2021-04-24', '2021-04-23', '2021-04-22', '2021-04-21', '2021-04-20', '2021-04-19', '2021-05-02', '2021-05-01', '2021-04-30', '2021-04-29', '2021-04-28', '2021-04-27', '2021-04-26', '2021-05-09', '2021-05-08', '2021-05-07', '2021-05-06', '2021-05-05', '2021-05-04', '2021-05-03', '2021-05-12', '2021-05-11', '2021-05-10', '2021-05-23', '2021-05-22', '2021-05-21', '2021-05-20', '2021-05-19', '2021-05-18', '2021-05-17', '2021-05-16', '2021-05-15', '2021-05-14', '2021-05-13', '2021-05-30', '2021-05-29', '2021-05-28', '2021-05-27', '2021-05-26', '2021-05-25', '2021-05-24', '2021-06-05', '2021-06-04', '2021-06-03', '2021-06-02', '2021-06-01', '2021-06-13', '2021-06-12', '2021-06-11', '2021-06-10', '2021-06-09', '2021-06-08', '2021-06-07', '2021-06-20', '2021-06-19', '2021-06-18', '2021-06-17', '2021-06-16', '2021-06-15', '2021-06-14', '2021-06-27', '2021-06-26', '2021-06-25', '2021-06-24', '2021-06-23', '2021-06-22', '2021-06-21', '2021-06-06', '2021-05-31', '2021-07-05', '2021-07-04', '2021-07-03', '2021-07-02', '2021-07-01', '2021-06-30', '2021-06-29', '2021-06-28', '2021-07-11', '2021-07-10', '2021-07-09', '2021-07-08', '2021-07-07', '2021-07-06', '2021-07-18', '2021-07-17', '2021-07-16', '2021-07-15', '2021-07-14', '2021-07-13', '2021-07-12', '2021-07-25', '2021-07-24', '2021-07-23', '2021-07-22', '2021-07-21', '2021-07-20', '2021-07-19', '2021-08-01', '2021-07-31', '2021-07-30', '2021-07-29', '2021-07-28', '2021-07-27', '2021-07-26', '2021-08-09', '2021-08-08', '2021-08-07', '2021-08-06', '2021-08-05', '2021-08-04', '2021-08-03']
[' 1', ' 0', ' 0', ' 0', ' 1', ' 0', ' 1', ' 1', ' 0', ' 0', ' 3', ' 4', ' 1', ' 3', ' 0', ' 1', ' 2', ' 5', ' 1', ' 0', ' 3', ' 0', ' 1', ' 0', ' 0', ' 0', ' 0', ' 1', ' 1', ' 1', ' 15', ' 36', ' 74', ' 190', ' 210', ' 207', ' 130', ' 253', ' 449', ' 427', ' 909', ' 595', ' 686', ' 600', ' 516', ' 438', ' 518', ' 483', ' 367', ' 248', ' 131', ' 242', ' 114', ' 110', ' 107', ' 76', ' 74', ' 84', ' 93', ' 152', ' 87', ' 147', ' 98', ' 64', ' 76', ' 100', ' 104', ' 91', ' 146', ' 105', ' 78', ' 125', ' 101', ' 89', ' 86', ' 94', ' 81', ' 47', ' 47', ' 53', ' 39', ' 27', ' 30', ' 32', ' 25', ' 27', ' 27', ' 22', ' 22', ' 18', ' 8', ' 13', ' 9', ' 11', ' 8', ' 6', ' 10', ' 10', ' 10', ' 14', ' 9', ' 4', ' 9', ' 6', ' 13', ' 8', ' 3', ' 2', ' 4', ' 12', ' 18', ' 34', ' 35', ' 27', ' 26', ' 29', ' 27', ' 19', ' 13', ' 15', ' 13', ' 32', ' 12', ' 20', ' 23', ' 25', ' 16', ' 19', ' 40', ' 79', ' 58', ' 39', ' 27', ' 35', ' 38', ' 49', ' 39', ' 39', ' 51', ' 57', ' 38', ' 38', ' 50', ' 45', ' 56', ' 49', ' 34', ' 37', ' 34', ' 43', ' 59', ' 49', ' 67', ' 48', ' 17', ' 46', ' 51', ' 28', ' 39', ' 51', ' 62', ' 42', ' 43', ' 51', ' 54', ' 61', ' 63', ' 61', ' 48', ' 44', ' 62', ' 50', ' 45', ' 35', ' 44', ' 62', ' 33', ' 39', ' 61', ' 60', ' 39', ' 34', ' 26', ' 45', ' 63', ' 59', ' 41', ' 113', ' 58', ' 25', ' 28', ' 48', ' 18', ' 36', ' 31', ' 30', ' 23', ' 34', ' 33', ' 43', ' 20', ' 43', ' 36', ' 28', ' 34', ' 54', ' 56', ' 103', ' 166', ' 279', ' 197', ' 246', ' 297', ' 288', ' 324', ' 332', ' 397', ' 266', ' 280', ' 320', ' 441', ' 371', ' 323', ' 299', ' 248', ' 235', ' 267', ' 195', ' 198', ' 168', ' 167', ' 119', ' 136', ' 156', ' 155', ' 176', ' 136', ' 121', ' 109', ' 106', ' 113', ' 153', ' 126', ' 110', ' 82', ' 70', ' 61', ' 110', ' 125', ' 114', ' 61', ' 95', ' 50', ' 38', ' 113', ' 77', ' 63', ' 75', ' 64', ' 73', ' 75', ' 114', ' 69', ' 54', ' 72', ' 58', ' 98', ' 102', ' 84', ' 110', ' 47', ' 73', ' 91', ' 76', ' 58', ' 91', ' 121', ' 155', ' 77', ' 61', ' 119', ' 88', ' 103', ' 125', ' 114', ' 127', ' 124', ' 97', ' 75', ' 117', ' 125', ' 145', ' 89', ' 143', ' 126', ' 100', ' 146', ' 143', ' 191', ' 205', ' 208', ' 222', ' 230', ' 285', ' 325', ' 348', ' 386', ' 330', ' 271', ' 349', ' 382', ' 581', ' 555', ' 503', ' 450', ' 438', ' 451', ' 511', ' 540', ' 628', ' 577', ' 631', ' 615', ' 589', ' 670', ' 680', ' 689', ' 950', ' 1030', ' 718', ' 880', ' 1078', ' 1014', ' 1064', ' 1050', ' 1097', ' 925', ' 867', ' 1090', ' 985', ' 1241', ' 1132', ' 970', ' 807', ' 1045', ' 1050', ' 967', ' 1029', ' 824', ' 657', ' 1020', ' 715', ' 840', ' 870', ' 674', ' 641', ' 665', ' 451', ' 537', ' 562', ' 524', ' 513', ' 580', ' 520', ' 389', ' 386', ' 404', ' 401', ' 346', ' 431', ' 392', ' 437', ' 350', ' 559', ' 497', ' 469', ' 456', ' 355', ' 305', ' 336', ' 467', ' 451', ' 370', ' 393', ' 371', ' 287', ' 302', ' 444', ' 504', ' 403', ' 362', ' 326', ' 343', ' 457', ' 621', ' 621', ' 561', ' 448', ' 416', ' 332', ' 356', ' 440', ' 395', ' 387', ' 415', ' 355', ' 355', ' 344', ' 444', ' 424', ' 398', ' 418', ' 416', ' 346', ' 446', ' 470', ' 465', ' 488', ' 490', ' 459', ' 382', ' 415', ' 451', ' 447', ' 463', ' 445', ' 469', ' 363', ' 346', ' 384', ' 428', ' 430', ' 494', ' 505', ' 482', ' 543', ' 543', ' 557', ' 551', ' 503', ' 444', ' 614', ' 677', ' 670', ' 700', ' 667', ' 477', ' 473', ' 672', ' 658', ' 673', ' 698', ' 730', ' 541', ' 586', ' 644', ' 785', ' 797', ' 735', ' 731', ' 549', ' 532', ' 606', ' 627', ' 661', ' 679', ' 769', ' 512', ' 499', ' 564', ' 701', ' 525', ' 574', ' 676', ' 541', ' 488', ' 635', ' 511', ' 463', ' 585', ' 666', ' 561', ' 646', ' 654', ' 528', ' 619', ' 610', ' 681', ' 747', ' 715', ' 480', ' 533', ' 587', ' 629', ' 706', ' 516', ' 530', ' 744', ' 695', ' 681', ' 677', ' 459', ' 452', ' 565', ' 556', ' 611', ' 602', ' 453', ' 485', ' 429', ' 482', ' 507', ' 540', ' 545', ' 373', ' 397', ' 614', ' 668', ' 634', ' 610', ' 645', ' 394', ' 357', ' 555', ' 429', ' 711', ' 743', ' 794', ' 826', ' 761', ' 794', ' 595', ' 501', ' 1324', ' 1378', ' 1316', ' 1275', ' 1212', ' 746', ' 1454', ' 1452', ' 1536', ' 1600', ' 1615', ' 1150', ' 1100', ' 1487', ' 1629', ' 1630', ' 1842', ' 1784', ' 1277', ' 1251', ' 1442', ' 1539', ' 1710', ' 1673', ' 1895', ' 1363', ' 1318', ' 1492', ' 1729', ' 1823', ' 1703', ' 1775', ' 1725', ' 1200']

df.to_csv("201901_202007_코로나확진자수.csv")
df
날짜 신규확진자수
0 2020-01-20 1
1 2020-01-21 0
2 2020-01-22 0
3 2020-01-23 0
4 2020-01-24 1
... ... ...
562 2021-08-07 1823
563 2021-08-06 1703
564 2021-08-05 1775
565 2021-08-04 1725
566 2021-08-03 1200

567 rows × 2 columns

df = df.sort_values(by='날짜')['신규확진자수']
df
0          1
1          0
2          0
3          0
4          1
       ...  
564     1775
563     1703
562     1823
561     1729
560     1492
Name: 신규확진자수, Length: 567, dtype: object
#서울 지하철 이용 데이터 전처리(모델에는 이 데이터가 쓰이지는 않음)
subway = read_txt("서울지하철이용.txt")

user_cnt = []
time = []
for index, j in enumerate(subway):
    user_cnt.append(j.split(",")[1].split("{")[1][:-2].split(":")[1][1:])
    temp = j.split(",")[0].split("{")[1][:-2].split(":")[1][1:].split("\"")[1].\
    split("T")[0]
    time.append(temp)
print(time)
dict_data = {'지하철이용자수':user_cnt}
df = pd.DataFrame(dict_data)
df.to_csv("201901_202007_지하철이용자수.csv")
# dict_data = {'날짜': time,
#             '확인':time}
# df = pd.DataFrame(dict_data)
df_date = pd.read_csv(
    '내 저장 경로\\날짜.csv'
                             ,encoding='cp949')
temp = pd.merge(left = df , right = df_date, how = "outer", on = "날짜")
temp = temp.sort_values(by='날짜')
temp.to_csv('내 저장 경로\\누락날짜확인.csv')
subway = read_txt("서울지하철이용.txt")
subway
['365: {date: "2019-01-01T00:00:00.000Z", sum: {total: 2169874}}',
 '366: {date: "2019-01-02T00:00:00.000Z", sum: {total: 5067609}}',
 '367: {date: "2019-01-03T00:00:00.000Z", sum: {total: 5255830}}',
 '368: {date: "2019-01-04T00:00:00.000Z", sum: {total: 5485567}}',
 '369: {date: "2019-01-05T00:00:00.000Z", sum: {total: 3921397}}',
 '370: {date: "2019-01-06T00:00:00.000Z", sum: {total: 2777687}}',
 '371: {date: "2019-01-07T00:00:00.000Z", sum: {total: 5199344}}',
  ...
  ...

 '1269: {date: "2021-06-23T00:00:00.000Z", sum: {total: 4393854}}',
 '1270: {date: "2021-06-24T00:00:00.000Z", sum: {total: 4433884}}',
 '1271: {date: "2021-06-25T00:00:00.000Z", sum: {total: 4565923}}',
 '1272: {date: "2021-06-26T00:00:00.000Z", sum: {total: 2918361}}',
 '1273: {date: "2021-06-27T00:00:00.000Z", sum: {total: 2113908}}',
 '1274: {date: "2021-06-28T00:00:00.000Z", sum: {total: 4247266}}',
 '1275: {date: "2021-06-29T00:00:00.000Z", sum: {total: 4373982}}',
 '1276: {date: "2021-06-30T00:00:00.000Z", sum: {total: 4358985}}']
#지하철 직접 집계
subway_raw = pd.read_csv("지하철_202108.csv", encoding="cp949")
subway_agg = subway_raw.groupby("사용일자").sum()
subway_agg = subway_agg.sort_values(by='사용일자')
subway_agg.to_csv("지하철202107_202108.csv")
movie = read_txt("영화(201801_202106).txt")
#영화관람 데이터 전처리(모델에는 안쓰임)
movie_list = []
for i in movie:
    temp = i.split(",")[3].split(":")[1][1:]
    movie_list.append(temp)
movie_list
dict_data = {"movie_sales":movie_list}
movie_df = pd.DataFrame(dict_data)
movie_df
movie_sales
0 10309962150
1 2805968850
2 3086189000
3 3357281295
4 7145939653
... ...
907 3338439630
908 2990616610
909 999626730
910 996710000
911 1243958090

912 rows × 1 columns

df = pd.read_csv("내 저장 경로\\CARD_SUBWAY_MONTH_201901.csv",encoding='cp949')
#모델에 쓰인 지하철 데이터 전처리
import pandas as pd
subway_list = []
for i in range(1,10):
    subway_list.append(pd.read_csv(f"내 저장 경로\\CARD_SUBWAY_MONTH_20190{i}.csv",encoding='cp949'))
df_10=pd.read_csv(f"내 저장 경로\\CARD_SUBWAY_MONTH_201910.csv",encoding='cp949')
df_11=pd.read_csv(f"내 저장 경로\\CARD_SUBWAY_MONTH_201911.csv",encoding='cp949')
df_12=pd.read_csv(f"내 저장 경로\\CARD_SUBWAY_MONTH_201912.csv",encoding='cp949')
subway_list.append(df_10)
subway_list.append(df_11)
subway_list.append(df_12)
subway_list
[           사용일자    노선명          역명   승차총승객수   하차총승객수      등록일자
 0      20190101    2호선       을지로4가   3862.0   3728.0  20190104
 1      20190101    2호선       을지로3가   8104.0   7554.0  20190104
 2      20190101    2호선       을지로입구  22478.0  21330.0  20190104
 3      20190101    2호선          시청   8381.0   6049.0  20190104
 4      20190101    1호선         동묘앞   8045.0   8504.0  20190104
 ...         ...    ...         ...      ...      ...       ...
 18329  20190131  우이신설선      북한산보국문   6195.0   5786.0  20190203
 18330  20190131  우이신설선          정릉   4600.0   4211.0  20190203
 18331  20190131  우이신설선  성신여대입구(돈암)   3922.0   4289.0  20190203
 18332  20190131  우이신설선          보문   1659.0   1693.0  20190203
 18333  20190131  우이신설선         신설동   2075.0   2215.0  20190203
 
 ... 후략 ... 
 [18333 rows x 6 columns]]

#df_8은 나누기 2
subway_list.append(df)
df1=pd.read_csv(f"내 저장 경로\\CARD_SUBWAY_MONTH_202001.csv",encoding='cp949')
df2=pd.read_csv(f"내 저장 경로\\CARD_SUBWAY_MONTH_202002.csv",encoding='cp949')
df3=pd.read_csv(f"내 저장 경로\\CARD_SUBWAY_MONTH_202003.csv",encoding='cp949')
df4=pd.read_csv(f"내 저장 경로\\CARD_SUBWAY_MONTH_202004.csv",encoding='cp949')
subway_list.append(df1)
subway_list.append(df2)
subway_list.append(df3)
subway_list.append(df4)
df1=pd.read_csv(f"내 저장 경로\\CARD_SUBWAY_MONTH_202005.csv",encoding='cp949')
df2=pd.read_csv(f"내 저장 경로\\CARD_SUBWAY_MONTH_202006.csv",encoding='cp949')
df3=pd.read_csv(f"내 저장 경로\\CARD_SUBWAY_MONTH_202007.csv",encoding='cp949')
df4=pd.read_csv(f"내 저장 경로\\CARD_SUBWAY_MONTH_202008.csv",encoding='cp949')
subway_list.append(df1)
subway_list.append(df2)
subway_list.append(df3)
subway_list.append(df4)
df1=pd.read_csv(f"내 저장 경로\\CARD_SUBWAY_MONTH_202009.csv",encoding='cp949')
df2=pd.read_csv(f"내 저장 경로\\CARD_SUBWAY_MONTH_202010.csv",encoding='cp949')
df3=pd.read_csv(f"내 저장 경로\\CARD_SUBWAY_MONTH_202011.csv",encoding='cp949')
df4=pd.read_csv(f"내 저장 경로\\CARD_SUBWAY_MONTH_202012.csv",encoding='cp949')
subway_list.append(df1)
subway_list.append(df2)
subway_list.append(df3)
subway_list.append(df4)
len(subway_list)
31
for i in range(1,8):
    subway_list.append(pd.read_csv(f"내 저장 경로\\CARD_SUBWAY_MONTH_20210{i}.csv",encoding='cp949'))
subway_list_2 = []
for i in subway_list:
    subway_list_2.append(i.loc[:,['사용일자','승차총승객수']])
len(subway_list_2)
31
res=pd.DataFrame()
for i in subway_list_2:
    print(i.groupby("사용일자").sum())
    res = pd.concat([res,i.groupby("사용일자").sum()])
             승차총승객수
사용일자               
20190101  3419948.0
20190102  7621668.0
20190103  7873692.0
20190104  8270345.0
20190105  6062294.0
20190106  4408718.0
20190107  7824428.0
20190108  7913162.0
20190109  7815405.0
20190110  7977300.0
20190111  8351322.0
20190112  6282189.0
20190113  4501053.0
20190114  7680436.0

... 후략...
res.to_csv("지하철201901_202107.csv")

다음 포스트에 이어서...

profile
코더가 아닌 프로그래머를 지향하는 개발자

0개의 댓글

관련 채용 정보