from urllib.request import urlopen
import requests as rq
from bs4 import BeautifulSoup as bs
from typing import final
from venv import create
import pandas as pd
from multiprocessing import Process
from typing import final
from venv import create
import requests
from decimal import Decimal
import re
import xml.etree.ElementTree as ET
import time
import json
import datetime
import sys
import googlemaps
from urllib.request import urlopen
import requests as rq
from bs4 import BeautifulSoup as bs
import time
from datetime import datetime
import zipfile
import os
import boto3
from typing import final
from venv import create
import pandas as pd
import chunk
import warnings
from multiprocessing import Process
import bisect
import pandas as pd
from typing import final
from venv import create
import requests
from decimal import Decimal
import json
import shutil
def parsing_pre(path_src, path_dest):
    """Convert a pipe-delimited energy file (pre-2022 layout) to CSV.

    Reads *path_src* line by line and writes one CSV row per input line to
    *path_dest* with the columns: date (field 0), address (field 1),
    legal-dong code (fields 3 and 4 concatenated), and energy usage
    (field 13).  Both files use the cp949 encoding.
    """
    # 'with' guarantees both handles are closed even if a malformed line
    # raises an IndexError mid-loop (the original leaked them in that case).
    with open(path_src, "r", encoding="cp949") as src, \
         open(path_dest, "w", encoding="cp949") as dest:
        for line in src:
            f = line.split('|')
            # fields 3 and 4 are deliberately concatenated without a comma:
            # together they form the full legal-dong (bjd) code.
            dest.write(f[0] + ',' + f[1] + ',' + f[3] + f[4] + ',' + f[13] + '\n')
def parsing_post(path_src, path_dest):
    """Convert a pipe-delimited energy file (2022+ layout) to CSV.

    Appends one CSV row per input line to *path_dest* with the columns:
    date (field 0), lot-number address built from fields 4-9, legal-dong
    code (fields 2 and 3 concatenated), and energy usage (field 16, which
    carries the original line terminator, so no '\\n' is added here).
    Both files use the cp949 encoding.
    """
    # 'with' guarantees both handles are closed even if a malformed line
    # raises mid-loop (the original leaked them in that case).
    with open(path_src, "r", encoding="cp949") as src, \
         open(path_dest, "a", encoding="cp949") as dest:
        for line in src:
            f = line.split('|')
            # fields 8/9 are the zero-padded main/sub lot numbers; int()
            # strips the padding.  A sub lot of 0 is omitted entirely.
            lot = str(int(f[8]))
            if str(int(f[9])) != '0':
                lot += '-' + str(int(f[9]))
            # field 7 != '0' marks a mountain (산) lot.
            prefix = '' if f[7] == '0' else '산 '
            addr = f[4] + ' ' + f[5] + ' ' + f[6] + ' ' + prefix + lot + '번지'
            # fields 2 and 3 together form the legal-dong (bjd) code.
            dest.write(f[0] + ',' + addr + ',' + f[2] + f[3] + ',' + f[16])
##### txt file parsing #####
# ''' ~ 2021 '''
# year = 2011
# for i in range(11):
# path_src = "C:/Users/TSM/OneDrive/바탕 화면/alladdr/gas" + str(year) + ".txt"
# path_dest = "C:/Users/TSM/OneDrive/바탕 화면/energy/gas_" + str(year) + ".txt"
# parsing_pre(path_src, path_dest)
# year += 1
# ''' 2022 ~ '''
# year = 202201
# for i in range(9):
# path_src = "C:/Users/TSM/OneDrive/바탕 화면/alladdr/gas" + str(year) + ".txt"
# path_dest = "C:/Users/TSM/OneDrive/바탕 화면/energy/gas_" + "2022" + ".txt"
# parsing_post(path_src, path_dest)
# year += 1
# --- load the code-mapping tables (map1, map2) ---
# map1: legal-dong (법정동) code -> administrative-dong (행정동) code.
map1 = pd.read_excel("C:/Users/TSM/OneDrive/바탕 화면/alladdr/map1.xlsx", names=['행정동코드', '법정동코드'])
map1 = map1.astype({'행정동코드': 'str', '법정동코드': 'str'})
map1_set = set(map1['법정동코드'])

# map2: lot-number address -> administrative-dong code (fallback lookup
# for records whose legal-dong code is missing from map1).
map2 = pd.read_csv(
    "C:/Users/TSM/OneDrive/바탕 화면/alladdr/map2.csv",
    names=['지번주소', '법정동코드', '행정동코드'],
    dtype={'지번주소': 'str', '법정동코드': 'str', '행정동코드': 'str'},
    encoding='cp949',
)
map2_set = set(map2['지번주소'])
map2_addr = map2['지번주소'].tolist()
map2_hjd = map2['행정동코드'].tolist()
# idxarr[i] is the original row index of the i-th smallest address, so a
# bisect position into the sorted address list maps back into map2_hjd.
idxarr = sorted(range(len(map2_addr)), key=map2_addr.__getitem__)
map2_addr.sort()
###### bjd -> hjd (elcty) #####
# Convert each electricity record's legal-dong (bjd) code to the matching
# administrative-dong (hjd) code, falling back to an address lookup.
#
# Pre-build the bjd -> hjd dictionary ONCE: the original per-row
# map1.loc[map1['법정동코드']==code] rescanned the whole mapping table for
# every record.  Keep the FIRST occurrence to match the old `.values[0]`
# semantics should a legal-dong code appear more than once.
bjd_to_hjd = {}
for _bjd, _hjd in zip(map1['법정동코드'], map1['행정동코드']):
    if _bjd not in bjd_to_hjd:
        bjd_to_hjd[_bjd] = str(_hjd)

Chunksize = 10000
year = 2011
for _ in range(12):  # years 2011 .. 2022
    for chunk in pd.read_csv("C:/Users/TSM/OneDrive/바탕 화면/energy/elcty_" + str(year) + ".txt",
                             chunksize=Chunksize, sep=',', encoding='cp949',
                             names=['date', 'addr', 'bjd', 'energy'], low_memory=False):
        chunk = chunk.astype({'date': 'str', 'addr': 'str', 'bjd': 'str', 'energy': 'int'})
        for idx in chunk.index:
            지번주소 = chunk.at[idx, 'addr']
            법정동코드 = chunk.at[idx, 'bjd']
            if 법정동코드 in bjd_to_hjd:
                # direct code-to-code mapping
                chunk.at[idx, 'bjd'] = bjd_to_hjd[법정동코드]
            elif 지번주소 in map2_set:
                # fall back: binary-search the sorted address list and map
                # the sorted position back to the original row via idxarr
                pos = bisect.bisect_left(map2_addr, 지번주소)
                chunk.at[idx, 'bjd'] = map2_hjd[idxarr[pos]]
            else:
                chunk.at[idx, 'bjd'] = "False"  # unmappable; filtered out later
        chunk.to_csv("C:/Users/TSM/OneDrive/바탕 화면/energy/h_elcty.csv", mode='a', index=False, header=None)
    print(str(year) + "elcty 행정동변환 완료")
    year += 1
###### bjd -> hjd (gas) #####
# Convert each gas record's legal-dong (bjd) code to the matching
# administrative-dong (hjd) code, falling back to an address lookup.
#
# Pre-build the bjd -> hjd dictionary ONCE: the original per-row
# map1.loc[map1['법정동코드']==code] rescanned the whole mapping table for
# every record.  Keep the FIRST occurrence to match the old `.values[0]`
# semantics should a legal-dong code appear more than once.
bjd_to_hjd = {}
for _bjd, _hjd in zip(map1['법정동코드'], map1['행정동코드']):
    if _bjd not in bjd_to_hjd:
        bjd_to_hjd[_bjd] = str(_hjd)

Chunksize = 10000
year = 2011
for _ in range(12):  # years 2011 .. 2022
    for chunk in pd.read_csv("C:/Users/TSM/OneDrive/바탕 화면/energy/gas_" + str(year) + ".txt",
                             chunksize=Chunksize, sep=',', encoding='cp949',
                             names=['date', 'addr', 'bjd', 'energy'], low_memory=False):
        chunk = chunk.astype({'date': 'str', 'addr': 'str', 'bjd': 'str', 'energy': 'int'})
        for idx in chunk.index:
            지번주소 = chunk.at[idx, 'addr']
            법정동코드 = chunk.at[idx, 'bjd']
            if 법정동코드 in bjd_to_hjd:
                # direct code-to-code mapping
                chunk.at[idx, 'bjd'] = bjd_to_hjd[법정동코드]
            elif 지번주소 in map2_set:
                # fall back: binary-search the sorted address list and map
                # the sorted position back to the original row via idxarr
                pos = bisect.bisect_left(map2_addr, 지번주소)
                chunk.at[idx, 'bjd'] = map2_hjd[idxarr[pos]]
            else:
                chunk.at[idx, 'bjd'] = "False"  # unmappable; filtered out later
        chunk.to_csv("C:/Users/TSM/OneDrive/바탕 화면/energy/h_gas.csv", mode='a', index=False, header=None)
    print(str(year) + "gas 행정동변환 완료")
    year += 1
##### truncate & group by sum & join #####
print("start truncate")
Chunksize = 1000000


def _truncate(src_path, dest_path, value_col):
    """Stream *src_path* in chunks, drop blank-address rows, rows whose
    dong code could not be mapped ('False'), and exact duplicates (within
    each chunk only), then append the survivors to *dest_path*."""
    cols = ['date', 'addr', 'hjd', value_col]
    for part in pd.read_csv(src_path, chunksize=Chunksize, sep=',', encoding='utf-8',
                            names=cols, header=0, low_memory=False):
        part = part.astype({'date': 'str', 'addr': 'str', 'hjd': 'str', value_col: 'int'})
        part = part[part['addr'] != ' ']      # blank address
        part = part[part['hjd'] != 'False']   # address that could not be mapped
        part = part.drop_duplicates(keep='first')
        part.to_csv(dest_path, mode='a', index=False, header=None)


_truncate("C:/Users/TSM/OneDrive/바탕 화면/energy/h_gas.csv",
          "C:/Users/TSM/OneDrive/바탕 화면/energy/gas_trunc.csv", 'gas')
_truncate("C:/Users/TSM/OneDrive/바탕 화면/energy/h_elcty.csv",
          "C:/Users/TSM/OneDrive/바탕 화면/energy/electricity_trunc.csv", 'electricity')
print("complete truncate")
print("start groupby")
# Aggregate gas usage per (date, administrative-dong) pair.  Each chunk is
# group-summed on its own; partial aggregates that share a key are then
# folded together until every (date, hjd) pair is unique.
parts = []
for chunk in pd.read_csv("C:/Users/TSM/OneDrive/바탕 화면/energy/gas_trunc.csv", chunksize=Chunksize, sep=',', encoding='utf-8', names=['date', 'addr', 'hjd', 'gas'], header=0, low_memory=False):
    chunk = chunk.astype({'date': 'str', 'addr': 'str', 'hjd': 'str', 'gas': 'int'})
    # Named aggregation: the dict-renamer form ['gas'].agg({'gas':'sum',
    # 'count':'count'}) was deprecated in pandas 0.20 and removed in 1.0
    # (raises SpecificationError).
    chunk = chunk.groupby(['date', 'hjd'], as_index=False).agg(gas=('gas', 'sum'), count=('gas', 'count'))
    parts.append(chunk)
# Concatenate once instead of repeatedly inside the loop (same result,
# avoids quadratic re-copying and the empty-frame concat deprecation).
df = pd.concat(parts) if parts else pd.DataFrame(columns=['date', 'hjd', 'gas', 'count'])
while df.duplicated(subset=['date', 'hjd']).sum() > 0:
    print(df.duplicated(subset=['date', 'hjd']).sum())
    df = df.groupby(['date', 'hjd'], as_index=False).sum()
print(df.duplicated(subset=['date', 'hjd']).sum())
df = df.astype({'date': 'str', 'hjd': 'str', 'gas': 'int', 'count': 'int'})
df.to_csv("C:/Users/TSM/OneDrive/바탕 화면/energy/gas_groupby.csv", mode='a', index=False, header=None)
# Aggregate electricity usage per (date, administrative-dong) pair, same
# scheme as the gas aggregation above.
parts = []
for chunk in pd.read_csv("C:/Users/TSM/OneDrive/바탕 화면/energy/electricity_trunc.csv", chunksize=Chunksize, sep=',', encoding='utf-8', names=['date', 'addr', 'hjd', 'electricity'], header=0, low_memory=False):
    chunk = chunk.astype({'date': 'str', 'addr': 'str', 'hjd': 'str', 'electricity': 'int'})
    # Named aggregation: the dict-renamer form ['electricity'].agg({...})
    # was deprecated in pandas 0.20 and removed in 1.0 (SpecificationError).
    chunk = chunk.groupby(['date', 'hjd'], as_index=False).agg(electricity=('electricity', 'sum'), count=('electricity', 'count'))
    parts.append(chunk)
# Concatenate once instead of repeatedly inside the loop (same result,
# avoids quadratic re-copying and the empty-frame concat deprecation).
df = pd.concat(parts) if parts else pd.DataFrame(columns=['date', 'hjd', 'electricity', 'count'])
while df.duplicated(subset=['date', 'hjd']).sum() > 0:
    print(df.duplicated(subset=['date', 'hjd']).sum())
    df = df.groupby(['date', 'hjd'], as_index=False).sum()
print(df.duplicated(subset=['date', 'hjd']).sum())
df = df.astype({'date': 'str', 'hjd': 'str', 'electricity': 'int', 'count': 'int'})
df.to_csv("C:/Users/TSM/OneDrive/바탕 화면/energy/electricity_groupby.csv", mode='a', index=False, header=None)
print("complete groupby")
print("start join")
# Join the per-dong electricity and gas aggregates into a single table.
# NOTE(review): the groupby output was written without a header, so the
# 'hjd' column is read back here under the name 'addr'.
gas = (
    pd.read_csv("C:/Users/TSM/OneDrive/바탕 화면/energy/gas_groupby.csv",
                sep=',', encoding='utf-8',
                names=['date', 'addr', 'gas', 'count'], header=0, low_memory=False)
    .astype({'date': 'str', 'addr': 'str', 'gas': 'int', 'count': 'int'})
    .sort_values(by=['date', 'addr'])
    .drop(columns=['count'])
)
electricity = (
    pd.read_csv("C:/Users/TSM/OneDrive/바탕 화면/energy/electricity_groupby.csv",
                sep=',', encoding='utf-8',
                names=['date', 'addr', 'electricity', 'count'], header=0, low_memory=False)
    .astype({'date': 'str', 'addr': 'str', 'electricity': 'int', 'count': 'int'})
    .sort_values(by=['date', 'addr'])
)
# Left join keeps every electricity row even when no gas record matches.
df = electricity.merge(gas, how='left', on=['date', 'addr'])
df = df[['date', 'addr', 'electricity', 'gas', 'count']]
# errors='ignore': unmatched gas rows hold NaN and cannot be cast to int.
df = df.astype({'date': 'str', 'addr': 'str', 'electricity': 'int', 'gas': 'int', 'count': 'int'}, errors='ignore')
df.to_csv("C:/Users/TSM/OneDrive/바탕 화면/energy/buildingenergy.csv", mode='a', index=False, header=None)
print("complete join")
# NOTE: these closing remarks were left as bare text, which is a SyntaxError
# in Python — kept here as comments with an English translation.
# Rebuilt map2 so that as few lot-number addresses as possible are lost.
# (유실되는 지번주소를 최대한 줄여 map2를 새로 생성함)
# Using the new map2, build a local dataset that minimizes lost records.
# (새로 생성한 map2를 이용하여 유실 되는 경우를 최대한 줄인 데이터셋을 local에 구축)
# Will have to leave this running over the holidays..
# (명절 동안 돌려놔야지..)