import chunk
from typing import final
from venv import create
import pandas as pd
import multiprocessing
import base64
import chunk
import os
import sys
import pymysql
import requests
from sqlalchemy import create_engine
import requests
import warnings
import fileinput
import glob
from multiprocessing import Process
import bisect
#-*- coding: cp949 -*-
# Rows per chunk; passed as `chunksize` to pd.read_csv by the conversion loops.
Chunksize = 10000

# Suppress FutureWarning messages for the whole run.
warnings.simplefilter('ignore', FutureWarning)

# Kakao REST API keys, sent in the "Authorization: KakaoAK <key>" header.
# `index` is the position of the key currently in use; the main loops advance
# it when a lookup signals that the current key can no longer be used.
kakao_key = [""]
index = 0
def get_hangjeongdong(addr):
    """Fetch the administrative-dong (행정동) code for an address from Kakao.

    Sends ``addr`` as the ``query`` parameter to the Kakao local
    address-search endpoint, authenticating with ``kakao_key[index]``.

    Args:
        addr: Address string to search for.

    Returns:
        * ``100`` when the response has no ``'documents'`` key. Callers treat
          this as "daily API quota exceeded for the current key".
        * ``False`` when no document is returned, or when the first document
          carries no usable ``h_code``.
        * Otherwise the ``h_code`` of the first document.
    """
    # NOTE(review): no timeout is passed, so a stalled connection blocks the
    # run indefinitely -- consider adding one together with retry handling.
    url = "https://dapi.kakao.com/v2/local/search/address.json"
    headers = {"Authorization": "KakaoAK " + kakao_key[index]}
    response = requests.get(url, params={'query': addr}, headers=headers).json()

    if 'documents' not in response:  # daily API request quota exceeded
        return 100

    documents = response['documents']
    if not documents:  # nothing matched the address
        return False

    # The first match may come without a lot-number `address` object or
    # without an `h_code`; treat both as "no code" instead of raising.
    address = documents[0].get('address') or {}
    h_code = address.get('h_code')
    return h_code if h_code else False
# Lookup table loaded from map1.xlsx with two string columns:
# '행정동코드' (administrative-dong code) and '법정동코드' (legal-dong code).
map1 = pd.read_excel(
    "C:/Users/TSM/Downloads/map1.xlsx",
    names=['행정동코드', '법정동코드'],
).astype({'행정동코드': 'str', '법정동코드': 'str'})

# Every legal-dong code present in map1, for fast membership tests.
map1_set = set(map1['법정동코드'])

# Progress counters: x = rows read, y = addresses sent to the API.
x = 0
y = 0
# Legal-dong code -> administrative-dong code, keeping the first map1 row per
# code. Built once so each data row costs a dict lookup instead of a full
# scan of map1.
map1_lookup = {}
for _bjd_code, _hjd_code in zip(map1['법정동코드'], map1['행정동코드']):
    map1_lookup.setdefault(_bjd_code, _hjd_code)

# Address cache CSV: read at the start of every chunk, appended to whenever
# the API resolves a new address.
MAP2_PATH = "C:/Users/TSM/Downloads/map2_revised.csv"


def _load_map2_lookup():
    """Return {지번주소: 행정동코드} from the cache CSV, first row per address wins."""
    map2 = pd.read_csv(
        MAP2_PATH,
        names=['지번주소', '행정동코드', '법정동코드'],
        dtype={'지번주소': 'str', '행정동코드': 'str', '법정동코드': 'str'},
        encoding='cp949',
        low_memory=False,
    )
    lookup = {}
    for addr_key, hjd_code in zip(map2['지번주소'], map2['행정동코드']):
        lookup.setdefault(addr_key, hjd_code)
    return lookup


def _request_hjd(addr):
    """Query the API for ``addr``, moving to the next key on every quota signal.

    Returns the code string, ``False`` when the address cannot be resolved,
    or ``None`` once every key in ``kakao_key`` has been used up.
    """
    global index
    while index < len(kakao_key):
        result = get_hangjeongdong(addr)
        if result != 100:
            return result
        # 100 means the current key is out of quota: switch keys and retry.
        index += 1
    return None


def _convert_file(src_path, value_col, out_path):
    """Rewrite the 'bjd' column of ``src_path`` and append the rows to ``out_path``.

    For each row the code is taken from map1 (by legal-dong code), else from
    the address cache (by address), else from the Kakao API. Rows whose code
    cannot be determined are dropped. Codes obtained from the API are
    appended to the cache CSV. Processing stops early when all API keys are
    exhausted; rows not yet converted at that point are not written.

    Args:
        src_path: cp949-encoded CSV with columns date, addr, bjd, <value_col>.
        value_col: Name given to the fourth (integer) column.
        out_path: CSV file the converted rows are appended to (no header).

    Returns:
        Tuple ``(rows_read, requests_sent)``.
    """
    rows_read = 0
    requests_sent = 0
    reader = pd.read_csv(
        src_path,
        chunksize=Chunksize,
        sep=',',
        encoding='cp949',
        names=['date', 'addr', 'bjd', value_col],
        header=0,
        low_memory=False,
    )
    for chunk in reader:
        chunk = chunk.astype({'date': 'str', 'addr': 'str', 'bjd': 'str', value_col: 'int'})
        # Reloaded per chunk so rows appended to the cache file since the
        # previous chunk are picked up.
        map2_lookup = _load_map2_lookup()

        # Snapshot the inputs so the frame can be updated while we iterate.
        labels = list(chunk.index)
        addrs = chunk['addr'].tolist()
        bjds = chunk['bjd'].tolist()

        drop_labels = []  # rows to remove once the loop is done
        handled = 0       # rows of this chunk that were fully processed
        keys_exhausted = False

        for label, addr, bjd in zip(labels, addrs, bjds):
            if bjd in map1_lookup:  # exists in map1
                chunk.at[label, 'bjd'] = map1_lookup[bjd]
            elif addr in map2_lookup:  # exists in map2
                cached = map2_lookup[addr]
                # A missing code is read back from the CSV as NaN, not
                # None/False, so accept only real strings.
                if isinstance(cached, str):
                    chunk.at[label, 'bjd'] = cached
                else:
                    drop_labels.append(label)
            else:  # unknown address: ask the API
                requests_sent += 1
                hjd = _request_hjd(addr)
                if hjd is None:
                    keys_exhausted = True
                    break
                if hjd is False:
                    drop_labels.append(label)
                else:
                    chunk.at[label, 'bjd'] = hjd
                    # Remember it right away so repeats of this address in
                    # the same chunk do not hit the API again.
                    map2_lookup[addr] = hjd
                    new_row = pd.DataFrame(
                        {'지번주소': [addr], '행정동코드': [hjd], '법정동코드': [bjd]}
                    ).astype({'지번주소': 'str', '행정동코드': 'str', '법정동코드': 'str'})
                    new_row.to_csv(MAP2_PATH, mode='a', index=False, encoding='cp949', header=False)
            handled += 1

        if keys_exhausted:
            # Rows from the stopping point on still hold unconverted codes;
            # keep them out of the output.
            drop_labels.extend(labels[handled:])
        chunk.drop(drop_labels, axis=0, inplace=True)

        rows_read += handled
        print("읽음: ", rows_read, " 요청 보냄: ", requests_sent)
        chunk.to_csv(out_path, mode='a', index=False, header=None)

        if keys_exhausted:
            print("All Kakao API keys are exhausted; stopped", src_path,
                  "after", rows_read, "rows.")
            break

    print("=====================읽음: ", rows_read, " 요청 보냄: ", requests_sent, "=====================")
    return rows_read, requests_sent


x, y = _convert_file("C:/Users/TSM/Downloads/gas_final/gas_final.txt", 'gas', "h_gas.csv")
x, y = _convert_file(
    "C:/Users/TSM/Downloads/electricity_final/electricity_final.txt",
    'electricity',
    "h_electricity.csv",
)
# NOTE: a full run of this script takes about 5 days.