gas_trunc, electricity_trunc group by

ewillwin·2022년 7월 28일
0

TSMtech Record

목록 보기
15/39
import chunk
from typing import final
from venv import create
import pandas as pd
import multiprocessing
import base64
import chunk
import os
import sys
import pymysql
import requests
from sqlalchemy import create_engine
import requests
import warnings
import fileinput
import glob
from multiprocessing import Process
import bisect
#-*- coding: cp949 -*-

Chunksize = 1000000
warnings.simplefilter(action='ignore', category=FutureWarning)

df = pd.DataFrame(columns=['date', 'hjd', 'gas', 'count'])
df = df.astype({'date':'str','hjd':'str','gas':'int','count':'int'})
df = df.sort_values(by=['date','hjd'])

x = 0; y = 0
for chunk in pd.read_csv("C:/Users/TSM/gas_trunc.csv",chunksize=Chunksize,sep =',',encoding='utf-8',names=['date','addr','hjd','gas'],header=0,low_memory=False):
    chunk = chunk.astype({'date':'str','addr':'str','hjd':'str','gas':'int'})
    chunk = chunk.groupby(['date','hjd'],as_index=False)['gas'].agg({'gas':'sum','count':'count'})
    chunk = chunk.astype({'date':'str','hjd':'str','gas':'int','count':'int'})
    df = pd.concat([df,chunk])
    x += Chunksize
    print("gas -> 읽음: ", x)

while (df.duplicated(subset=['date','hjd']).sum() > 0):
    print(df.duplicated(subset=['date','hjd']).sum())
    df = df.groupby(['date','hjd'],as_index=False).sum()
    df = df.astype({'date':'str','hjd':'str','gas':'int','count':'int'}) 
    print(df.duplicated(subset=['date','hjd']).sum())

df.to_csv("gas_groupby.csv", mode='a', index=False, header=None)

##########################################################################################################################################3
df = pd.DataFrame(columns=['date', 'hjd', 'electricity', 'count'])
df = df.astype({'date':'str','hjd':'str','electricity':'int','count':'int'})
df = df.sort_values(by=['date','hjd'])

x = 0; y = 0
for chunk in pd.read_csv("C:/Users/TSM/electricity_trunc.csv",chunksize=Chunksize,sep =',',encoding='utf-8',names=['date','addr','hjd','electricity'],header=0,low_memory=False):
    chunk = chunk.astype({'date':'str','addr':'str','hjd':'str','electricity':'int'})
    chunk = chunk.groupby(['date','hjd'],as_index=False)['electricity'].agg({'electricity':'sum','count':'count'})
    chunk = chunk.astype({'date':'str','hjd':'str','electricity':'int','count':'int'})
    df = pd.concat([df,chunk])
    x += Chunksize
    print("electricity -> 읽음: ", x)

while (df.duplicated(subset=['date','hjd']).sum() > 0):
    print(df.duplicated(subset=['date','hjd']).sum())
    df = df.groupby(['date','hjd'],as_index=False).sum()
    df = df.astype({'date':'str','hjd':'str','electricity':'int','count':'int'}) 
    print(df.duplicated(subset=['date','hjd']).sum())

df.to_csv("electricity_groupby.csv", mode='a', index=False, header=None)
profile
💼 Software Engineer @ LG Electronics | 🎓 SungKyunKwan Univ. CSE

0개의 댓글