에너지 소비량 (gas, electricity) parsing

ewillwin·2022년 7월 12일
0

TSMtech Record

목록 보기
10/39
import chunk
from typing import final
from venv import create
import pandas as pd
import multiprocessing
import base64
import chunk
import os
import sys
import pymysql
import requests
from sqlalchemy import create_engine

Chunksize = 10**7

df = pd.DataFrame(columns=['date', 'addr', 'bjd', 'electricity'])
df = df.astype({'date':'str', 'addr':'str', 'bjd':'str', 'electricity':'int'})
for chunk in pd.read_csv("C:/Users/TSM/Downloads/electricity/electricity.txt", chunksize=Chunksize, sep ='|', encoding='CP949', usecols=[0, 1, 3, 4, 13],names=['date', 'addr', 'bjd', 'b2', 'electricity'], low_memory=False):
    chunk = chunk.drop(chunk[chunk.date >= 202201].index)
    chunk = chunk.astype({'date':'str', 'addr':'str', 'bjd':'str', 'b2':'str', 'electricity':'int'})
    chunk['bjd'] = chunk['bjd'] + chunk['b2']
    chunk = chunk.drop('b2', axis=1)
    print(chunk)
    df = pd.concat([df, chunk])
df.to_csv("electricity_parsed.txt", mode='w', index=False, header=None, encoding="utf-8-sig")

'''====================================================================================================='''

df2 = pd.DataFrame(columns=['date', 'addr', 'bjd', 'electricity'])
df2 = df2.astype({'date':'str', 'addr':'str', 'bjd':'str', 'electricity':'int'})
for chunk in pd.read_csv("C:/Users/TSM/Downloads/2022electricity/2022electricity.txt", chunksize=Chunksize, sep ='|', encoding='CP949', usecols=[0, 2, 3, 4, 5, 6, 8, 9, 16], names=['date', 'bjd', 'b2', 'addr0', 'addr1', 'addr2', 'addr3', 'addr4', 'electricity'], low_memory=False):
    chunk = chunk.astype({'date':'str', 'bjd':'str', 'b2':'str', 'addr0':'str', 'addr1':'str', 'addr2':'str', 'addr3':'str', 'addr4':'str', 'electricity':'int'})
    chunk['bjd'] = chunk['bjd'] + chunk['b2']
    chunk = chunk.drop('b2', axis=1)
    chunk['addr'] = chunk['addr0'] + ' ' + chunk['addr1'] + ' ' + chunk['addr2'] + ' ' + chunk['addr3'] + ' ' + chunk['addr4']
    chunk = chunk.drop('addr0', axis=1)
    chunk = chunk.drop('addr1', axis=1)
    chunk = chunk.drop('addr2', axis=1)
    chunk = chunk.drop('addr3', axis=1)
    chunk = chunk.drop('addr4', axis=1)
    print(chunk)
    df2 = pd.concat([df2, chunk])
df2.to_csv("electricity_parsed.txt", mode='a', index=False, header=None, encoding="utf-8-sig")

pandas.errors.ParserError: Error tokenizing data. C error: out of memory
-> 전기에너지 소비량 데이터 chunksize 줄여야할듯

numpy.core._exceptions._ArrayMemoryError: Unable to allocate 610. MiB for an array with shape (1, 160000000) and data type int32
-> df없이 chunk 단위로 바로 to_csv

import chunk
from typing import final
from venv import create
import pandas as pd
import multiprocessing
import base64
import chunk
import os
import sys
import pymysql
import requests
from sqlalchemy import create_engine

Chunksize = 5000000

i = 1
#df = pd.DataFrame(columns=['date', 'addr', 'bjd', 'electricity'])
#df = df.astype({'date':'str', 'addr':'str', 'bjd':'str', 'electricity':'int'})
for chunk in pd.read_csv("C:/Users/TSM/Downloads/electricity/electricity.txt", chunksize=Chunksize, sep ='|', encoding='CP949', usecols=[0, 1, 3, 4, 13],names=['date', 'addr', 'bjd', 'b2', 'electricity'], low_memory=False):
    chunk = chunk.drop(chunk[chunk.date >= 202201].index)
    chunk = chunk.astype({'date':'str', 'addr':'str', 'bjd':'str', 'b2':'str', 'electricity':'int'})
    chunk['bjd'] = chunk['bjd'] + chunk['b2']
    chunk = chunk.drop('b2', axis=1)
    #print(chunk)
    print(Chunksize * i)
    i += 1
    chunk.to_csv("electricity_parsed.txt", mode='a', index=False, header=None, encoding="utf-8-sig")
    #df = pd.concat([df, chunk])

#df.to_csv("electricity_parsed.txt", mode='w', index=False, header=None, encoding="utf-8-sig")

'''====================================================================================================='''

df2 = pd.DataFrame(columns=['date', 'addr', 'bjd', 'electricity'])
df2 = df2.astype({'date':'str', 'addr':'str', 'bjd':'str', 'electricity':'int'})
for chunk in pd.read_csv("C:/Users/TSM/Downloads/2022electricity/2022electricity.txt", chunksize=Chunksize, sep ='|', encoding='CP949', usecols=[0, 2, 3, 4, 5, 6, 8, 9, 16], names=['date', 'bjd', 'b2', 'addr0', 'addr1', 'addr2', 'addr3', 'addr4', 'electricity'], low_memory=False):
    chunk = chunk.astype({'date':'str', 'bjd':'str', 'b2':'str', 'addr0':'str', 'addr1':'str', 'addr2':'str', 'addr3':'str', 'addr4':'str', 'electricity':'int'})
    chunk['bjd'] = chunk['bjd'] + chunk['b2']
    chunk = chunk.drop('b2', axis=1)
    chunk['addr'] = chunk['addr0'] + ' ' + chunk['addr1'] + ' ' + chunk['addr2'] + ' ' + chunk['addr3'] + ' ' + chunk['addr4']
    chunk = chunk.drop('addr0', axis=1)
    chunk = chunk.drop('addr1', axis=1)
    chunk = chunk.drop('addr2', axis=1)
    chunk = chunk.drop('addr3', axis=1)
    chunk = chunk.drop('addr4', axis=1)
    print(chunk)
    df2 = pd.concat([df2, chunk])
df2.to_csv("electricity_parsed.txt", mode='a', index=False, header=None, encoding="utf-8-sig")
profile
💼 Software Engineer @ LG Electronics | 🎓 SungKyunKwan Univ. CSE

0개의 댓글