import chunk
from typing import final
from venv import create
import pandas as pd
import multiprocessing
import base64
import chunk
import os
import sys
import pymysql
import requests
from sqlalchemy import create_engine
Chunksize = 10**7
df = pd.DataFrame(columns=['date', 'addr', 'bjd', 'electricity'])
df = df.astype({'date':'str', 'addr':'str', 'bjd':'str', 'electricity':'int'})
for chunk in pd.read_csv("C:/Users/TSM/Downloads/electricity/electricity.txt", chunksize=Chunksize, sep ='|', encoding='CP949', usecols=[0, 1, 3, 4, 13],names=['date', 'addr', 'bjd', 'b2', 'electricity'], low_memory=False):
chunk = chunk.drop(chunk[chunk.date >= 202201].index)
chunk = chunk.astype({'date':'str', 'addr':'str', 'bjd':'str', 'b2':'str', 'electricity':'int'})
chunk['bjd'] = chunk['bjd'] + chunk['b2']
chunk = chunk.drop('b2', axis=1)
print(chunk)
df = pd.concat([df, chunk])
df.to_csv("electricity_parsed.txt", mode='w', index=False, header=None, encoding="utf-8-sig")
'''====================================================================================================='''
df2 = pd.DataFrame(columns=['date', 'addr', 'bjd', 'electricity'])
df2 = df2.astype({'date':'str', 'addr':'str', 'bjd':'str', 'electricity':'int'})
for chunk in pd.read_csv("C:/Users/TSM/Downloads/2022electricity/2022electricity.txt", chunksize=Chunksize, sep ='|', encoding='CP949', usecols=[0, 2, 3, 4, 5, 6, 8, 9, 16], names=['date', 'bjd', 'b2', 'addr0', 'addr1', 'addr2', 'addr3', 'addr4', 'electricity'], low_memory=False):
chunk = chunk.astype({'date':'str', 'bjd':'str', 'b2':'str', 'addr0':'str', 'addr1':'str', 'addr2':'str', 'addr3':'str', 'addr4':'str', 'electricity':'int'})
chunk['bjd'] = chunk['bjd'] + chunk['b2']
chunk = chunk.drop('b2', axis=1)
chunk['addr'] = chunk['addr0'] + ' ' + chunk['addr1'] + ' ' + chunk['addr2'] + ' ' + chunk['addr3'] + ' ' + chunk['addr4']
chunk = chunk.drop('addr0', axis=1)
chunk = chunk.drop('addr1', axis=1)
chunk = chunk.drop('addr2', axis=1)
chunk = chunk.drop('addr3', axis=1)
chunk = chunk.drop('addr4', axis=1)
print(chunk)
df2 = pd.concat([df2, chunk])
df2.to_csv("electricity_parsed.txt", mode='a', index=False, header=None, encoding="utf-8-sig")
pandas.errors.ParserError: Error tokenizing data. C error: out of memory
-> 전기에너지 소비량 데이터 chunksize 줄여야할듯
numpy.core._exceptions._ArrayMemoryError: Unable to allocate 610. MiB for an array with shape (1, 160000000) and data type int32
-> df없이 chunk 단위로 바로 to_csv
import chunk
from typing import final
from venv import create
import pandas as pd
import multiprocessing
import base64
import chunk
import os
import sys
import pymysql
import requests
from sqlalchemy import create_engine
Chunksize = 5000000
i = 1
#df = pd.DataFrame(columns=['date', 'addr', 'bjd', 'electricity'])
#df = df.astype({'date':'str', 'addr':'str', 'bjd':'str', 'electricity':'int'})
for chunk in pd.read_csv("C:/Users/TSM/Downloads/electricity/electricity.txt", chunksize=Chunksize, sep ='|', encoding='CP949', usecols=[0, 1, 3, 4, 13],names=['date', 'addr', 'bjd', 'b2', 'electricity'], low_memory=False):
chunk = chunk.drop(chunk[chunk.date >= 202201].index)
chunk = chunk.astype({'date':'str', 'addr':'str', 'bjd':'str', 'b2':'str', 'electricity':'int'})
chunk['bjd'] = chunk['bjd'] + chunk['b2']
chunk = chunk.drop('b2', axis=1)
#print(chunk)
print(Chunksize * i)
i += 1
chunk.to_csv("electricity_parsed.txt", mode='a', index=False, header=None, encoding="utf-8-sig")
#df = pd.concat([df, chunk])
#df.to_csv("electricity_parsed.txt", mode='w', index=False, header=None, encoding="utf-8-sig")
'''====================================================================================================='''
df2 = pd.DataFrame(columns=['date', 'addr', 'bjd', 'electricity'])
df2 = df2.astype({'date':'str', 'addr':'str', 'bjd':'str', 'electricity':'int'})
for chunk in pd.read_csv("C:/Users/TSM/Downloads/2022electricity/2022electricity.txt", chunksize=Chunksize, sep ='|', encoding='CP949', usecols=[0, 2, 3, 4, 5, 6, 8, 9, 16], names=['date', 'bjd', 'b2', 'addr0', 'addr1', 'addr2', 'addr3', 'addr4', 'electricity'], low_memory=False):
chunk = chunk.astype({'date':'str', 'bjd':'str', 'b2':'str', 'addr0':'str', 'addr1':'str', 'addr2':'str', 'addr3':'str', 'addr4':'str', 'electricity':'int'})
chunk['bjd'] = chunk['bjd'] + chunk['b2']
chunk = chunk.drop('b2', axis=1)
chunk['addr'] = chunk['addr0'] + ' ' + chunk['addr1'] + ' ' + chunk['addr2'] + ' ' + chunk['addr3'] + ' ' + chunk['addr4']
chunk = chunk.drop('addr0', axis=1)
chunk = chunk.drop('addr1', axis=1)
chunk = chunk.drop('addr2', axis=1)
chunk = chunk.drop('addr3', axis=1)
chunk = chunk.drop('addr4', axis=1)
print(chunk)
df2 = pd.concat([df2, chunk])
df2.to_csv("electricity_parsed.txt", mode='a', index=False, header=None, encoding="utf-8-sig")