python(20) pandas로 데이터 핸들링

hyukstory 혁스토리·2020년 8월 28일

목록 보기

26/35

import pandas as pd

handling cctv data


# Load the Seoul CCTV table and peek at its first rows.
cctv = pd.read_csv(
    'c:/Users/student/Desktop/python/P_4week/01.CCTV_in_Seoul.csv',
    encoding='UTF-8',
)
cctv.head()
district_col = cctv.columns[0]
district_col

# Rename the first column to '구별' so it can later serve as the merge key.
cctv = cctv.rename(columns={district_col: '구별'})
cctv.head()

# Five rows with the smallest '소계' values.
cctv.sort_values(by='소계', ascending=True).head(5)

# Add a '최근증가율' column: the 2014-2016 counts as a percentage of the
# count in the '2013년도 이전' column.
recent_installs = cctv['2016년'] + cctv['2015년'] + cctv['2014년']
cctv['최근증가율'] = recent_installs / cctv['2013년도 이전'] * 100

# Five rows with the largest '최근증가율' values.
cctv.sort_values(by='최근증가율', ascending=False).head(5)

handling population data

# First pass: read the workbook with default options to inspect its raw layout.
# (Original author's note: if an encoding error occurs, drop the encoding argument.)
pop = pd.read_excel('c:/Users/student/Desktop/python/P_4week/01.population_in_Seoul.xls')
pop.head()

# Second pass: re-read with options.
pop = pd.read_excel(
    'c:/Users/student/Desktop/python/P_4week/01.population_in_Seoul.xls',
    header=2,               # use the third row of the sheet as the header
    usecols='B,D,G,J,N',    # read only spreadsheet columns B, D, G, J and N
)
pop.head()

# Rename the five columns that were read, in order.
new_names = ['구별', '인구수', '한국인', '외국인', '고령자']
pop = pop.rename(columns=dict(zip(pop.columns[:5], new_names)))
pop.head()

# Drop the row labelled 0.
# NOTE(review): presumably a city-wide total row - confirm against the workbook.
pop.drop(index=[0], inplace=True)
pop.head()

# Distinct values in the district column.
pop['구별'].unique()

# Rows whose district name is missing (NaN).
pop[pop['구별'].isnull()]

# Drop the row labelled 26.
# NOTE(review): presumably the NaN row found above - confirm against the workbook.
pop.drop(index=[26], inplace=True)
pop

# Foreigner and senior counts as percentages of each row's population.
total_population = pop['인구수']
pop['외국인비율'] = pop['외국인'] / total_population * 100
pop['고령자비율'] = pop['고령자'] / total_population * 100
pop.head()

CCTV data + 인구 데이터 합치기

# Join the CCTV and population tables on the shared '구별' column.
data_result = pd.merge(cctv, pop, on='구별')
data_result.head()

# Remove the per-year CCTV columns, which are not used further on.
yearly_columns = ['2013년도 이전', '2014년', '2015년', '2016년']
data_result = data_result.drop(columns=yearly_columns)
data_result.head()

# Use '구별' as the index.
data_result = data_result.set_index('구별')
data_result.head()

합친 데이터 시각화

# Correlation analysis
import numpy as np

cctv_total = data_result['소계']   # '소계' holds the CCTV count

np.corrcoef(data_result['고령자비율'], cctv_total)
# Author's reported coefficient: -0.28078554 (weak negative correlation)

np.corrcoef(data_result['외국인비율'], cctv_total)
# Author's reported coefficient: -0.13607433 (negligible)

np.corrcoef(data_result['인구수'], cctv_total)
# Author's reported coefficient: 0.30634228 (weak positive correlation)



# Top five rows by CCTV count, then top five rows by population.
data_result.sort_values(by='소계', ascending=False).head(5)
data_result.sort_values(by='인구수', ascending=False).head(5)




# Switch matplotlib to a font that can render Korean text.
import platform
import matplotlib.pyplot as plt

from matplotlib import font_manager, rc

# Turn off the Unicode minus glyph for axis tick labels.
plt.rcParams['axes.unicode_minus'] = False

system_name = platform.system()
if system_name == 'Windows':
    # Look up the font family name from the Malgun Gothic font file.
    path = "c:/Windows/Fonts/malgun.ttf"
    font_name = font_manager.FontProperties(fname=path).get_name()
    rc('font', family=font_name)
elif system_name == 'Darwin':
    rc('font', family='AppleGothic')
else:
    # Any other platform: leave the font unchanged and just report it.
    print('Unknown system... sorry~~~~')


# Horizontal bar chart of the CCTV count per district, sorted ascending.
import matplotlib.pyplot as plt
plt.figure()

cctv_counts = data_result['소계'].sort_values()
cctv_counts.plot.barh(grid=True, figsize=(10, 10))

plt.show()


# CCTV count as a percentage of population, plotted the same way.
data_result['CCTV비율'] = data_result['소계'] / data_result['인구수'] * 100

cctv_ratio = data_result['CCTV비율'].sort_values()
cctv_ratio.plot.barh(grid=True, figsize=(10, 10))

plt.show()



# Scatter plot: population on the x-axis, CCTV count on the y-axis.
population = data_result['인구수']
cctv_count = data_result['소계']

plt.figure(figsize=(6, 6))
plt.scatter(population, cctv_count, s=50)
plt.xlabel('인구수')
plt.ylabel('CCTV')
plt.grid()
plt.show()




# Overlay a least-squares straight line on the scatter plot.
fp1 = np.polyfit(population, cctv_count, 1)   # degree-1 polynomial coefficients
fp1

f1 = np.poly1d(fp1)                     # callable giving y for a given x
fx = np.linspace(100000, 700000, 100)   # x values at which the line is drawn

plt.figure(figsize=(10, 10))
plt.scatter(population, cctv_count, s=50)
plt.plot(fx, f1(fx), linestyle='dashed', linewidth=3, color='g')
plt.xlabel('인구수')
plt.ylabel('CCTV')
plt.grid()
plt.show()



# A more persuasive chart:
#   1. label the points that lie farthest from the fitted line with their
#      district name;
#   2. colour every point by its distance from the fitted line.

# Least-squares straight line of CCTV count against population.
fp1 = np.polyfit(data_result['인구수'], data_result['소계'], 1)

f1 = np.poly1d(fp1)
fx = np.linspace(100000, 700000, 100)

# '오차': absolute difference between the actual CCTV count and the fitted value.
data_result['오차'] = np.abs(data_result['소계'] - f1(data_result['인구수']))

# Rows ordered from the largest to the smallest difference.
df_sort = data_result.sort_values(by='오차', ascending=False)
df_sort.head()



# Scatter plot coloured by '오차', with the fitted line on top.
plt.figure(figsize=(14,10))
plt.scatter(data_result['인구수'], data_result['소계'],
            c=data_result['오차'], s=50)
plt.plot(fx, f1(fx), ls='dashed', lw=3, color='g')

# Label up to ten of the farthest points. Iterating positionally over
# head(10) avoids integer keys on a string-labelled index and also works
# when the table has fewer than ten rows.
top_outliers = df_sort.head(10)
for name, x, y in zip(top_outliers.index,
                      top_outliers['인구수'],
                      top_outliers['소계']):
    # Offset the label slightly right of and below its point.
    plt.text(x * 1.02, y * 0.98, name, fontsize=15)

plt.xlabel('인구수')
plt.ylabel('CCTV')   # the y values are the raw '소계' counts, not a ratio
plt.colorbar()
plt.grid()
plt.savefig('C:/Users/student/Desktop/python/P_4week/cctv_pop_plot.png')
plt.show()

결론 :

서울시의 다른 구와 비교했을 때, 강남구, 양천구, 서초구, 은평구는 인구수에 비해 CCTV가 많은 편이고,
송파구, 강서구, 도봉구, 마포구는 인구수에 비해 CCTV가 적은 편이다.

hyukstory 혁스토리

문돌이의 고군분투 개발 공부

이전 포스트

python(19) pandas 다루기

다음 포스트

python(20) pandas로 데이터 핸들링

python

handling cctv data

handling population data

CCTV data + 인구 데이터 합치기

합친 데이터 시각화

결론 :

python(19) pandas 다루기

python(21) open API + docx + GUI + tab 여러개 만들기

0개의 댓글