SW과정 머신러닝 1019(10)

JongseokLee·2021년 10월 19일

SW 빅데이터 실무과정 2021-0628~1206

목록 보기

52/58

SW과정 머신러닝 1019(10)

1. 공공데이터분석 by Pandas Code.

# Trends in sale prices of newly built private apartments nationwide.
%ls  # list the files in the current working directory
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore') # suppress warning messages
# First read uses pandas' default encoding; the next line repeats the same read with
# encoding='cp949'. NOTE(review): presumably the first call fails to decode this file — confirm.
pd.read_csv('전국_평균_분양가격(2013년_9월부터_2015년_8월까지).csv').head(2)
pd.read_csv('전국_평균_분양가격(2013년_9월부터_2015년_8월까지).csv', encoding='cp949').head(2)
pd.read_csv('주택도시보증공사_전국_평균_분양가격(2019년_12월).csv',encoding='cp949').head(2)
# Keep both datasets: df_first is the 2013-09..2015-08 file, df_last the 2019-12 file (per the file names).
df_first = pd.read_csv('전국_평균_분양가격(2013년_9월부터_2015년_8월까지).csv', encoding='cp949')
df_last = pd.read_csv('주택도시보증공사_전국_평균_분양가격(2019년_12월).csv',encoding='cp949')

# Count missing values per column.
df_last.isnull().sum()
df_last.isna().sum() # isna() performs the same missing-value check as isnull()

df_last['분양가격(㎡)'].sum() # original note: raises an error; the column must be converted to a numeric type first
df_last['분양가격(㎡)'].astype(int) # original note: fails because of blank (whitespace) strings
pd.to_numeric(df_last['분양가격(㎡)'],errors='coerce').isna().sum() # errors='coerce' turns unparseable values into NaN; original note: the NaN count rose from 277 to 378
# Store the numeric version of the price in a new column '분양가격'.
df_last['분양가격'] = pd.to_numeric(df_last['분양가격(㎡)'],errors='coerce')

df_last['분양가격(㎡)'].describe() # summary statistics of the raw column
df_last['분양가격'].describe() # original note lists count (number of values) / unique (distinct values) / top (most frequent value) / freq (how often top occurs). NOTE(review): those fields are what describe() reports for an object column such as the raw '분양가격(㎡)' above; a numeric column reports count/mean/std/min/quartiles/max — confirm which line this note belongs to

# Series.replace matches whole cell values only, so just the cells equal to '전체'
# would change here (the result is displayed, not stored).
df_last['규모구분'].replace('전체','') # whole-value match
# Build a compact '전용면적' label from '규모구분' using substring replacements.
# regex=False treats every pattern as a literal string regardless of the pandas version.
df_last['전용면적'] = df_last['규모구분'].str.replace('전용면적', '', regex=False)
df_last['전용면적'] = df_last['전용면적'].str.replace('초과', '~', regex=False)
df_last['전용면적'] = df_last['전용면적'].str.replace('이하', '', regex=False)
# Remove spaces instead of turning them into '~': '초과' already supplies the range
# separator, and replacing spaces with '~' would add a leading '~' and a doubled '~~'
# wherever the source text has a space (and would leave nothing for strip() to trim).
# NOTE(review): assumes values shaped like '전용면적 60㎡초과 85㎡이하' — confirm against the data.
df_last['전용면적'] = df_last['전용면적'].str.replace(' ', '', regex=False).str.strip()
df_last['전용면적']

# Drop the columns that are no longer needed ('규모구분', '분양가격(㎡)').
df_last.drop(['규모구분', '분양가격(㎡)'], axis=1, inplace=True) # inplace=True removes the columns from df_last itself

# Mean price per pyeong by region, highest first.
# NOTE(review): the column '평단분양가격' is not created anywhere in the code shown here;
# presumably it was added in a cell omitted from this post — confirm.
df_last.groupby('지역명')['평단분양가격'].mean().sort_values(ascending=False)

# Price per pyeong by exclusive area
# Price per pyeong by exclusive area and region
# Price per pyeong by region and exclusive area

# Mean by (region, exclusive area); unstack() moves the '전용면적' level into the columns.
temp = df_last.groupby(['지역명', '전용면적'])['평단분양가격'].mean().unstack()

# Price per pyeong by year and region

g = df_last.groupby(['연도','지역명'])['평단분양가격'].mean()
g
g.unstack() # the '지역명' index level moves up into the columns
g.unstack().stack()
g.unstack().T # same as transpose()
g.unstack().transpose() # rows and columns swap places
# Rename the column at position 4 to '평당분양가격'.
# Writing into df_last.columns.values mutates the Index's backing array in place, which
# pandas does not support: Index objects are meant to be immutable and label lookups that
# were already cached keep resolving the old name. rename() builds a fresh Index instead.
df_last.rename(columns={df_last.columns[4]: '평당분양가격'}, inplace=True)
df_last

# Every cell below uses the renamed column '평당분양가격'.

# Mean price per pyeong by region (pivot_table)
pd.pivot_table(df_last, index=['지역명'], values=['평당분양가격'], aggfunc='mean').head(2)

# Mean price per pyeong by exclusive area
pd.pivot_table(df_last, index=['전용면적'], values=['평당분양가격'], aggfunc='mean').head(2)

# Mean price per pyeong by exclusive area and region
pd.pivot_table(df_last, index=['전용면적', '지역명'], values=['평당분양가격'], aggfunc='mean').round(-1).head(2) # round(-1) rounds to the nearest ten

pd.pivot_table(df_last, index=['전용면적'], columns=['지역명'], values=['평당분양가격'], aggfunc='mean').round(-1).head(2) # same aggregation with regions spread across the columns

import matplotlib.pyplot as plt
g = df_last.groupby(['지역명'])['평당분양가격'].mean()
g
g.plot() # pandas draws the chart directly from the Series

g.plot(kind='bar', rot=0, figsize=(10,8)) # rot=0 sets the tick-label rotation to 0; kind selects the chart type

# Mean price by region, sorted from most to least expensive, drawn as a line chart
g = df_last.groupby(['지역명'])['평당분양가격'].mean().sort_values(ascending=False)
g.plot()
plt.show()

temp.plot.bar() # .plot.bar / .plot.barh can be used instead of kind=
temp.plot.bar(rot=0, figsize=(10,3))

# Mean price per pyeong by exclusive area ('전용면적'), drawn as a bar chart.
df_last.columns
# g is bound to the return value of plot.bar(), not to the aggregated Series.
g = df_last.groupby('전용면적')['평당분양가격'].mean().plot.bar(rot=0, figsize=(7,5))

# Inspect the first row and the column names.
df_last.head(1)
df_last.columns

2. Box plot

박스 플롯의 상자는 상위 25%와 하위 25%를 제외한 가운데 50%(1사분위수~3사분위수) 구간을 나타내며, 상자 안의 선이 중위값(중앙값)입니다.
errors는 pd.to_numeric에서 숫자로 변환할 수 없는 값을 어떻게 처리할지 정하는 옵션입니다.

errors='coerce'는 변환할 수 없는 값을 NaN으로 교체하고, errors='ignore'는 변환하지 않고 입력을 그대로 반환하며(스킵), errors='raise'는 에러를 발생시킵니다.

# Box plot of '평당분양가격' aggregated per year (pivot_table aggregates with the mean by default).
df_last.pivot_table(index='연도',values='평당분양가격').plot.box()

# One box per year; each box is drawn from that year's monthly values.
df_last.pivot_table(index='월', columns=['연도'],values='평당분양가격').plot.box()

# One box per year; each box is drawn from that year's per-area values.
df_last.pivot_table(index='전용면적', columns=['연도'], values='평당분양가격').plot.box()

# Rebinds temp: rows are months, columns are (year, exclusive area) pairs.
temp = df_last.pivot_table(index='월', columns=['연도', '전용면적'], values=['평당분양가격'])

# Wide figure with tick labels rotated 90 degrees.
temp.plot.box(figsize=(15,3),rot=90)
plt.show()

# The same table drawn as a bar chart.
temp.plot.bar()

import seaborn as sns
# Bar plot of price per pyeong by region on a 10x6 figure.
plt.figure(figsize=(10,6))
sns.barplot(data=df_last, x='지역명', y='평당분양가격')

# Bar plot of price per pyeong by year.
sns.barplot(data=df_last, x='연도', y='평당분양가격')
?sns.barplot # IPython help: shows everything that the Shift+Tab tooltip shows

# Bar plots on seaborn's bundled "tips" sample dataset.
import seaborn as sns
sns.set_theme(style="whitegrid")
tips = sns.load_dataset("tips")
ax = sns.barplot(x="day", y="total_bill", data=tips)

# order= fixes the order of the categories along the x axis.
ax = sns.barplot(x="time", y="tip",data=tips, order=["Dinner", "Lunch"])

# Use the NanumGothic font. NOTE(review): presumably so Korean labels render — confirm the font is installed.
sns.set(font='NanumGothic')
# One bar chart per region, wrapped at 4 facets per row.
sns.catplot(data=df_last, x='연도',  y='평당분양가격', kind='bar', col='지역명', col_wrap=4)

# Line plot of price per pyeong by year, one line per region.
plt.figure(figsize=(10,5))
sns.lineplot(data=df_last, x='연도', y='평당분양가격', hue='지역명')
plt.legend(bbox_to_anchor=(1.02,1)) # legend position: anchored at (1.02, 1)

# Same line plot split into one facet per region.
sns.relplot(data=df_last, x='연도', y='평당분양가격', kind='line', hue='지역명', col='지역명', col_wrap=4)

# Box plot of price per pyeong by year.
sns.boxplot(data=df_last, x='연도', y='평당분양가격')

# Box plot by year, split by exclusive area.
plt.figure(figsize=(13,5))
sns.boxplot(data=df_last, x='연도', y='평당분양가격', hue='전용면적')

# Violin plot of price per pyeong by year.
sns.violinplot(data=df_last, x='연도', y='평당분양가격')

# The original assignment had no right-hand side (a SyntaxError). None removes the
# column limit so every column is displayed.
# NOTE(review): the intended value is not recoverable from the post; None mirrors the
# "show everything" setting used for max_rows in the commented-out line below — confirm.
pd.options.display.max_columns = None
df_last.head()

#pd.options.display.max_rows=None # show all rows, including the hidden ones
#pd.options.display.max_columns=25 # show up to 25 columns
# Limit the output to 10 rows.
pd.options.display.max_rows=10
df_last

# Reshape df_first from wide to long: the '지역' column is kept as the identifier, every
# other column name goes into '기간' and its value into '평당분양가격'.
df_first_melt = pd.melt(
    df_first,
    id_vars='지역',
    var_name='기간',
    value_name='평당분양가격',
)

# Relabel the three resulting columns so that '지역' becomes '지역명'.
df_first_melt.columns = ['지역명', '기간', '평당분양가격']

# Split '기간' on '년': the part before it is the year, the part after it (minus '월') the month.
df_first_melt['연도'] = df_first_melt['기간'].str.split('년').str[0].astype(int)
df_first_melt['월'] = (
    df_first_melt['기간']
    .str.split('년')
    .str[1]
    .str.replace('월','')
    .astype(int)
)

# Keep only the rows whose '전용면적' is '전체', with the columns (지역명, 연도, 월, 평당분양가격).
# Step-by-step version of the single .loc call below:
# df_last_pre = df_last[df_last['전용면적'] == '전체']
# df_last_pre.columns
# df_last_pre.loc[:,['지역명', '연도', '월', '평당분양가격']]
df_last_pre = df_last.loc[df_last['전용면적'] == '전체',['지역명', '연도', '월', '평당분양가격']]

# Same four columns from the melted first dataset.
df_first_pre = df_first_melt[['지역명', '연도', '월', '평당분양가격']]
df_last_pre.head(1)
pd.options.display.max_rows=25
# Preview of df_first_pre stacked under df_first_melt (displayed, not stored).
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat is the
# supported way to stack frames and produces the same result.
pd.concat([df_first_melt, df_first_pre])

# Stack both cleaned datasets into one frame.
df = pd.concat([df_first_pre,df_last_pre])
df

# Bar chart of price per pyeong by year
#df.groupby('연도')['평당분양가격'].plot.bar()
sns.barplot(data=df,x='연도',y='평당분양가격')

JongseokLee

DataEngineer Lee.

이전 포스트

SW과정 머신러닝 1018(9)

다음 포스트

SW과정 머신러닝 1019(10)

SW 빅데이터 실무과정 2021-0628~1206

SW과정 머신러닝 1019(10)

1. 공공데이터분석 by Pandas Code.

2. Box plot

SW과정 머신러닝 1018(9)

SW과정 머신러닝 1020(11)

0개의 댓글