17_EDA 2

김정연·2023년 6월 23일

데이터스쿨

목록 보기

18/30

02.Analysis Seoul Crime

import pandas as pd
import numpy as np

# 데이터 읽기, 숫자값 천단위 ',' 처리
crime_raw_data = pd.read_csv("02. crime_in_Seoul.csv", thousands=",", encoding="euc-kr")
crime_raw_data.head()

crime_raw_data.info()

info()로 데이터 개요를 확인해 보니, rangeindex가 65534개인데 310개로 나와있다.

crime_raw_data["죄종"].unique()

#출력 : array(['살인', '강도', '강간', '절도', '폭력', nan], dtype=object)
#Nan이 존재

# null 찾기
crime_raw_data[crime_raw_data["죄종"].isnull()].head()

# null 제거
crime_raw_data = crime_raw_data[crime_raw_data["죄종"].notnull()]

다시 info()로 확인해보니 null값이 제거된 것을 확인할 수 있다.

✅Pandas Pivot Table 기본개념

index, columns, values, aggfunc, fill_value

#예시로 사용할 데이터 불러오기
df = pd.read_excel("02. sales-funnel.xlsx")

df.head()

# Name 컬럼을 인덱스로 설정
pd.pivot_table(df, index="Name")

#멀티 인덱스 설정
df.pivot_table(index=["Name", "Rep", "Manager"])

#values 설정
df.pivot_table(index=["Manager", "Rep"], values = "Price")

#Price컬럼 sum 연산 적용
df.pivot_table(index=["Manager", "Rep"], values="Price", aggfunc=[np.sum, len])

# Product를 컬럼으로 설정
df.pivot_table(index=["Manager", "Rep"], values="Price", columns="Product", aggfunc=np.sum)

# Nan 값 설정 : fill_value
df.pivot_table(index=["Manager", "Rep"], values="Price", columns="Product", aggfunc=np.sum, fill_value=0)

#aggfunc 2개 이상 설정
df.pivot_table(
    index=["Manager", "Rep", "Product"],
    values=["Price", "Quantity"],
    aggfunc=[np.sum, np.mean],
    fill_value=0,
    margins=True #총계(All) 추가
)

서울시 범죄 현황 정리

crime_station = crime_raw_data.pivot_table(index="구분", columns=["죄종", "발생검거"], aggfunc=[np.sum])
crime_station.head()

crime_station.columns

#다중 컬럼에서 특정 컬럼 제거
crime_station.columns = crime_station.columns.droplevel([0,1]) 
crime_station.columns

#정리된 데이터 확인
crime_station.head()

crime_station.index

현재 index는 경찰서 이름으로 되어 있다.
경찰서 이름으로 구 이름을 알아내야 한다.

Iterrows(),Google Maps API

Pandas에 잘 맞춰진 반복문용 명령 iterrows()

Pandas 데이터 프레임은 대부분 2차원

이럴 때 for문을 이용하면, n번째라는 지정을 반복해서 가독률이 떨어짐

Pandas 데이터 프레임으로 반복문을 만들 때 iterrows()옵션을 사용하면 편함

받을 때, 인덱스와 내용으로 나누어 받는 것만 주의

# 구글에서 내키 받기
import googlemaps
gmaps_key = "내키"
gmaps = googlemaps.Client(key=gmaps_key)

#단순 테스트 코드
tmp = gmaps.geocode("서울영등포경찰서", language="ko")
tmp

# 구별 , lat, lng 컬럼생성
crime_station["구별"] = np.nan
crime_station["lat"] = np.nan
crime_station["lng"] = np.nan
crime_station.head()

이제부터....

경찰서 이름에서 소속된 구이름 얻기
구이름과 위도 경도 정보를 저장할 준비
반복문을 이용해서 위 표의 Nan을 모두 채워준다
iterrows()

count=0 # 작업이 잘 되는지 확인하기 위한 수단

for idx, rows in crime_station.iterrows():
  station_name = "서울" + str(idx) + "경찰서"
  tmp = gmaps.geocode(station_name, language="ko")

  tmp_gu = tmp[0].get("formatted_address")

  lat = tmp[0].get("geometry")["location"]["lat"]
  lng = tmp[0].get("geometry")["location"]["lng"]

  crime_station.loc[idx, "lat"] = lat
  crime_station.loc[idx, "lng"] = lng
  crime_station.loc[idx, "구별"] = tmp_gu.split()[2]

  print(count)
  count += 1

# 컬럼명이 정리된 데이터 확인
crime_station.columns = tmp
crime_station.head()

구별 데이터로 정리하기

# index_col "구분"을 인덱스 컬럼으로 설정
crime_anal_station = pd.read_csv("02. crime_in_Seoul_raw.csv", index_col=0, encoding="utf-8")
crime_anal_station.head()

crime_anal_gu = pd.pivot_table(crime_anal_station, index="구별", aggfunc=np.sum)

del crime_anal_gu["lat"]
crime_anal_gu.drop("lng", axis=1, inplace=True)

crime_anal_gu.head()

# 검거율 생성
# 하나의 컬럼을 다른 컬럼으로 나누기

crime_anal_gu["강도검거"] / crime_anal_gu["강도발생"]

# 다수의 컬럼을 다른 컬럼으로 나누기
crime_anal_gu[["강도검거", "살인검거"]].div(crime_anal_gu["강도발생"], axis=0).head()

target = ["강간검거율", "강도검거율", "살인검거율", "절도검거율", "폭력검거율"]
num = ["강간검거", "강도검거", "살인검거", "절도검거", "폭력검거"]
den = ["강간발생", "강도발생", "살인발생", "절도발생", "폭력발생"]

crime_anal_gu[target] = crime_anal_gu[num].div(crime_anal_gu[den].values)*100
crime_anal_gu.head()

# 필요없는 컬럼 제거

crime_anal_gu.drop(["강간검거", "강도검거", "살인검거", "절도검거", "폭력검거"], axis=1, inplace=True)
crime_anal_gu.head()

# 100보다 큰 숫자 찾아서 바꾸기

crime_anal_gu[crime_anal_gu[target]>100] = 100
crime_anal_gu.head()

from os import replace
# 컬럼 이름 변경

crime_anal_gu.rename(columns={"강간발생":"강간", "살인발생":"살인", "절도발생":"절도", "강도발생":"강도", "폭력발생":"폭력"}, inplace=True)
crime_anal_gu.head()

범죄 데이터 정렬을 위한 데이터 정리

# 정규화
col = ["살인", "강도", "강간", "절도", "폭력"]
crime_anal_norm = crime_anal_gu[col] / crime_anal_gu[col].max()
crime_anal_norm.head()

# 검거율 추가
col2 = ["강간검거율", "강도검거율", "살인검거율", "절도검거율", "폭력검거율"]
crime_anal_norm[col2] =crime_anal_gu[col2]
crime_anal_norm.head()

# 구별 CCTV 자료에서 인구수와 CCTV수 추가
result_CCTV = pd.read_csv("01. CCTV_result.csv", index_col="구별", encoding="utf-8")

crime_anal_norm[["인구수", "CCTV"]] = result_CCTV[["인구수", "소계"]]
crime_anal_norm.head()

#정규화된 범죄발생 건수 전체의 평균을 구해서 범죄 컬럼 대표값으로 사용

col= ["강간", "강도", "살인", "절도", "폭력"]
crime_anal_norm["범죄"] = np.mean(crime_anal_norm[col], axis=1) #axis=1 행, axis=0 열
crime_anal_norm.head()

# 검거율의 평균을 구해서 검거 컬럼의 대표값으로 사용

col = ["강간검거율", "강도검거율", "살인검거율", "절도검거율", "폭력검거율"]
crime_anal_norm["검거"] = np.mean(crime_anal_norm[col], axis=1) #axis=1 행을 따라서 연산하는 옵션
crime_anal_norm.head()

# np.mean 개념
np.array([0.384615,	1.000000,	1.000000,	1.000000,	1.000000])
#출력 : array([0.384615, 1.      , 1.      , 1.      , 1.      ])

np.mean(np.array([0.384615,	1.000000,	1.000000,	1.000000,	1.000000]))
#출력 : 0.876923

✅ Seaborn 개념 익히기

예제 1

np.linspace(0, 14, 100)

#출력:
array([ 0.        ,  0.14141414,  0.28282828,  0.42424242,  0.56565657,
        0.70707071,  0.84848485,  0.98989899,  1.13131313,  1.27272727,
        1.41414141,  1.55555556,  1.6969697 ,  1.83838384,  1.97979798,
        2.12121212,  2.26262626,  2.4040404 ,  2.54545455,  2.68686869,
        2.82828283,  2.96969697,  3.11111111,  3.25252525,  3.39393939,
        3.53535354,  3.67676768,  3.81818182,  3.95959596,  4.1010101 ,
        4.24242424,  4.38383838,  4.52525253,  4.66666667,  4.80808081,
        4.94949495,  5.09090909,  5.23232323,  5.37373737,  5.51515152,
        5.65656566,  5.7979798 ,  5.93939394,  6.08080808,  6.22222222,
        6.36363636,  6.50505051,  6.64646465,  6.78787879,  6.92929293,
        7.07070707,  7.21212121,  7.35353535,  7.49494949,  7.63636364,
        7.77777778,  7.91919192,  8.06060606,  8.2020202 ,  8.34343434,
        8.48484848,  8.62626263,  8.76767677,  8.90909091,  9.05050505,
        9.19191919,  9.33333333,  9.47474747,  9.61616162,  9.75757576,
        9.8989899 , 10.04040404, 10.18181818, 10.32323232, 10.46464646,
       10.60606061, 10.74747475, 10.88888889, 11.03030303, 11.17171717,
       11.31313131, 11.45454545, 11.5959596 , 11.73737374, 11.87878788,
       12.02020202, 12.16161616, 12.3030303 , 12.44444444, 12.58585859,
       12.72727273, 12.86868687, 13.01010101, 13.15151515, 13.29292929,
       13.43434343, 13.57575758, 13.71717172, 13.85858586, 14.        ])

x = np.linspace(0, 14, 100)
y1 = np.sin(x)
y2 = 2 * np.sin(x + 0.5)
y3 = 3 * np.sin(x + 1)
y4 = 4 * np.sin(x + 1.5)



plt.figure(figsize=(10,6))
plt.plot(x, y1, x, y2, x, y3, x, y4)
plt.show()

# sns.set_style()
# "white", "whitegrid", "dark", "darkgrid"

sns.set_style("darkgrid")
plt.figure(figsize=(10,6))
plt.plot(x, y1, x, y2, x, y3, x, y4)
plt.show()

예제2: Seaborn tips data
- boxplot
- swarmplot
- lmplot

tips = sns.load_dataset("tips")
tips

# boxplot
plt.figure(figsize=(8,6))
sns.boxplot(x=tips["total_bill"])
plt.show()

# boxplot
plt.figure(figsize=(8,6))
sns.boxplot(x=tips["day"], y=tips["total_bill"])
plt.show()

# boxplot hue, palette option
# hue: 카테고리 데이터를 표현하는 옵션

plt.figure(figsize=(8,6))
sns.boxplot(x="day", y="total_bill", data=tips, hue="smoker", palette="Set3")
plt.show()

#swarmplot
# color: 0-1 사이 검은색부터 흰색사이 값을 조절

plt.figure(figsize=(8, 6))
sns.swarmplot(x="day", y="total_bill", data=tips, color="0.5")
plt.show()

# boxplot with swarmplot

plt.figure(figsize=(8,6))
sns.boxplot(x="day", y="total_bill", data=tips)
sns.swarmplot(x="day", y="total_bill", data=tips, color="0.25")
plt.show()

# lmplot: total_bill 과 tip 사이 관계 파악

sns.set_style("darkgrid")
sns.lmplot(x="total_bill", y="tip", data=tips, height=7) #size => height 바뀜
plt.show()

# hue option

sns.set_style("darkgrid")
sns.lmplot(x="total_bill", y="tip", data=tips, height=7, hue="smoker")
plt.show()

예제 3 : flights data
- heatmap

flights = sns.load_dataset("flights")
flights.head()

# pivot
# index, columns, values

flights = flights.pivot_table(index="month", columns="year", values="passengers")
flights.head()

# heatmaps

plt.figure(figsize=(10, 8))
sns.heatmap(data=flights, annot=True, fmt="d") # annot=True 데이터 값 표시, fmt="d" 정수형 표현
plt.show()

# colormap

plt.figure(figsize=(10, 8))
sns.heatmap(flights, annot=True, fmt="d", cmap="YlGnBu")
plt.show()

예제 4 : iris data
- pairplot

iris = sns.load_dataset("iris")
iris.tail()

# pairplot

sns.set_style("ticks")
sns.pairplot(iris)
plt.show()

# hue option

sns.pairplot(iris, hue="species")
plt.show()

# 원하는 컬럼만 pairplot

sns.pairplot(iris,
             x_vars=["sepal_width", "sepal_length"],
             y_vars=["petal_width", "petal_length"]
             )
plt.show()

예제5 : anscombe data
- lmplot

anscombe = sns.load_dataset("anscombe")
anscombe.tail()

anscombe["dataset"]. unique()

#출력 : array(['I', 'II', 'III', 'IV'], dtype=object)

sns.set_style("darkgrid")
sns.lmplot(x="x", y="y", data=anscombe.query("dataset == 'I'"), ci = None, height=7) #ci신뢰구간 선택
plt.show()

sns.set_style("darkgrid")
sns.lmplot(x="x", y="y", data=anscombe.query("dataset == 'I'"), ci = None, height=7, scatter_kws={"s":80}) #ci신뢰구간 선택
plt.show()

# order option
sns.set_style("darkgrid")
sns.lmplot(
    x="x",
    y="y",
    data=anscombe.query("dataset == 'II'"),
    order=1,
    ci = None,
    height=7,
    scatter_kws={'s':80}) #ci신뢰구간 선택
plt.show()

# order option
sns.set_style("darkgrid")
sns.lmplot(
    x="x",
    y="y",
    data=anscombe.query("dataset == 'II'"),
    order=2,
    ci = None,
    height=7,
    scatter_kws={'s':80}) #ci신뢰구간 선택
plt.show()

# outlier option - 동떨어진 데이터 다루기
sns.set_style("darkgrid")
sns.lmplot(
    x="x",
    y="y",
    data=anscombe.query("dataset == 'III'"),
    ci = None,
    height=7,
    scatter_kws={'s':80}) #ci신뢰구간 선택
plt.show()

# outlier option - 동떨어진 데이터 다루기
sns.set_style("darkgrid")
sns.lmplot(
    x="x",
    y="y",
    data=anscombe.query("dataset == 'III'"),
    robust=True,
    ci = None,
    height=7,
    scatter_kws={'s':80}) #ci신뢰구간 선택
plt.show()

서울시 범죄 현황 데이터 시각화

crime_anal_norm.head()

# "인구수", "CCTV"와 "살인", "강도"의 상관관계 확인

def drawGraph():
  sns.pairplot(
      data = crime_anal_norm,
      x_vars=["인구수", "CCTV"],
      y_vars=["살인", "강도"],
      kind="reg",
      height=4
  )
  plt.show()
drawGraph()

# "인구수", "CCTV"와 "절도검거율", "강도검거율"의 상관관계 확인
def drawGraph():
  sns.pairplot(
      data = crime_anal_norm,
      x_vars=["인구수", "CCTV"],
      y_vars=["절도검거율", "강도검거율"],
      kind="reg",
      height=4
  )
  plt.show()
drawGraph()

# 검거율 heatmap
# "검거" 컬럼을 기준으로 정렬

def drawGraph():

  #데이터 프레임생성
  target_col = ["강간검거율","강도검거율","살인검거율","절도검거율","폭력검거율","검거"]
  crime_anal_norm_sort = crime_anal_norm.sort_values(by="검거", ascending=False)


  #그래프 설정
  plt.figure(figsize=(10,10))
  sns.heatmap(
      data=crime_anal_norm_sort[target_col],
      annot=True, #데이터값 표현
      fmt="f", #d:정수, f:실수
      linewidths=0.5, #네모칸끼리의 간격설정
      cmap="RdPu"
  )
  plt.title("범죄 검거 비율(정규화된 검거의 합으로 정렬)")
  plt.show()

drawGraph()

# 범죄발생 건수 heatmap
# "범죄" 컬럼을 기준으로 정렬

def drawGraph():


  #데이터 프레임 생성
  target_col=["살인", "강도", "강간", "절도", "폭력", "범죄"]
  crime_anal_norm_sort = crime_anal_norm.sort_values(by="범죄", ascending=False)

  #그래프 설정
  plt.figure(figsize=(10,10))
  sns.heatmap(
      data=crime_anal_norm_sort[target_col],
      annot=True,
      fmt = "f",
      linewidths=0.5,
      cmap="RdPu"
  )
  plt.title("범죄 비율(정규화된 발생 건수로 정렬)")
  plt.show()

drawGraph()

✅ Folium 기본개념

import folium
import pandas as pd
import json

folium.Map()
- 위도경도찾기 : 구글어스 이용

#save()
m.save("folium.html")
!ls

#출력 :
'01. CCTV_result.csv'
'01. Seoul_CCTV.csv'
'01. Seoul_Population.xls'
'02. crime_in_Seoul_1st.csv'
'02. crime_in_Seoul.csv'
'02. crime_in_Seoul_final.csv'
'02. crime_in_Seoul_location.csv'
'02. crime_in_Seoul_raw.csv'
'02. sales-funnel.xlsx'
'02. skorea_municipalities_geo_simple.json'
'02. us-states.json'
'02. US_Unemployment_Oct2012.csv'
'02. 서울특별시 동작구_주택유형별 위치 정보 및 세대수 현황_20210825.csv'
 folium.html
 sample_data


m = folium.Map(
    location=[37.43087134,127.0233423],
    zoom_start=14, # 0-18정도로
    tiles="Stamen WaterColor"
    )
     
m

tiles 종류
https://deparkes.co.uk/2016/06/10/folium-map-tiles/ 참고

folium.Marker()

지도에 마커 생성

folium.Icon()

https://fontawesome.com/search?m=free&o=r 참고

https://getbootstrap.com/docs/3.3/components/ 참고

m = folium.Map(
    location=[37.43087134,127.023342], #대공원
    zoom_start=14,  # 0-18정도로
    tiles="OpenStreetMap"
    )
     
     
# icon basic
folium.Marker(
    (37.43564921,127.00594648),
    icon=folium.Icon(color="black", icon="info-sign")
    ).add_to(m) #대공원역


#icon icon_color
folium.Marker(
    location=[37.43087134,127.023342], #대공원
    popup="<b>LandMark</b>",#볼드체
    icon=folium.Icon(
        color="red",
        icon_color="pink",
        icon="cloud"
        )
).add_to(m)


#tooltip
folium.Marker(
    location=[37.43087134,127.023342], #대공원
    popup="<b>LandMark</b>",#볼드체
    tooltip="<i>서울대공원</i>"  #이탈리아체
).add_to(m)


# html
folium.Marker(
    location=[37.43438407,127.00991914], #대공원
    popup="<a href='https://zero-base.co.kr/' target=_'blink'>제로베이스</a>",
    tooltip="<i>광장</i>"
).add_to(m)


#Icon custom
folium.Marker(
    location=[37.43819238,127.00545875], #과천과학박물관
    popup="과천과학박물관",
    tooltip="Icon custom",
    icon=folium.Icon(
        color="purple",
        icon_color="green",
        icon="android",
        angle=50,   #아이콘의 기울어진 각도
        prefix="fa") # "android"같은 경우, prefix="fa" 설정
).add_to(m)

m

folium.ClickForMarker()
- 지도위에 마우스로 클릭했을 때, 마커를 생성해준다.

m = folium.Map(
    location=[37.43087134,127.023342], #대공원
    zoom_start=14,
    tiles="OpenStreetMap"
    )

m.add_child(folium.ClickForMarker(popup="ClickForMarker"))

folim.LatLngPopup()
- 지도를 마우스로 클릭했을때 위도 경도 정보를 반환해준다.

from folium.features import LatLngPopup
m = folium.Map(
    location=[37.43087134,127.023342], #대공원
    zoom_start=14,
    tiles="OpenStreetMap"
    )

m.add_child(folium.LatLngPopup())

folium.Circle(), folium.CircleMarker()

m = folium.Map(
    location=[37.43087134,127.023342], #대공원
    zoom_start=14,
    tiles="OpenStreetMap"
    )
# Circle
folium.Circle(
    location=[37.43200661,127.01796124], #국립현대미술관
    radius=100,
    fill=True,
    color="#5934eb",
    fill_color="red",
    popup="Circle Popup",
    tooltip="Circle Tooltip"
).add_to(m)


# CircleMarker
folium.CircleMarker(
    location=[37.42935359,127.02267355], #캠핑장
    radius=30,
    fill=True,
    color="#5934eb",
    fill_color="red",
    popup="CircleMarker Popup",
    tooltip="CircleMarker Tooltip"
).add_to(m)
m


#둘이 별로 차이가 없으니 필요한 것을 사용하면 된다.

folium.Choropleth

import json
state_data = pd.read_csv("02. US_Unemployment_Oct2012.csv")
state_data.head(2)

m = folium.Map([43, -102], zoom_start=3)

folium.Choropleth(
    geo_data="02. us-states.json", #경계선 죄표값이 담긴 데이터
    data=state_data, #Series or DataFrame
    columns=["State", "Unemployment"],
    key_on="feature.id",
    fill_color="BuPu",
    fill_opacity=1, #0-1
    line_opacity=1, #0-1
    legend_name="Unemployment rate (%)"
).add_to(m)

m

참고

https://velog.io/@eodud0582/Folium

https://nbviewer.org/github/python-visualization/folium/tree/main/examples/

서울시 범죄 현황에 대한 지도 시각화

import json
import folium
import pandas as pd
import numpy as np


geo_path = "02. skorea_municipalities_geo_simple.json"
geo_str = json.load(open(geo_path, encoding="utf-8"))


crime_anal_norm = pd.read_csv("02. crime_in_Seoul_final.csv", index_col=0, encoding="utf-8")
crime_anal_norm.tail(2)

# 살인발생 건수 지도 시각화

my_map = folium.Map(
    location=[37.5502, 126.982],
    zoom_start=11,
    tiles="Stamen Toner"
)

folium.Choropleth(
    geo_data=geo_str, #우리나라 경계선 좌표값이 담긴 데이터
    data=crime_anal_norm["살인"],
    columns=[crime_anal_norm.index, crime_anal_norm["살인"]],
    key_on="feature.id",
    fill_color="PuRd",
    fill_opacity=0.7,
    line_opacity=0.2,
    legend_name="정규화된 살인 발생 건수"
).add_to(my_map)

my_map

# 인구대비 범죄 발생 건수

tmp_criminal = crime_anal_norm["범죄"] / crime_anal_norm["인구수"]


my_map = folium.Map(
    location=[37.5502, 126.982],
    zoom_start=11,
    tiles="Stamen Toner"
)

folium.Choropleth(
    geo_data=geo_str, #우리나라 경계선 좌표값이 담긴 데이터
    data=tmp_criminal,
    columns=[crime_anal_norm.index, tmp_criminal],
    key_on="feature.id",
    fill_color="PuRd",
    fill_opacity=0.7,
    line_opacity=0.2,
    legend_name="인구 대비 범죄 발생 건수"
).add_to(my_map)

my_map