EDA - 02. Analysis Seoul Crime (w. Seaborn, Folium)

솔비·2024년 1월 2일
0

mini project


🌟 개요 : 서울시 범죄현황 분석
🌟 목표

  1. 경찰서별 범죄별 데이터 피봇팅
  2. 경찰서명 <> 구별 데이터 정리
  3. 범죄 데이터 정규화
  4. 상관관계 시각화
  5. 지도 시각화

프로젝트 시작 전 한글 및 마이너스 부호 깨짐현상 세팅

from matplotlib import font_manager as fm
from matplotlib import pyplot as plt

# Notebook plotting setup: render figures inline and switch the default font
# so that Korean text is no longer drawn broken.
get_ipython().run_line_magic("matplotlib", "inline")
plt.rcParams["font.family"] = "Malgun Gothic"

# Fix the broken rendering of the minus sign on axes.
import matplotlib as mpl
mpl.rc("axes", unicode_minus=False)



관서별 범죄현황 데이터 가공


  • crime_raw = pd.read_csv("../data/02. crime_in_Seoul.csv",thousands=",",encoding = "euc-kr")
    crime csvํŒŒ์ผ ๋ถˆ๋Ÿฌ์˜ค๊ธฐ

    thousands = ","
    ์ฒœ๋‹จ์œ„์—์„œ , ๋กœ ๊ตฌ๋ถ„ (์ˆซ์ž๊ฐ’์„ ๋ฌธ์ž๊ฐ’์œผ๋กœ ์ธ์‹ํ•  ์ˆ˜ ์žˆ์–ด์„œ ์ˆซ์ž๊ฐ’์œผ๋กœ ๋‚˜ํƒ€๋‚  ์ˆ˜ ์žˆ๋„๋ก ์„ธํŒ…)



  • crime_raw.info()
    ๋ฐ์ดํ„ฐ ๊ฐœ์š”ํ™•์ธ

    ๐Ÿงท RangeIndex: 65534 ์ด๋‚˜, data๋Š” 310
    ->null data ์ œ์™ธํ•„์š”



  • crime_raw["์ฃ„์ข…"].unique()

    ํŠน์ •์ปฌ๋Ÿผ์—์„œ unique์กฐ์‚ฌ
    nan๊ฐ’์ด ํฌํ•จ๋˜์–ด์žˆ์Œ



  • crime_raw_data = crime_raw[crime_raw["์ฃ„์ข…"].notnull()]

    null ์ œ์™ธํ•œ ๋ฐ์ดํ„ฐ๋งŒ notnull ๋ฉ”์„œ๋“œ๋ฅผ ์ด์šฉํ•˜์—ฌ crime_raw_data ๋ณ€์ˆ˜์— ๋‹ด๊ธฐ



pivot table๋กœ ์ •๋ฆฌ


# Pivot the cleaned records into one row per index value and one column per
# pair of values from the two `columns` fields, summing every numeric column.
# The aggregation is given by name ("sum") instead of the NumPy callable:
# passing np.sum to pandas is deprecated (FutureWarning on pandas >= 2.1),
# and the resulting top column level is still labelled "sum".
# NOTE(review): the three column-name literals look mis-encoded in this copy;
# they presumably stand for the Korean CSV headers (구분 / 죄종 / 발생검거) -
# confirm against the file header before running.
crime_station = crime_raw_data.pivot_table(
    index="๊ตฌ๋ถ„",
    columns=["์ฃ„์ข…", "๋ฐœ์ƒ๊ฒ€๊ฑฐ"],
    aggfunc=["sum"],
)
crime_station.head()

pandas multi index ์ œ๊ฑฐ

์œ„ ๋ฐ์ดํ„ฐํ”„๋ผ์ž„์˜ columns์— ์ค‘๋ณต๊ฐ’์ธ 'sum','๊ฑด์ˆ˜' ์ปฌ๋Ÿผ์„ droplevel๋ฉ”์„œ๋“œ๋กœ ์‚ญ์ œํ•œ๋‹ค.



Google Maps๋ฅผ ์ด์šฉํ•œ ๋ฐ์ดํ„ฐ ์ •๋ฆฌ


์œ„ ๊ณผ์ •๊นŒ์ง€ ์ง„ํ–‰ํ•œ ๊ฐ€๊ณต์—์„œ
index๋Š” ๊ฒฝ์ฐฐ์„œ ์ด๋ฆ„์œผ๋กœ ๋˜์–ด์žˆ๊ณ , ๊ฒฝ์ฐฐ์„œ ์ด๋ฆ„์œผ๋กœ ๊ตฌ ์ด๋ฆ„์„ ์•Œ์•„๋‚ด์•ผํ•œ๋‹ค.

import os

import googlemaps

# Take the API key from the GOOGLE_MAPS_API_KEY environment variable when it is
# set, so the real credential does not have to be pasted into the notebook.
# The original placeholder literal is kept as the fallback value.
gmaps_key = os.environ.get("GOOGLE_MAPS_API_KEY", "googlemaps key๊ฐ’")
gmaps = googlemaps.Client(key=gmaps_key)

google maps 모듈설치

gmaps.geocode("서울영등포경찰서",language="ko") 실행 시 반환되는 결과

[{'address_components': [{'long_name': '영등포구',
    'short_name': '영등포구',
    'types': ['political', 'sublocality', 'sublocality_level_1']},
   {'long_name': '서울특별시',
    'short_name': '서울특별시',
    'types': ['administrative_area_level_1', 'political']},
   {'long_name': '대한민국',
    'short_name': 'KR',
    'types': ['country', 'political']}],
  'formatted_address': '대한민국 서울특별시 영등포구',
  'geometry': {'bounds': {'northeast': {'lat': 37.556286, 'lng': 126.9498867},
    'southwest': {'lat': 37.4850011, 'lng': 126.8779285}},
   'location': {'lat': 37.5223245, 'lng': 126.9101692},
   'location_type': 'APPROXIMATE',
   'viewport': {'northeast': {'lat': 37.556286, 'lng': 126.9498867},
    'southwest': {'lat': 37.4850011, 'lng': 126.8779285}}},
  'partial_match': True,
  'place_id': 'ChIJk_FAvx2ffDURQAuECsjy0Cw',
  'types': ['political', 'sublocality', 'sublocality_level_1']}]

ํ•„์š”ํ•œ ์ •๋ณด์ธ lat lng formatted_address ๋ฐ์ดํ„ฐ๋ฅผ ๋ฝ‘์•„์„œ crime_station์— ์ถ”๊ฐ€ํ•ด์•ผํ•จ.



  • crime_station["๊ตฌ๋ณ„"] = np.nan
    crime_station["lat"] = np.nan
    crime_station["lng"] = np.nan
    crime_station์— ์ปฌ๋Ÿผ์ถ”๊ฐ€



# Geocode every police station in crime_station and store the district name
# plus latitude/longitude on its row.
#
# Stations for which the geocoder returns nothing are skipped (their three
# columns keep their existing values) instead of crashing on tmp[0] with an
# IndexError; each skip is reported so the row can be dropped afterwards.
# NOTE(review): the string literals below look mis-encoded in this copy; they
# presumably are the Korean "서울" prefix, "경찰서" suffix and "구별" column -
# confirm before running.
for cnt, idx in enumerate(crime_station.index):
    crime_name = "์„œ์šธ" + str(idx) + "๊ฒฝ์ฐฐ์„œ"
    tmp = gmaps.geocode(crime_name, language="ko")

    if not tmp:
        print(cnt, "no geocoding result for", crime_name)
        continue

    tmp_location = tmp[0].get("geometry")["location"]
    crime_station.loc[idx, "lat"] = tmp_location["lat"]
    crime_station.loc[idx, "lng"] = tmp_location["lng"]

    # The district is taken as the third whitespace-separated token of the
    # formatted address; shorter addresses leave the column untouched.
    tmp_address = (tmp[0].get("formatted_address") or "").split()
    if len(tmp_address) > 2:
        crime_station.loc[idx, "๊ตฌ๋ณ„"] = tmp_address[2]
    else:
        print(cnt, "unexpected address format for", crime_name)

    print(cnt)

๋ฐ์ดํ„ฐ ์ถ”๊ฐ€

์ด ๊ณผ์ •์—์„œ ์ด๋ฏธ ์‚ฌ๋ผ์ง„ ๊ฒฝ์ฐฐ์„œ์˜ ๊ฒฝ์šฐ ๋ฐ์ดํ„ฐ๊ฐ€ ๋ถˆ๋Ÿฌ์™€ ์ง€์ง€ ์•Š์•„ crime_station์—์„œ ํ•ด๋‹น ๊ตฌ๋ถ„(๊ฒฝ์ฐฐ์„œ์ด๋ฆ„) drop์„ ์ด์šฉํ•˜์—ฌ ์‚ญ์ œํ•˜์˜€์Œ



# Build flat column labels by concatenating the level-0 and level-1 value of
# every column. The level values are fetched once up front instead of being
# recomputed through get_level_values() on every loop iteration.
level_0_labels = crime_station.columns.get_level_values(0)
level_1_labels = crime_station.columns.get_level_values(1)
tmp = [upper + lower for upper, lower in zip(level_0_labels, level_1_labels)]
tmp

์œ„ ๋ฐ์ดํ„ฐ ๋ณ‘ํ•ฉ๋ž€๊ณผ ํ•˜๋‹จ ์ปฌ๋Ÿผ๋ช… ํ•ฉ์น˜๊ณ 



  • crime_station.columns = tmp
    crime_station.to_csv("../data/02. crime_in_Seoul_raw.csv",sep=",",encoding="utf-8")
    ์ปฌ๋Ÿผ๋ณ€ ๋ณ€๊ฒฝ ํ›„ csvํŒŒ์ผ ์ €์žฅ



구별로 데이터 정리


  • crime_anal_station = pd.read_csv("../data/02. crime_in_Seoul_raw.csv", index_col = 0, encoding = "utf-8")
    csv 파일 불러오기



# Aggregate the per-station table by the grouping column, summing every
# remaining column. "sum" is passed by name because handing np.sum to pandas is
# deprecated (FutureWarning on pandas >= 2.1); the result is the same.
# NOTE(review): the index literal looks mis-encoded in this copy; presumably
# the Korean column "구별" - confirm against the CSV header.
crime_anal_gu = pd.pivot_table(crime_anal_station, index="๊ตฌ๋ณ„", aggfunc="sum")

# Drop both coordinate columns in one call.
crime_anal_gu = crime_anal_gu.drop(columns=["lat", "lng"])

crime_anal_gu.head()

pivot table๋กœ ๊ตฌ๋ณ„๋กœ index ๋ณ€๊ฒฝ ํ›„ ํ•„์š”์—†๋Š” ๊ฒฝ๋„ ์œ„๋„ ์ปฌ๋Ÿผ ์‚ญ์ œ



# Add five percentage columns: each `num` column divided by the `den` column at
# the same list position, times 100, stored under the matching `target` name.
# NOTE(review): the column-name literals look mis-encoded in this copy;
# presumably arrests (num), occurrences (den) and arrest rates (target) for
# five crime types - confirm against the data.
num = ["๊ฐ•๊ฐ„๊ฒ€๊ฑฐ","๊ฐ•๋„๊ฒ€๊ฑฐ","์‚ด์ธ๊ฒ€๊ฑฐ","์ ˆ๋„๊ฒ€๊ฑฐ","ํญ๋ ฅ๊ฒ€๊ฑฐ"]
den = ["๊ฐ•๊ฐ„๋ฐœ์ƒ","๊ฐ•๋„๋ฐœ์ƒ","์‚ด์ธ๋ฐœ์ƒ","์ ˆ๋„๋ฐœ์ƒ","ํญ๋ ฅ๋ฐœ์ƒ"]
target = ["๊ฐ•๊ฐ„๊ฒ€๊ฑฐ์œจ","๊ฐ•๋„๊ฒ€๊ฑฐ์œจ","์‚ด์ธ๊ฒ€๊ฑฐ์œจ","์ ˆ๋„๊ฒ€๊ฑฐ์œจ","ํญ๋ ฅ๊ฒ€๊ฑฐ์œจ"]

# The denominator is passed as a bare array so the division is positional
# rather than aligned on the (different) column labels.
numerators = crime_anal_gu[num]
denominators = crime_anal_gu[den].values
crime_anal_gu[target] = numerators.div(denominators) * 100
crime_anal_gu.head()

๊ฐ ๊ฒ€๊ฑฐ์œจ ์ปฌ๋Ÿผ์ถ”๊ฐ€

์ปฌ๋Ÿผ ๋‚˜๋ˆ„๊ธฐ
1. ํ•˜๋‚˜์˜ ์ปฌ๋Ÿผ์„ ๋‹ค๋ฅธ์ปฌ๋Ÿผ์œผ๋กœ ๋‚˜๋ˆ„๊ธฐ
crime_anal_gu["๊ฐ•๋„๊ฒ€๊ฑฐ"]/crime_anal_gu["๊ฐ•๋„๋ฐœ์ƒ"]
2. 다수의 컬럼을 다른 컬럼으로 나누기
crime_anal_gu[["๊ฐ•๋„๊ฒ€๊ฑฐ","์‚ด์ธ๊ฒ€๊ฑฐ"]].div(crime_anal_gu["๊ฐ•๋„๋ฐœ์ƒ"],axis = 0)
3. 다수의 컬럼을 다수의 컬럼으로 각각 나누기
num = ["๊ฐ•๊ฐ„๊ฒ€๊ฑฐ","๊ฐ•๋„๊ฒ€๊ฑฐ","์‚ด์ธ๊ฒ€๊ฑฐ","์ ˆ๋„๊ฒ€๊ฑฐ","ํญ๋ ฅ๊ฒ€๊ฑฐ"]
den = ["๊ฐ•๊ฐ„๋ฐœ์ƒ","๊ฐ•๋„๋ฐœ์ƒ","์‚ด์ธ๋ฐœ์ƒ","์ ˆ๋„๋ฐœ์ƒ","ํญ๋ ฅ๋ฐœ์ƒ"]
crime_anal_gu[num].div(crime_anal_gu[den].values)



  • del crime_anal_gu["๊ฐ•๊ฐ„๊ฒ€๊ฑฐ"]
    del crime_anal_gu["๊ฐ•๋„๊ฒ€๊ฑฐ"]
    del crime_anal_gu["์‚ด์ธ๊ฒ€๊ฑฐ"]
    del crime_anal_gu["์ ˆ๋„๊ฒ€๊ฑฐ"]
    del crime_anal_gu["ํญ๋ ฅ๊ฒ€๊ฑฐ"]

필요없는 검거컬럼 삭제



  • crime_anal_gu[crime_anal_gu[target] > 100 ] = 100
    100보다 큰 데이터 100으로 변경



  • crime_anal_gu.rename( columns={"๊ฐ•๊ฐ„๋ฐœ์ƒ" : "๊ฐ•๊ฐ„","๊ฐ•๋„๋ฐœ์ƒ" : "๊ฐ•๋„","์‚ด์ธ๋ฐœ์ƒ" : "์‚ด์ธ","์ ˆ๋„๋ฐœ์ƒ" : "์ ˆ๋„","ํญ๋ ฅ๋ฐœ์ƒ" : "ํญ๋ ฅ"}, inplace = True )
    컬럼이름변경



๋ฐ์ดํ„ฐ ์ตœ์ข…์ •๋ฆฌ


  • col = ["๊ฐ•๊ฐ„","๊ฐ•๋„","์‚ด์ธ","์ ˆ๋„","ํญ๋ ฅ"]
    crime_anal_norm = crime_anal_gu[col] / crime_anal_gu[col].max()
    정규화 : 각 컬럼을 해당 컬럼의 최대값으로 나눔 (최고값은 1, 나머지 값은 0~1 사이)



  • col2 = ["๊ฐ•๊ฐ„๊ฒ€๊ฑฐ์œจ","๊ฐ•๋„๊ฒ€๊ฑฐ์œจ","์‚ด์ธ๊ฒ€๊ฑฐ์œจ","์ ˆ๋„๊ฒ€๊ฑฐ์œจ","ํญ๋ ฅ๊ฒ€๊ฑฐ์œจ"]
    crime_anal_norm[col2] = crime_anal_gu[col2]
    crime_anal_gu์˜ ๊ฒ€๊ฑฐ์œจ ์ปฌ๋Ÿผ crime_anal_norm์— ์ถ”๊ฐ€



  • result_CCTV = pd.read_csv("../data/01. CCTV_result.csv",index_col = "๊ตฌ๋ณ„", encoding= "utf-8")
    crime_anal_norm[["์ธ๊ตฌ์ˆ˜","CCTV"]] = result_CCTV[["์ธ๊ตฌ์ˆ˜","์†Œ๊ณ„"]]
    구별 CCTV 자료에서 인구수와 CCTV 수 추가



  • col = ["๊ฐ•๊ฐ„","๊ฐ•๋„","์‚ด์ธ","์ ˆ๋„","ํญ๋ ฅ"]
    crime_anal_norm["๋ฒ”์ฃ„"] = np.mean(crime_anal_norm[col], axis = 1)
    ๊ฐ ๊ตฌ๋ณ„ ๋ฒ”์ฃ„ ํ‰๊ท ๊ฐ’ ์ปฌ๋Ÿผ ์ถ”๊ฐ€
  • col2 = ["๊ฐ•๊ฐ„๊ฒ€๊ฑฐ์œจ","๊ฐ•๋„๊ฒ€๊ฑฐ์œจ","์‚ด์ธ๊ฒ€๊ฑฐ์œจ","์ ˆ๋„๊ฒ€๊ฑฐ์œจ","ํญ๋ ฅ๊ฒ€๊ฑฐ์œจ"]
    crime_anal_norm["๊ฒ€๊ฑฐ"] = np.mean(crime_anal_norm[col2],axis = 1)
    ๊ฐ ๊ตฌ๋ณ„ ๊ฒ€๊ฑฐ์œจ ํ‰๊ท ๊ฐ’ ์ปฌ๋Ÿผ ์ถ”๊ฐ€

np.mean : 평균값

# Demo of averaging a 2x5 array: axis=1 averages across each row and yields one
# value per row, whereas axis=0 would average each column. The original note
# points out that this reads the opposite way from the axis argument of drop.
np.array(
    [
        [1.000000, 1.000000, 0.357143, 0.977118, 0.733773],
        [0.310078, 0.358974, 0.285714, 0.477799, 0.463880],
    ]
).mean(axis=1)



๋ฐ์ดํ„ฐ ์‹œ๊ฐํ™”


pairplot

1. 강도, 살인, 폭력에 대한 상관관계 확인

# Pairwise regression plots between three columns of crime_anal_norm.
# NOTE(review): the column-name literals look mis-encoded in this copy;
# presumably murder / robbery / violence - confirm against the frame's columns.
pairplot_options = {
    "vars": ["์‚ด์ธ","๊ฐ•๋„","ํญ๋ ฅ"],
    "kind": "reg",
    "height": 3,
}
sns.pairplot(crime_anal_norm, **pairplot_options)

2. ์ธ๊ตฌ์ˆ˜, CCTV์™€ ์‚ด์ธ, ๊ฐ•๋„์˜ ์ƒ๊ด€๊ด€๊ณ„ ํ™•์ธ

def drawGraph():
    """Draw a 2x2 grid of regression plots from crime_anal_norm and show it.

    The x axes use the two columns listed under ``x_vars`` and the y axes the
    two columns listed under ``y_vars``.
    """
    # NOTE(review): the non-ASCII column names look mis-encoded in this copy;
    # presumably population on x and murder / robbery on y - confirm.
    grid_spec = {
        "x_vars": ["์ธ๊ตฌ์ˆ˜","CCTV"],
        "y_vars": ["์‚ด์ธ","๊ฐ•๋„"],
        "kind": "reg",
        "height": 4,
    }
    sns.pairplot(crime_anal_norm, **grid_spec)
    plt.show()


drawGraph()

3. ์ธ๊ตฌ์ˆ˜ cctv์™€ ์‚ด์ธ๊ฒ€๊ฑฐ์œจ, ํญ๋ ฅ๊ฒ€๊ฑฐ์œจ์˜ ์ƒ๊ด€๊ด€๊ณ„ํ™•์ธ

def drawGraph():
    """Draw a 2x2 grid of regression plots from crime_anal_norm and show it.

    The x axes use the two columns listed under ``x_vars`` and the y axes the
    two columns listed under ``y_vars``.
    """
    # NOTE(review): the non-ASCII column names look mis-encoded in this copy;
    # presumably population on x and murder / violence arrest rates on y.
    grid_spec = {
        "x_vars": ["์ธ๊ตฌ์ˆ˜","CCTV"],
        "y_vars": ["์‚ด์ธ๊ฒ€๊ฑฐ์œจ","ํญ๋ ฅ๊ฒ€๊ฑฐ์œจ"],
        "kind": "reg",
        "height": 4,
    }
    sns.pairplot(crime_anal_norm, **grid_spec)
    plt.show()


drawGraph()

4. ์ธ๊ตฌ์ˆ˜ cctv์™€ ์ ˆ๋„๊ฒ€๊ฑฐ์œจ, ๊ฐ•๋„๊ฒ€๊ฑฐ์œจ์˜ ์ƒ๊ด€๊ด€๊ณ„ํ™•์ธ

def drawGraph():
    """Draw a 2x2 grid of regression plots from crime_anal_norm and show it.

    The x axes use the two columns listed under ``x_vars`` and the y axes the
    two columns listed under ``y_vars``.
    """
    # NOTE(review): the non-ASCII column names look mis-encoded in this copy;
    # presumably population on x and theft / robbery arrest rates on y.
    grid_spec = {
        "x_vars": ["์ธ๊ตฌ์ˆ˜","CCTV"],
        "y_vars": ["์ ˆ๋„๊ฒ€๊ฑฐ์œจ","๊ฐ•๋„๊ฒ€๊ฑฐ์œจ"],
        "kind": "reg",
        "height": 4,
    }
    sns.pairplot(crime_anal_norm, **grid_spec)
    plt.show()


drawGraph()



heatmap

1. 검거율 heatmap

def drawGraph():
    """Show an annotated heatmap of five crime_anal_norm columns.

    Rows are ordered by the sort column in descending order before plotting.
    """
    # NOTE(review): the non-ASCII literals look mis-encoded in this copy;
    # presumably the five arrest-rate columns, sorted by the "검거" column.
    target_column = ["๊ฐ•๊ฐ„๊ฒ€๊ฑฐ์œจ", "๊ฐ•๋„๊ฒ€๊ฑฐ์œจ", "์‚ด์ธ๊ฒ€๊ฑฐ์œจ", "์ ˆ๋„๊ฒ€๊ฑฐ์œจ", "ํญ๋ ฅ๊ฒ€๊ฑฐ์œจ"]

    # Largest values of the sort column come first.
    ordered_rows = crime_anal_norm.sort_values("๊ฒ€๊ฑฐ", ascending=False)
    heatmap_values = ordered_rows[target_column]

    plt.figure(figsize=(10, 10))
    sns.heatmap(
        heatmap_values,
        annot=True,    # write each cell's value into the cell
        fmt="f",       # format the annotations as floats
        linewidths=5,  # gap between cells
        cmap="RdPu",
    )
    plt.title("๋ฒ”์ฃ„๊ฒ€๊ฑฐ๋น„์œจ (์ •๊ทœํ™”๋œ ๊ฒ€๊ฑฐ์˜ ํ•ฉ์œผ๋กœ ์ •๋ ฌ)")
    plt.show()


drawGraph()

2. 범죄발생건수 heatmap

def drawGraph():
    """Show an annotated heatmap of five crime_anal_norm columns.

    Rows are ordered by the sort column in descending order before plotting.
    """
    # NOTE(review): the non-ASCII literals look mis-encoded in this copy;
    # presumably the five crime-count columns, sorted by the "범죄" column.
    target_column = ["๊ฐ•๊ฐ„", "๊ฐ•๋„", "์‚ด์ธ", "์ ˆ๋„", "ํญ๋ ฅ"]

    # Largest values of the sort column come first.
    ordered_rows = crime_anal_norm.sort_values("๋ฒ”์ฃ„", ascending=False)
    heatmap_values = ordered_rows[target_column]

    plt.figure(figsize=(10, 10))
    sns.heatmap(
        heatmap_values,
        annot=True,    # write each cell's value into the cell
        fmt="f",       # format the annotations as floats
        linewidths=5,  # gap between cells
        cmap="RdPu",
    )
    # NOTE(review): the parenthesised part of this title is identical to the
    # title of the arrest-rate heatmap although the sort column differs here;
    # it looks copied over - confirm the intended wording.
    plt.title("๋ฒ”์ฃ„ ๋น„์œจ (์ •๊ทœํ™”๋œ ๊ฒ€๊ฑฐ์˜ ํ•ฉ์œผ๋กœ ์ •๋ ฌ)")
    plt.show()


drawGraph()



์ง€๋„์‹œ๊ฐํ™”



Zero Base 데이터분석 스쿨
Daily Study Note

0개의 댓글