EDA - 04. Oil Price Analysis and Green construction (w/selenium)

์†”๋น„ยท2024๋…„ 1์›” 8์ผ
0
post-thumbnail

mini project


๐ŸŒŸ ๋ชฉํ‘œ | ์„œ์šธ์‹œ ์ง€์—ญ๋ณ„, ๋ธŒ๋žœ๋“œ๋ณ„ ์ฃผ์œ ๊ฐ€๊ฒฉ ๋ถ„์„ ๋ฐ ์‹œ๊ฐํ™”
๐ŸŒŸ ํฌ๋กค๋ง ์ถœ์ฒ˜




1. ๋ฐ์ดํ„ฐํฌ๋กค๋ง


๋ชฉํ‘œ๋ฐ์ดํ„ฐ

  • ์„œ์šธ์‹œ ๊ฐ ๊ตฌ๋ณ„ ๋ฐ์ดํ„ฐ(๋ธŒ๋žœ๋“œ, ๊ฐ€๊ฒฉ, ์…€ํ”„ ์ฃผ์œ ์—ฌ๋ถ€, ์œ„์น˜) ์—‘์…€ํŒŒ์ผ



from selenium import webdriver

  • selenium ์„ค์น˜

์ง€์—ญ - ์‹œ ์ ‘๊ทผ


url = "https://www.opinet.co.kr/searRgSelect.do"
driver = webdriver.Chrome("../driver/chromedriver.exe")
driver.get(url)
  • ํŽ˜์ด์ง€ ์ ‘๊ทผ

from selenium.webdriver.common.by import By


sido_list_raw = driver.find_element(By.ID, "SIDO_NM0")

  • ์‹œ๋„ ์„ ํƒ HTML ๋ž€ ์ ‘๊ทผ

sido_list = sido_list_raw.find_elements(By.TAG_NAME,"option")

  • ์‹œ๋„ ์˜ต์…˜๋ž€ ์ ‘๊ทผ

sido_names = []

for option in sido_list :
    sido_names.append(option.get_attribute("value"))
  • text์™€ value ๊ฐ’์ด ๋‹ค๋ฅด๋ฏ€๋กœ get_attribute๋ฅผ ์ด์šฉํ•ด value๊ฐ’์„ ๊ฐ€์ ธ์™€ sido_name ๋ฆฌ์ŠคํŠธ์— append

sido_names = sido_names[1:]

  • ์‹œ๋„์˜ value๊ฐ’์ด ๊ณต๋ž€์ด๋ฏ€๋กœ ์Šฌ๋ผ์ด์‹ฑ์œผ๋กœ ์ž˜๋ผ์คŒ

sido_list_raw.send_keys(sido_names[0])

  • ์„œ์šธํŠน๋ณ„์‹œ์— ํ•ด๋‹นํ•˜๋Š” sido_names[0] ๊ฐ’ ์ž…๋ ฅ



์ง€์—ญ - ๊ตฌ ์ ‘๊ทผ


gu_list_raw = driver.find_element(By.ID, "SIGUNGU_NM0") #๋ถ€๋ชจํƒœ๊ทธ
gu_list = gu_list_raw.find_elements(By.TAG_NAME,"option") #์ž์‹ํƒœ๊ทธ
gu_name = [(option.text) for option in gu_list ]
gu_name = gu_name[1:]
  • ๊ตฌ์˜ ๊ฒฝ์šฐ option์˜ text์™€ value๊ฐ€ ๋™์ผํ•˜๋ฏ€๋กœ get_attrubute๋ฅผ ๋”ฐ๋กœ ์‚ฌ์šฉํ•˜์ง€ ์•Š๊ณ  ๋ฆฌ์ŠคํŠธ์— ๋‹ด์Œ

# ์„œ์šธ 25๊ฐœ ๊ตฌ ์ฃผ์œ ์†Œ ๋ฐ์ดํ„ฐ ๋‹ค์šด๋กœ๋“œ
import time
from tqdm import tqdm_notebook

for gu in tqdm_notebook(gu_name) :
	# ๊ตฌ ์ด๋ฆ„ ๋ณด๋‚ด๊ธฐ
    driver.find_element(By.ID, "SIGUNGU_NM0").send_keys(gu)
    time.sleep(2)

   # ์—‘์…€ ๋‹ค์šด๋ž€ ํด๋ฆญ
	driver.find_element(By.ID,"glopopd_excel").click()
    time.sleep(2)
  • ๊ตฌ ์ด๋ฆ„ ๋ณด๋‚ด๊ธฐ -> ์—‘์…€๋‹ค์šด
    ์„œ์šธ ์‹œ ์ „ ๊ตฌ์— ๋Œ€ํ•œ ๋ฐ์ดํ„ฐ์ˆ˜์ง‘

driver.close()

  • ์ฐฝ ๋‹ซ๊ธฐ




2. ๋ฐ์ดํ„ฐ์ •๋ฆฌ


๋ชฉํ‘œ
ํฌ๋กค๋ง์œผ๋กœ ๋‹ค์šด๋กœ๋“œ ๋ฐ›์€ ๋ฐ์ดํ„ฐ ํ•œ Data Frame์— ์ •๋ฆฌ

import pandas as pd
from glob import glob



stations_files = glob("../data/์ง€์—ญ_*.xls")

  • ํŒŒ์ผ ๋ชฉ๋ก ํ•œ๋ฒˆ์— ๊ฐ€์ ธ์™€์„œ ๋ณ€์ˆ˜์— ์ €์žฅ

tmp_raw = []

for file in stations_files :
    tmp = pd.read_excel(file, header=2)
    tmp_raw.append(tmp)

tmp_raw
  • for๋ฌธ์œผ๋กœ ํŒŒ์ผ ํ•˜๋‚˜ํ•˜๋‚˜ excel๋กœ ๋ถˆ๋Ÿฌ์˜จ ํ›„ tmp_raw์— ๋ฆฌ์ŠคํŠธํ˜•ํƒœ๋กœ ๋‹ด๊ธฐ

station_raw = pd.concat(tmp_raw)

  • concat์œผ๋กœ Data Frame ํ˜•ํƒœ๋กœ ๋ถ™ํžˆ๊ธฐ

๐ŸŒŸ concat : ํ˜•์‹์ด ๋™์ผํ•˜๊ณ  ์—ฐ๋‹ฌ์•„ ๋ถ™์ด๊ธฐ๋งŒ ํ•  ๋•Œ ์‚ฌ์šฉ

โžก๏ธ index๋ฒˆํ˜ธ reset ํ•„์š”


station = pd.DataFrame({
    "์ƒํ˜ธ" : station_raw["์ƒํ˜ธ"],
    "์ฃผ์†Œ" : station_raw["์ฃผ์†Œ"],
    "๊ฐ€๊ฒฉ" : station_raw["ํœ˜๋ฐœ์œ "],
    "์…€ํ”„" : station_raw["์…€ํ”„์—ฌ๋ถ€"],
    "์ƒํ‘œ" : station_raw["์ƒํ‘œ"]
})
  • ํ•„์š”๋ฐ์ดํ„ฐ๋งŒ station์— ๋‹ด๊ธฐ

station["๊ตฌ"] = [ each_address.split()[1] for each_address in station["์ฃผ์†Œ"]]

  • ์ฃผ์†Œ์—์„œ split์œผ๋กœ ์ž๋ฅธ ํ›„ ๊ตฌ์— ํ•ด๋‹นํ•˜๋Š” ์ •๋ณด๋งŒ "๊ตฌ" ์ปฌ๋Ÿผ ์ƒ์„ฑ ํ›„ ๋‹ด๊ธฐ

station = station[station["๊ฐ€๊ฒฉ"] != "-"]

  • ๊ฐ€๊ฒฉ์—†๋Š” ์ฃผ์œ ์†Œ ์‚ญ์ œ

station["๊ฐ€๊ฒฉ"] = station["๊ฐ€๊ฒฉ"].astype("float")

  • ๊ฐ€๊ฒฉ๋ฐ์ดํ„ฐ floatํ˜•ํƒœ๋กœ ๋ณ€๊ฒฝ

station.reset_index(inplace=True)
del station["index"]

  • ์ธ๋ฑ์Šค ์žฌ์ •๋น„




3. ๋ฐ์ดํ„ฐ ์‹œ๊ฐํ™”


from matplotlib import font_manager as fm
from matplotlib import pyplot as plt

#ํ•œ๊ธ€ํฐํŠธ ๊นจ์ง ํ•ด๊ฒฐ
get_ipython().run_line_magic("matplotlib", "inline")
plt.rc('font', family = "Malgun Gothic")

#๋งˆ์ด๋„ˆ์Šค๋ถ€ํ˜ธ ๊นจ์ง ํ•ด๊ฒฐ
import matplotlib as mpl
mpl.rcParams['axes.unicode_minus'] = False
  • ํ•œ๊ธ€ ๊นจ์ง ํ˜„์ƒ ํ•ด๊ฒฐ



- ์…€ํ”„ ์ฃผ์œ ์†Œ์™€ ์ผ๋ฐ˜ ์ฃผ์œ ์†Œ ๊ฐ€๊ฒฉ์ฐจ์ด ver/pandas

station.boxplot(column="๊ฐ€๊ฒฉ", by = "์…€ํ”„", figsize=(12,8))

- ์…€ํ”„ ์ฃผ์œ ์†Œ์™€ ์ผ๋ฐ˜ ์ฃผ์œ ์†Œ ๊ฐ€๊ฒฉ์ฐจ์ด ver/seaborn

plt.figure(figsize=(12,8))
sns.boxplot( x = "์…€ํ”„" , y = "๊ฐ€๊ฒฉ", data = station, palette= "Set3" )

- ๋ฉ”์ด์ปค๋ณ„ ์…€ํ”„ ์ฃผ์œ ์†Œ์™€ ์ผ๋ฐ˜ ์ฃผ์œ ์†Œ ๊ฐ€๊ฒฉ์ฐจ์ด

plt.figure(figsize=(12,8))
sns.boxplot( x= "์ƒํ‘œ", y = "๊ฐ€๊ฒฉ", hue = "์…€ํ”„", data=station, palette = "Set3" )

- ๊ตฌ๋ณ„ ๊ฐ€๊ฒฉ ์ง€๋„์‹œ๊ฐํ™”

gu_data = pd.pivot_table ( data=station, index="๊ตฌ", values="๊ฐ€๊ฒฉ", aggfunc=np.mean )

  • ๋ฐ์ดํ„ฐ ๊ตฌ๋ณ„ ํ”ผ๋ด‡ํŒ…
geo_path = "../data/02. skorea_municipalities_geo_simple.json"
geo_str = json.load(open(geo_path,encoding="utf-8"))

my_map = folium.Map(location=[37.5502,126.982], zoom_start=10, tiles="CartoDB Positron")
folium.Choropleth(
    geo_data = geo_str,
    data = gu_data,
    columns = [gu_data.index, "๊ฐ€๊ฒฉ"],
    key_on = "feature.id",
    fill_color = "PuRd"
).add_to(my_map)

my_map

์—…๋กœ๋“œ์ค‘..

์ „์ฒด์ฝ”๋“œ


Zero Base ๋ฐ์ดํ„ฐ๋ถ„์„ ์Šค์ฟจ
Daily Study Note

0๊ฐœ์˜ ๋Œ“๊ธ€