Check the site structure
Conclusion
from selenium import webdriver
from bs4 import BeautifulSoup
from urllib.request import urlopen
from tqdm import tqdm_notebook
from matplotlib import font_manager as fm
from matplotlib import rc
import time
import pandas as pd
import googlemaps
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
# Access the page
url = "https://www.opinet.co.kr/searRgSelect.do"
driver = webdriver.Chrome("../driver/chromedriver")  # on Windows, append .exe to the driver path
driver.get(url)
time.sleep(3)
# Switch to the popup window
driver.switch_to.window(driver.window_handles[-1])
# Close the popup window
driver.close()
time.sleep(3)
# Switch back to the main window
driver.switch_to.window(driver.window_handles[-1])
# Request the target URL again
driver.get(url)
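The element lookups in this notebook use the Selenium 3 find_element_by_* style. On a Selenium 4 install those methods no longer exist (and the driver construction above would also need a Service object, not shown here); a minimal equivalent sketch for the same dropdowns, assuming Selenium 4, would be:
from selenium.webdriver.common.by import By

# Selenium 4 style lookups for the same element IDs
sido_select = driver.find_element(By.ID, "SIDO_NM0")
gu_select = driver.find_element(By.ID, "SIGUNGU_NM0")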
# Select 서울 in the sido (province) dropdown
sido_select = driver.find_element_by_id("SIDO_NM0")
sido_select.send_keys("서울")

# Collect the district (gu) names from the sigungu dropdown
gu_select = driver.find_element_by_id("SIGUNGU_NM0")
gu_list = gu_select.find_elements_by_tag_name("option")
gu_names = [item.get_attribute("value") for item in gu_list]
gu_names = gu_names[1:]  # drop the first placeholder option
gu_names
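Seoul has 25 districts (gu), so a quick sanity check on the scraped list can catch a stale dropdown. A sketch, assuming the placeholder option was already dropped above:
assert len(gu_names) == 25, f"expected 25 districts, got {len(gu_names)}"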
# Parse the page once to inspect the station table structure
html = driver.page_source
soup = BeautifulSoup(html, 'html.parser')
oil_info = []
for gu_name in tqdm_notebook(gu_names):
    driver.find_element_by_id("SIGUNGU_NM0").send_keys(gu_name)
    time.sleep(1)  # wait for the station table to refresh

    req = driver.page_source
    soup = BeautifulSoup(req, "html.parser")
    info = soup.find(id="body1").find_all("tr")

    for item in info:
        name = item.find("a").text.strip()
        brand = item.find("img")["alt"]
        address = item.find("a")["href"].split(",")[-11].replace("'", "")
        gasoline_cost = item.find_all("td", class_="price")[0].text.strip()
        diesel_cost = item.find_all("td", class_="price")[1].text.strip()

        # Check whether the station is self-service ("셀프")
        if item.find("span", class_="ico") is not None:
            if item.find("span", class_="ico").text == "셀프":
                is_self = "Y"
            else:
                is_self = "N"
        else:
            is_self = "N"

        gu = address.split(" ")[1]

        data = {
            "name": name,
            "gu": gu,
            "brand": brand,
            "address": address,
            "gasoline": gasoline_cost,
            "diesel": diesel_cost,
            "self": is_self
        }
        oil_info.append(data)
len(oil_info)
df_final["gasoline"] = df_final["gasoline"].astype("float")
df_final["diesel"] = df_final["diesel"].astype("float")
df_oil = pd.DataFrame(oil_info)
df_oil.tail()
wash_info = []
driver.find_element_by_id("CWSH_YN").click()
for gu_name in tqdm_notebook(gu_names):
    driver.find_element_by_id("SIGUNGU_NM0").send_keys(gu_name)
    time.sleep(1)  # wait for the station table to refresh

    req = driver.page_source
    soup = BeautifulSoup(req, "html.parser")
    info = soup.find(id="body1").find_all("tr")

    for item in info:
        address = item.find("a")["href"].split(",")[-11].replace("'", "")
        wash = "Y"
        data = {
            "address": address,
            "wash": wash
        }
        wash_info.append(data)
len(wash_info)
df_wash = pd.DataFrame(wash_info)
df_wash.tail()
df_final = pd.merge(df_oil, df_wash, on = "address", how = 'left')
df_final = df_final.fillna("N")
df_final.tail(5)
fix_info = []
driver.find_element_by_id("CWSH_YN").click()
time.sleep(1)
driver.find_element_by_id("MAINT_YN").click()
for gu_name in tqdm_notebook(gu_names):
    driver.find_element_by_id("SIGUNGU_NM0").send_keys(gu_name)
    time.sleep(1)  # wait for the station table to refresh

    req = driver.page_source
    soup = BeautifulSoup(req, "html.parser")
    info_find = soup.find(id="body1")
    if info_find is None:
        continue

    info = info_find.find_all("tr")
    for item in info:
        address = item.find("a")["href"].split(",")[-11].replace("'", "")
        fix = "Y"
        data = {
            "address": address,
            "maintenance": fix
        }
        fix_info.append(data)
len(fix_info)
df_fix = pd.DataFrame(fix_info)
df_fix.tail()
df_final = pd.merge(df_final, df_fix, on = "address", how = 'left')
df_final = df_final.fillna("N")
df_final.tail(5)
driver.find_element_by_id("LPG_BTN").click()
driver.find_element_by_id("SIDO_NM0").send_keys("서울")
charge_list = []
for gu_name in tqdm_notebook(gu_names):
    driver.find_element_by_id("SIGUNGU_NM0").send_keys(gu_name)
    time.sleep(1)  # wait for the station table to refresh

    req = driver.page_source
    soup = BeautifulSoup(req, "html.parser")
    info = soup.find(id="body1")
    if info is None:
        continue

    info_find = info.find_all("tr")
    for item in info_find:
        address = item.find("a")["href"].split(",")[-11].replace("'", "")
        charge = "Y"
        data = {
            "address": address,
            "charger": charge
        }
        charge_list.append(data)
len(charge_list)
df_charge = pd.DataFrame(charge_list)
df_charge.tail()
df_final = pd.merge(df_final, df_charge, on = "address", how = 'left')
df_final = df_final.fillna("N")
df_final.head()
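Before geocoding, it can help to confirm that the merged Y/N service flags look reasonable. A quick check using the columns built above:
# Count Y/N values for each service flag produced by the merges
for col in ["self", "wash", "maintenance", "charger"]:
    print(col, df_final[col].value_counts().to_dict())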
google_maps_key = "YOUR_GOOGLE_MAPS_API_KEY"  # replace with your own Google Maps API key
gmaps = googlemaps.Client(key=google_maps_key)
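Before looping over every station, a single-address smoke test confirms the client and key work. A sketch; the printed fields follow the standard Geocoding API response shape:
# Geocode one address as a smoke test before the full loop
tmp = gmaps.geocode(df_final["address"].iloc[0], language="ko")
if tmp:
    print(tmp[0]["formatted_address"], tmp[0]["geometry"]["location"])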
df_final["lat"] = np.nan
df_final["lng"] = np.nan
df_final.tail()
for idx, rows in tqdm_notebook(df_final.iterrows()):
    tmp = gmaps.geocode(rows["address"], language="ko")
    if tmp:
        lat = tmp[0].get("geometry")["location"]["lat"]
        lng = tmp[0].get("geometry")["location"]["lng"]
        df_final.loc[idx, "lat"] = lat
        df_final.loc[idx, "lng"] = lng
    else:
        print(idx, rows["address"])  # report addresses that failed to geocode
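Geocoding calls can also fail transiently on network errors or quota limits. A hedged retry helper (geocode_with_retry is an addition, not part of the original flow) that could wrap the gmaps.geocode call above:
def geocode_with_retry(address, retries=3, wait=2):
    # Retry the geocode call a few times before giving up (uses gmaps and time from the cells above)
    for attempt in range(retries):
        try:
            result = gmaps.geocode(address, language="ko")
            if result:
                return result
        except Exception as exc:
            print(f"geocode attempt {attempt + 1} failed for {address!r}: {exc}")
        time.sleep(wait)
    return []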
df_final.info()
df_final.tail()
df_final.to_csv("../data/seoul_gas_station_data.csv", sep=",", encoding="utf-8")
df_final_csv = pd.read_csv("../data/seoul_gas_station_data.csv", encoding="utf-8", index_col=0)
df_final_csv.tail()
import matplotlib.pyplot as plt
import seaborn as sns
import platform
from matplotlib import font_manager, rc
get_ipython().run_line_magic("matplotlib", "inline")
# %matplotlib inline
path = "C;/Windows/Fonts/malgun.ttf"
if platform.system() == "Darwin":
rc("font", family = "Arial Unicode MS")
elif platform.system == "Windows":
font_name = font_mananger.Fontproperties(fname=path).get_name()
rc("font", family = "family=font_name")
else:
print("Unknown system. sorry~")
# boxplot(feat. pandas)
df_final.boxplot(column=["gasoline", "diesel"], by="self", figsize=(12, 8))
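The same self-service comparison in numbers, a quick sketch using the columns built above:
# Mean prices grouped by the self-service flag
print(df_final.groupby("self")[["gasoline", "diesel"]].mean().round(1))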
# boxplot(feat. seaborn)
plt.figure(figsize=(12, 8))
sns.boxplot(x="self", y="gasoline", data=df_final, palette="Set3")
plt.grid(True)
plt.show()
# boxplot(feat. seaborn)
plt.figure(figsize=(12, 8))
sns.boxplot(x="self", y="diesel", data=df_final, palette="Set3")
plt.grid(True)
plt.show()
# boxplot(feat. seaborn)
plt.figure(figsize=(12, 8))
sns.boxplot(x="brand", y="gasoline", hue="self", data=df_final, palette="Set3")
plt.grid(True)
plt.show()
# boxplot(feat. seaborn)
plt.figure(figsize=(12, 8))
sns.boxplot(x="brand", y="diesel", hue="self", data=df_final, palette="Set3")
plt.grid(True)
plt.show()
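To complement the brand-level boxplots with figures, a small pivot sketch (the mean aggregation is an added choice, not part of the original flow):
# Mean gasoline price by brand, split by self-service
print(df_final.pivot_table(index="brand", columns="self", values="gasoline", aggfunc="mean").round(1))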