셀프 주유소가 정말 저렴한가?
from selenium import webdriver
from selenium.webdriver.common.by import By
# Opinet page used to look up gas stations by region.
url = 'https://www.opinet.co.kr/searRgSelect.do'

# Start Chrome through the chromedriver binary stored under ./driver.
# NOTE(review): handing the driver path over positionally is sensitive to the
# installed Selenium version -- confirm it matches the environment in use.
chromedriver_path = './driver/chromedriver.exe'
driver = webdriver.Chrome(chromedriver_path)

# First navigation attempt to the search page.
driver.get(url)
- 문제
- 해당 URL로 한 번에 접근이 안됨
- 메인페이지로 접속이 되고, 팝업창이 나온다.
# The first request ends up on the main page with a pop-up window, so move
# the driver's focus to the most recently opened window handle and request
# the search page a second time.
# `switch_to.window(...)` is used instead of the legacy `switch_to_window(...)`
# alias, which is not available in current Selenium releases.
driver.switch_to.window(driver.window_handles[-1])
driver.get(url)
# Look up the province/city (sido) drop-down through its CSS id selector.
sido_list_row = driver.find_element(By.CSS_SELECTOR, '#SIDO_NM0')
sido_list_row.text  # notebook preview of the drop-down's text

# Each <option> tag below the drop-down is one selectable entry.
sido_list = sido_list_row.find_elements(By.TAG_NAME, 'option')
second_option = sido_list[1]
second_option.text
second_option.get_attribute('value')

# Collect the `value` attribute of every option, then discard the first one.
# NOTE(review): the first option is presumably an empty placeholder -- confirm.
sido_names = [opt.get_attribute('value') for opt in sido_list]
sido_names
sido_names = sido_names[1:]
sido_names

# Typing a name into the drop-down selects the matching entry.
first_sido = sido_names[0]
sido_list_row.send_keys(first_sido)
# Same procedure for the district (gu) drop-down.
gu_list_row = driver.find_element(By.ID, 'SIGUNGU_NM0')
gu_list = gu_list_row.find_elements(By.TAG_NAME, 'option')

# Gather every option value, then skip the first entry.
gu_names = [opt.get_attribute('value') for opt in gu_list]
gu_names = gu_names[1:]
gu_names, len(gu_names)

# Try a single district by hand: select it and click the export control.
# NOTE(review): '#glopopd_excel > span' presumably starts the Excel
# download -- confirm on the live page.
trial_gu = gu_names[1]
gu_list_row.send_keys(trial_gu)
excel_button = driver.find_element(By.CSS_SELECTOR, '#glopopd_excel > span')
excel_button.click()
import time
from tqdm import tqdm_notebook
# Walk through every district, select it in the drop-down and click the
# export control once per district. The loop body is indented so that the
# statement parses as a loop.
for gu in tqdm_notebook(gu_names):
    # Look the drop-down up again on every pass before typing into it.
    element = driver.find_element(By.ID, 'SIGUNGU_NM0')
    element.send_keys(gu)
    # Fixed pause -- presumably gives the page time to refresh its results.
    time.sleep(3)

    # `.click()` returns None, so its result is not stored.
    driver.find_element(By.CSS_SELECTOR, '#glopopd_excel > span').click()
    # Fixed pause -- presumably lets the download finish before moving on.
    time.sleep(3)
데이터 정리하기
import pandas as pd
from glob import glob
# Collect every exported spreadsheet matching the download naming pattern.
stations_files = glob('./data/지역_*.xls')
stations_files  # notebook preview of the matched paths

# Preview the first file; header=2 takes the column names from the third row.
first_file = stations_files[0]
tmp = pd.read_excel(first_file, header=2)
tmp.head()
# Read every exported spreadsheet (column names on the third row, header=2)
# and stack the resulting frames into one table. The loop body is indented
# so that the statement parses as a loop.
tmp_raw = []
for file_name in stations_files:
    tmp = pd.read_excel(file_name, header=2)
    tmp_raw.append(tmp)

# Each frame keeps its own row labels here, so labels repeat across files.
station_raw = pd.concat(tmp_raw)
station_raw
station_raw.info()
# Output column name -> column of the raw export it is taken from.
column_sources = {
    '상호': '상호',
    '주소': '주소',
    '가격': '휘발유',
    '셀프': '셀프여부',
    '상표': '상표',
}

# Build the working table from only the columns needed for the analysis.
stations = pd.DataFrame(
    {target: station_raw[source] for target, source in column_sources.items()}
)
stations.tail()
stations['주소']
# Print the second whitespace-separated token of every address as a quick
# visual check. The loop body is indented so the statement parses as a loop.
# NOTE(review): assumes each address is a string with at least two tokens and
# that the second token is the district name -- confirm against the data.
for eachAddress in stations['주소']:
    print(eachAddress.split()[1])

# Store that token as the district ('구') column.
stations['구'] = [eachAddress.split()[1] for eachAddress in stations['주소']]
stations
# Replace the row labels with a fresh 0..n-1 range. `drop=True` discards the
# old labels directly instead of inserting them as an 'index' column that
# then has to be deleted again.
stations.reset_index(drop=True, inplace=True)
stations.head()
주유 가격 정보 시각화
import matplotlib.pyplot as plt
import seaborn as sns
import platform
from matplotlib import font_manager, rc
%matplotlib inline
# Use a font with Korean glyphs so the Korean labels render in the figures.
rc('font', family='NanumSquare Neo OTF Regular')

# pandas box plot of price grouped by the self-service flag.
stations.boxplot(column='가격', by='셀프', figsize=(12,8));

# Two seaborn box plots drawn the same way: price by self-service flag, then
# price by brand split by the self-service flag.
for boxplot_kwargs in (
    {'x': '셀프', 'y': '가격', 'data': stations, 'palette': 'Set2'},
    {'x': '상표', 'y': '가격', 'hue': '셀프', 'data': stations, 'palette': 'Set3'},
):
    plt.figure(figsize=(12,8))
    sns.boxplot(**boxplot_kwargs)
    plt.grid(True)
    plt.show()
import json
import folium
import warnings
# Silence FutureWarning messages for the remainder of the session.
warnings.simplefilter(action='ignore', category=FutureWarning)

# Ten most expensive stations, then the ten cheapest ones.
top_n = 10
priciest_first = stations.sort_values(by='가격', ascending=False)
priciest_first.head(top_n)
cheapest_first = stations.sort_values(by='가격')
cheapest_first.head(top_n)
import numpy as np
# Average price per district. The aggregation is named by the string 'mean'
# rather than by passing `np.mean`; recent pandas releases emit a
# FutureWarning for the callable form, and both spellings compute the same
# group mean.
# NOTE(review): assumes the '가격' column is numeric -- confirm, since this
# code does not convert it.
gu_data = pd.pivot_table(data=stations, index='구', values='가격', aggfunc='mean')
gu_data.head()
# Load the municipality boundary GeoJSON. The `with` block closes the file
# handle as soon as parsing finishes instead of leaving it open.
geo_path = './data/02. skorea_municipalities_geo_simple.json'
with open(geo_path, encoding='utf-8') as geo_file:
    geo_str = json.load(geo_file)
# Base map centred on the given latitude/longitude.
# The keyword is spelled `zoom_start`; the earlier spelling `zoome_start` is
# not the zoom option, so the requested zoom level never took effect.
# NOTE(review): the built-in 'Stamen Toner' tile set may not be available in
# every folium release -- confirm against the installed version.
my_map = folium.Map(location=[37.5502, 126.982], zoom_start=10.5, tiles='Stamen Toner')
my_map
# Colour each district by its average price.
# The `folium.Choropleth` layer is added explicitly because the
# `Map.choropleth(...)` shortcut is deprecated and missing from newer folium
# releases; the layer takes the same arguments.
folium.Choropleth(
    geo_data=geo_str,
    data=gu_data,
    # Key column (the district index) followed by the value column.
    columns=[gu_data.index, '가격'],
    # Match each district against the `id` field of the GeoJSON features.
    key_on='feature.id',
    fill_color='PuRd',
).add_to(my_map)
my_map