셀프 주유소 가격 비교 EDA

밤비나·2023년 4월 5일

목록 보기

7/7

셀프 주유소가 정말 저렴한가?

https://www.opinet.co.kr/searRgSelect.do
목표 데이터
- 브랜드
- 가격
- 셀프 주유 여부
- 위치

from selenium import webdriver
# `By` provides the locator strategies (By.ID, By.CSS_SELECTOR, By.TAG_NAME)
# that the find_element / find_elements calls below rely on; without this
# import those calls fail with NameError.
from selenium.webdriver.common.by import By

# Open the target page in a Chrome session driven by the local chromedriver.
url = 'https://www.opinet.co.kr/searRgSelect.do'
driver = webdriver.Chrome('./driver/chromedriver.exe')
driver.get(url)

문제
- 해당 URL로 한 번에 접근이 안 됨
- 메인페이지로 접속이 되고, 팝업창이 나온다.

# Move the driver's focus to the most recently opened window (the pop-up).
# `driver.switch_to_window(...)` is deprecated and no longer exists in
# Selenium 4; `driver.switch_to.window(...)` is the supported spelling.
# Note: this only changes focus, the pop-up window itself is not closed.
driver.switch_to.window(driver.window_handles[-1])

# Request the target page again now that the first redirect has happened.
driver.get(url)

# Region: province / metropolitan city (si/do) dropdown
sido_list_row = driver.find_element(By.CSS_SELECTOR,'#SIDO_NM0')
sido_list_row.text

# Every <option> element inside the si/do dropdown
sido_list = sido_list_row.find_elements(By.TAG_NAME,'option')

# The visible label and the `value` attribute of an option differ
sido_list[1].text # '서울' (short label)
sido_list[1].get_attribute('value') # '서울특별시' (full name stored in value)

# Collect the `value` of every option into a list
sido_names = [option.get_attribute('value') for option in sido_list]
sido_names

# Drop the first entry, which comes back as an empty string ''
sido_names = sido_names[1:]
sido_names

# Type the first si/do name into the dropdown to select it
sido_list_row.send_keys(sido_names[0])

# District (gu) dropdown
gu_list_row = driver.find_element(By.ID,'SIGUNGU_NM0')
gu_list = gu_list_row.find_elements(By.TAG_NAME, 'option')

# Same treatment as the si/do list: read the values, then drop the first entry
gu_names = [option.get_attribute('value') for option in gu_list]
gu_names = gu_names[1:]
gu_names, len(gu_names)

# Select one district (the second name in the list) as a trial run
gu_list_row.send_keys(gu_names[1])

# Click the Excel export button to save the current result
driver.find_element(By.CSS_SELECTOR,'#glopopd_excel > span').click()

import time
from tqdm import tqdm_notebook

# Save the Excel export for every district in gu_names.
for gu in tqdm_notebook(gu_names):
    # Look the district dropdown up on each pass and type the district name
    # into it to select that district.
    element = driver.find_element(By.ID,'SIGUNGU_NM0')
    element.send_keys(gu)
    # Pause before clicking; presumably this gives the page time to load the
    # newly selected district's results (confirm against the site).
    time.sleep(3)

    # click() returns None, so its result is not bound to a name.
    driver.find_element(By.CSS_SELECTOR,'#glopopd_excel > span').click()
    # Pause again before moving on; presumably lets the download start.
    time.sleep(3)

데이터 정리하기

import pandas as pd
from glob import glob

# Collect the exported workbooks once. glob() returns paths in arbitrary,
# platform-dependent order, so sort them to make the row order of the
# combined table reproducible between runs.
stations_files = sorted(glob('./data/지역_*.xls'))
stations_files

# Read a single workbook first to check its layout; header=2 makes the third
# row of the sheet the column header.
tmp = pd.read_excel(stations_files[0], header=2)

tmp.head()

# Read every workbook with the same header setting.
tmp_raw = [pd.read_excel(file_name, header=2) for file_name in stations_files]

# All files share one layout and only need to be stacked, so concat suffices.
station_raw = pd.concat(tmp_raw)
station_raw

station_raw.info()

# Keep only the five columns used in the analysis, in the order
# name / address / price / self-service flag / brand, and give the gasoline
# price and self-service columns their shorter working names.
stations = station_raw[['상호', '주소', '휘발유', '셀프여부', '상표']].rename(
    columns={'휘발유': '가격', '셀프여부': '셀프'}
)
stations.tail()

주소에서 구 나누기

stations['주소']


def _second_token(address):
    """Return the second whitespace-separated token of *address*."""
    return address.split()[1]


# Print the extracted token for every address to eyeball the result.
for address in stations['주소']:
    print(_second_token(address))

# Store the same token as the district ('구') column.
stations['구'] = list(map(_second_token, stations['주소']))
stations

# Renumber the rows from 0. drop=True discards the old index directly instead
# of inserting it as an 'index' column that then has to be deleted by hand.
stations.reset_index(drop=True, inplace=True)
stations.head()

주유 가격 정보 시각화

import matplotlib.pyplot as plt
import seaborn as sns
import platform
from matplotlib import font_manager, rc

%matplotlib inline

# Font used for all figures (needed so the Korean labels render).
rc('font', family='NanumSquare Neo OTF Regular')

# pandas box plot: price grouped by the self-service flag.
stations.boxplot(column='가격', by='셀프', figsize=(12,8));

# seaborn box plot: price by self-service flag.
_, self_ax = plt.subplots(figsize=(12, 8))
sns.boxplot(data=stations, x='셀프', y='가격', palette='Set2', ax=self_ax)
self_ax.grid(True)
plt.show()

# seaborn box plot: price by brand, split by self-service flag.
_, brand_ax = plt.subplots(figsize=(12, 8))
sns.boxplot(data=stations, x='상표', y='가격', hue='셀프', palette='Set3', ax=brand_ax)
brand_ax.grid(True)
plt.show()

지도 시각화

# 지도 시각화
import json
import folium
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# The 10 most expensive stations: sort by price descending, keep the top rows.
stations.sort_values('가격', ascending=False).iloc[:10]

# The 10 cheapest stations: sort by price ascending, keep the top rows.
stations.sort_values('가격', ascending=True).iloc[:10]

import numpy as np

# Mean price per district. The string alias 'mean' gives the same result as
# np.mean and avoids the FutureWarning newer pandas releases emit when a
# NumPy callable is passed as aggfunc.
gu_data = pd.pivot_table(data=stations, index='구', values='가격', aggfunc='mean')
gu_data.head()

geo_path = './data/02. skorea_municipalities_geo_simple.json'
# Load the GeoJSON inside a context manager so the file handle is closed as
# soon as parsing finishes, rather than being left open.
with open(geo_path, encoding='utf-8') as geo_file:
    geo_str = json.load(geo_file)

# Base map. The keyword is `zoom_start`; the previous spelling `zoome_start`
# is not a recognised option, so the requested initial zoom never took effect.
my_map = folium.Map(location=[37.5502, 126.982], zoom_start=10.5, tiles='Stamen Toner')

my_map

# Colour each district by its mean price. `Map.choropleth` has been removed
# from folium; the supported form builds a folium.Choropleth layer and adds it
# to the map. The arguments are unchanged: district names come from the
# gu_data index, values from its '가격' column, and they are matched against
# each GeoJSON feature's `id`.
folium.Choropleth(
    geo_data=geo_str,
    data=gu_data,
    columns=[gu_data.index, '가격'],
    key_on='feature.id',
    fill_color='PuRd',
).add_to(my_map)
my_map

밤비나

씨앗 데이터 분석가.

이전 포스트

셀프 주유소 가격 비교 EDA

EDA

셀프 주유소가 정말 저렴한가?

데이터 정리하기

주유 가격 정보 시각화

Selenium

0개의 댓글

관련 채용 정보