Ch5 유가 분석 01-13 (유가분석1-5)

김민지·2023년 4월 5일

Part 04. EDA/웹크롤링/파이썬프로그래밍

목록 보기

8/12

selenium 설치
- Beautiful Soup만으로 해결할 수 없는 것
  -접근할 웹 주소를 알 수 없을 때
  -자바스크립트를 사용하는 웹페이지의 경우
  -웹 브라우저로 접근하지 않으면 안 될 때

Selenium
-웹 브라우저를 원격 조작하는 도구
-자동으로 URL을 열고 클릭 등이 가능
-스크롤, 문자 입력, 화면 캡쳐 등이 가능

selenium은 Python 모듈도 설치하고 크롬 드라이버도 받아야 함

from selenium import webdriver
from selenium.webdriver.chrome.service import Service

# Selenium 4 takes the chromedriver location through a Service object; passing
# the path straight to webdriver.Chrome() was deprecated in Selenium 4 and
# dropped in later 4.x releases.
driver = webdriver.Chrome(service=Service(executable_path="../driver/chromedriver.exe"))
driver.get("https://www.naver.com")

-> webdriver.Chrome 명령으로 크롬드라이버의 경로 지정
-> get 명령으로 접근하고 싶은 주소 지정

driver.quit()  # always shut the driver down when finished

selenium webdriver 사용하기, 기초 코드

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
# from selenium.webdriver.common.by import By

# The executable_path keyword of webdriver.Chrome() was deprecated in
# Selenium 4 and dropped in later 4.x releases; the chromedriver location is
# passed through a Service object instead.
driver = webdriver.Chrome(service=Service(executable_path="../driver/chromedriver.exe"))
driver.get("https://pinkwink.kr")

# Maximize the browser window
driver.maximize_window()

# Minimize the browser window
driver.minimize_window()

# Set an explicit window size
driver.set_window_size(600, 600)

# Reload the current page
driver.refresh()

# Go back in the browser history
driver.back()

# Go forward in the browser history
driver.forward()

# Click an element
from selenium.webdriver.common.by import By

first_content = driver.find_element(By.CSS_SELECTOR, "#content > div.cover-masonry > div > ul > li:nth-child(1)")
# The selector was taken from Chrome DevTools ("Copy selector") on the element to click
first_content.click()

# Open a new tab
driver.execute_script("window.open('https://www.naver.com')")

# Switch tabs
driver.switch_to.window(driver.window_handles[0])  # index 0, 1, ... (assumed to follow tab order)

# Count the open tabs
len(driver.window_handles)

# Close a tab
driver.close()  # closes only the current tab

driver.quit()   # closes every tab

화면 스크롤

# Get the scrollable height of the page (it varies with the window size)
# execute_script runs the given JavaScript snippet in the browser
driver.execute_script("return document.body.scrollHeight")

# Scroll to the bottom of the page
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

# Scroll back to the top of the page
driver.execute_script("window.scrollTo(0, 0);")

# Save a screenshot of the currently visible screen
driver.save_screenshot("./last_height.png")  # output path and file name

# Scroll until a specific element is reached
from selenium.webdriver import ActionChains

some_tag = driver.find_element(By.CSS_SELECTOR, "#content > div.cover-list > div > ul > li:nth-child(1)")
action = ActionChains(driver)
action.move_to_element(some_tag).perform()

검색어 입력, 검색버튼 눌러서 검색하기

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

# Selenium 4 takes the chromedriver location through a Service object; passing
# the path straight to webdriver.Chrome() was deprecated in Selenium 4 and
# dropped in later 4.x releases.
driver = webdriver.Chrome(service=Service(executable_path="../driver/chromedriver.exe"))
driver.get("https://www.naver.com")

CSS_SELECTOR

# Locate the search box by CSS selector, empty it, then type the query
keyword = driver.find_element(By.CSS_SELECTOR, "#query")
keyword.clear()
keyword.send_keys("플레이브")  # without clear(), the keys pile up after any text already in the box

# Locate the search button by CSS selector and click it
search_btn = driver.find_element(By.CSS_SELECTOR, "#search_btn")
search_btn.click()

XPATH

driver.find_element(By.XPATH, '//*[@id="query"]').send_keys("xpath")
# The XPath itself contains double quotes ([@id="..."]), so the Python string
# is wrapped in single quotes (the inner and outer quote characters must differ).
# The expression was obtained with DevTools "Copy XPath".

driver.find_element(By.XPATH, '//*[@id="search_btn"]').click()

'//' : 문서 전체에서 깊이에 상관없이 찾기 (현재 위치 아래의 모든 자손을 탐색)
'*' : 모든 태그 (태그 이름 와일드카드)
'/' : 자식 태그 (바로 밑에 있는 태그)
'div[1]' : div 중에서 1번째 태그

pinkwink.kr에서 검색해보기

from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

# Selenium 4 takes the chromedriver location through a Service object; passing
# the path straight to webdriver.Chrome() was deprecated in Selenium 4 and
# dropped in later 4.x releases.
driver = webdriver.Chrome(service=Service(executable_path="../driver/chromedriver.exe"))
driver.get("https://pinkwink.kr")

# 1. Select the magnifier (search) button
search_tag = driver.find_element(By.CSS_SELECTOR, "#header > div.search")
# The click is queued on an ActionChains object and sent with perform()
# rather than calling search_tag.click() directly.
action = ActionChains(driver)
action.click(search_tag).perform()

-> 그냥 클릭은 안 되는 상태라서 ActionChains로 동작하도록 하기

# 2. Type the search term into the text input inside the search box
driver.find_element(By.CSS_SELECTOR, "#header > div.search > input[type=text]").send_keys("딥러닝")

# 3. Click the search button
driver.find_element(By.CSS_SELECTOR, "#header > div.search.on > button").click()

selenium + beautifulsoup

# Get the HTML source of the page currently shown in the browser
driver.page_source

from bs4 import BeautifulSoup

# Parse the HTML held by the driver with BeautifulSoup's built-in html.parser
req = driver.page_source
soup = BeautifulSoup(req, "html.parser")

# Select the elements matching the .post-item CSS class and show the third one
contents = soup.select(".post-item")
contents[2]

셀프 주유소가 정말 저렴한지 알아보기 위한 데이터 확보 작업

https://www.opinet.co.kr/searRgSelect.do
-> 사이트 구조 확인
목표 데이터 : 브랜드, 가격, 셀프 주유 여부, 위치
목표페이지 접근하기

from selenium import webdriver
from selenium.webdriver.chrome.service import Service

url = "https://www.opinet.co.kr/searRgSelect.do"
# Selenium 4 takes the chromedriver location through a Service object; passing
# the path straight to webdriver.Chrome() was deprecated in Selenium 4 and
# dropped in later 4.x releases.
driver = webdriver.Chrome(service=Service(executable_path="../driver/chromedriver.exe"))
driver.get(url)

# The same URL is requested a second time, as in the original walkthrough
driver.get(url)

-> 문제 : 해당 URL로 한 번에 접근이 안 됨. 메인페이지로 접속이 되고, 팝업창이 하나 나옴.

# Switch focus to the last window handle (the pop-up).
# NOTE(review): no close() call is issued here; the request below runs in the
# window that was just selected — confirm this is intended.
driver.switch_to.window(driver.window_handles[-1])

# Request the target page again
driver.get(url)

import time  # pauses are added because acting too quickly caused errors
from selenium.webdriver.chrome.service import Service


# Open the target page
url = "https://www.opinet.co.kr/searRgSelect.do"
# Selenium 4 takes the chromedriver location through a Service object; passing
# the path straight to webdriver.Chrome() was deprecated in Selenium 4 and
# dropped in later 4.x releases.
driver = webdriver.Chrome(service=Service(executable_path="../driver/chromedriver.exe"))
driver.get(url)
time.sleep(3)

# # Switch to the pop-up window
# driver.switch_to.window(driver.window_handles[-1])
# # Close the pop-up window
# driver.close()
# time.sleep(3)

# # Switch back to the main window
# driver.switch_to.window(driver.window_handles[-1])

# Request the target page again
driver.get(url)

'시/도' 입력칸에 서울시 입력

from selenium.webdriver.common.by import By
# Region: province / city selector

# Element with id SIDO_NM0 and its visible text
sido_list_raw = driver.find_element(By.ID, "SIDO_NM0")
sido_list_raw.text

# All <option> elements under it; show how many there are and the text of the one at index 17
sido_list = sido_list_raw.find_elements(By.TAG_NAME, "option")
len(sido_list), sido_list[17].text

-> (18, '제주')

# value attribute of the second <option> element
sido_list[1].get_attribute("value")

-> '서울특별시'

# Collect the value attribute of every <option> with an explicit loop
sido_names = []

for option in sido_list:
    sido_names.append(option.get_attribute("value"))

sido_names

# The loop above expressed as a one-line list comprehension
sido_names = [option.get_attribute("value") for option in sido_list]
sido_names[:5]

-> ['', '서울특별시', '부산광역시', '대구광역시', '인천광역시']

sido_names = sido_names[1:]  # drop the empty entry at index 0
sido_names

# First remaining entry
sido_names[0]

-> '서울특별시'

# Type the first entry of sido_names into the SIDO_NM0 element
sido_list_raw.send_keys(sido_names[0])

-> 서울시로 키 입력

'구' 입력칸에 구 입력

# District (gu)

gu_list_raw = driver.find_element(By.ID, "SIGUNGU_NM0") # parent element
gu_list = gu_list_raw.find_elements(By.TAG_NAME, "option") # child <option> elements

gu_names = [option.get_attribute("value") for option in gu_list]
gu_names = gu_names[1:]  # drop the first entry (presumably an empty placeholder, as with sido_names)
gu_names[:5], len(gu_names)

# Type the entry at index 15 into the SIGUNGU_NM0 element
gu_list_raw.send_keys(gu_names[15])

# Save as Excel: three ways to locate and click the element with id glopopd_excel
# 1. CSS selector
driver.find_element(By.CSS_SELECTOR, "#glopopd_excel").click()

# 2. XPath
driver.find_element(By.XPATH, '//*[@id="glopopd_excel"]').click()

# 3. Element id, keeping the element in a variable before clicking
element_get_excel = driver.find_element(By.ID, "glopopd_excel")
element_get_excel.click()

# Download the Excel file for every district (gu) of Seoul

import time
# tqdm.tqdm_notebook is deprecated; tqdm.notebook.tqdm is its replacement
from tqdm.notebook import tqdm

for gu in tqdm(gu_names):
    # Type the district name into the SIGUNGU_NM0 element
    element = driver.find_element(By.ID, "SIGUNGU_NM0")
    element.send_keys(gu)
    time.sleep(3)  # fixed pause after changing the district

    # click() returns None, so its result is not stored
    driver.find_element(By.ID, "glopopd_excel").click()
    time.sleep(3)  # fixed pause after requesting the download

# quit() ends the whole driver session; close() would only close the current window
driver.quit()

데이터 정리하기

import pandas as pd
from glob import glob

# Collect the paths of all downloaded files in one call. glob() does not
# guarantee any ordering, so the list is sorted to keep the order of the
# later concatenation reproducible.
stations_files = sorted(glob("../data/지역_*.xls"))
stations_files[:5]

# Read a single file first; header=2 uses the third row as the column header
tmp = pd.read_excel(stations_files[0], header=2)
tmp.tail(2)

# Read every file into its own DataFrame
tmp_raw = [pd.read_excel(file_name, header=2) for file_name in stations_files]

concat : 형식이 동일하고 연달아 붙이기만 하면 될 때 사용

# Join the per-file DataFrames into a single DataFrame
stations_raw = pd.concat(tmp_raw)
stations_raw

# Summary of the columns, dtypes and non-null counts
stations_raw.info()

데이터 프레임 만들기

# Keep only the five columns used in the analysis, renaming the gasoline
# price column to "가격" and the self-service column to "셀프".
selected_columns = ["상호", "주소", "휘발유", "셀프여부", "상표"]
stations = stations_raw[selected_columns].rename(
    columns={"휘발유": "가격", "셀프여부": "셀프"}
)
stations.tail()

# Print the second whitespace-separated token of every address
for address in stations["주소"]:
    print(address.split()[1])

-> 주소에서 구이름만 뽑아내기

# Store the second whitespace-separated token of each address as the district (gu) column
stations["구"] = [address.split()[1] for address in stations["주소"]]
stations

stations["구"].unique(), len(stations["구"].unique())

# Stations without price information (price recorded as "-")
stations[stations["가격"] == "-"]

# Keep only stations that have a price. .copy() makes the filtered result an
# independent DataFrame, so the column assignment below does not trigger
# pandas' SettingWithCopyWarning.
stations = stations[stations["가격"] != "-"].copy()

# Convert the price column from object to float
stations["가격"] = stations["가격"].astype("float")

# Renumber the index from 0. drop=True discards the old index instead of
# inserting it as an "index" column that would then have to be deleted.
stations.reset_index(drop=True, inplace=True)
stations.tail()

stations.head()

주유 가격 정보 데이터 시각화

import matplotlib.pyplot as plt
import seaborn as sns
import platform
from matplotlib import font_manager, rc

%matplotlib inline

path = "C:/Windows/Fonts/malgun.ttf"

if platform.system() == "Darwin":
    rc("font", family="Arial Unicode MS")
elif platform.system() == "Windows":
    font_name = font_manager.FontProperties(fname=path).get_name()
    rc("font", family=font_name)
else:
    print("Unknown system. sorry.")

# boxplot (pandas): price grouped by the self-service column

stations.boxplot(column="가격", by="셀프", figsize=(12, 8))

# boxplot (seaborn): price by the self-service column

plt.figure(figsize=(12, 8))
sns.boxplot(x="셀프", y="가격", data=stations, palette="Set3")
plt.grid(True)
plt.show()

# boxplot (seaborn): price by brand, split by the self-service column

plt.figure(figsize=(12, 8))
sns.boxplot(x="상표", y="가격", hue="셀프", data=stations, palette="Set3")
plt.grid(True)
plt.show()

지도 시각화

import json
import folium
import warnings
# Suppress all FutureWarning messages
warnings.simplefilter(action="ignore", category=FutureWarning)

# The 10 most expensive stations
stations.sort_values(by="가격", ascending=False).head(10)

# The 10 cheapest stations
stations.sort_values(by="가격", ascending=True).head(10)

import numpy as np

# Mean price per district (gu). The string "mean" selects pandas' own mean
# implementation; passing the np.mean callable raises a FutureWarning in
# recent pandas releases.
gu_data = pd.pivot_table(
    data=stations,
    index="구",
    values="가격",
    aggfunc="mean",
)
gu_data.head()

서울 지도 위에 구별로 경계선 짓고 가격별 색깔 구별하도록 만들기

geo_path = "../data/02. skorea_municipalities_geo_simple.json"
# Read the GeoJSON inside a with-block so the file handle is closed right away
with open(geo_path, encoding="utf-8") as geo_file:
    geo_str = json.load(geo_file)

my_map = folium.Map(location=[37.5502, 126.982], zoom_start=10.5, tiles="Stamen Toner")
# Map.choropleth() was deprecated and later removed from folium; the
# folium.Choropleth layer accepts the same arguments and is added to the map.
folium.Choropleth(
    geo_data=geo_str,
    data=gu_data,
    columns=[gu_data.index, "가격"],
    key_on="feature.id",
    fill_color="PuRd",
).add_to(my_map)
my_map

<제로베이스 데이터 취업 스쿨>

김민지

이전 포스트

Ch4 웹데이터 분석 14-26 (웹데이터4-6)

다음 포스트

Ch5 유가 분석 01-13 (유가분석1-5)

Part 04. EDA/웹크롤링/파이썬프로그래밍

Ch4 웹데이터 분석 14-26 (웹데이터4-6)

Ch6 Naver API 01-07 (Naver API 1-2)

0개의 댓글