◾import
import numpy as np
import pandas as pd
import requests
import time
import tqdm
import warnings
from bs4 import BeautifulSoup
from selenium import webdriver
from tqdm import notebook
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
warnings.simplefilter(action="ignore", category=FutureWarning)
◾iframe
iframe
: html 안의 또 다른 html
- 네이버 금융 추출 테스트
# Naver Finance market-index page: extract exchange-rate and news data with Selenium.
url = 'https://finance.naver.com/marketindex/'
# Selenium 4 removed the positional driver-path argument; Selenium Manager
# resolves the chromedriver binary automatically.
driver = webdriver.Chrome()
driver.get(url)

# Exchange-rate list entries (one <a> per currency); print the first value.
contents = driver.find_elements(By.CSS_SELECTOR, '#exchangeList > li > a')
print(contents[0].find_element(By.CSS_SELECTOR, ".value").text)

# Market-news headlines; print the first headline.
news = driver.find_elements(By.CSS_SELECTOR,
                            '#content > div.section_news > div > ul > li > p > a')
print(news[0].text)

# The trading table lives inside an iframe, so querying from the top document
# yields an empty list here — demonstrated on purpose before switching frames.
tradings = driver.find_elements(By.CSS_SELECTOR, 'body > div > table > tbody > tr')
print(tradings)
- 매매 기준율은 iframe에 담겨있어 바로 접근이 불가능하다.
1. Selenium
- iframe 태그 지정 -> switch_to로 iframe으로 프레임 이동 -> 데이터 추출
# 1. Selenium approach: locate the iframe, switch the driver's context into it,
#    then query elements that live inside the frame.
iframe = driver.find_element(By.ID, 'frame_ex1')
driver.switch_to.frame(iframe)  # switch_to_frame() was removed in Selenium 4
print(driver.find_element(By.CLASS_NAME, 'sale').text)

tradings = driver.find_elements(By.CSS_SELECTOR, 'body > div > table > tbody > tr')
first_row = tradings[0]
print(first_row.text)
print(first_row.find_element(By.CSS_SELECTOR, ".tit").text)
print(first_row.find_element(By.CSS_SELECTOR, ".sale").text)
# Fetch the row's cells once instead of re-querying the DOM for every column.
cells = first_row.find_elements(By.CSS_SELECTOR, "td")
for col in range(2, 7):
    print(cells[col].text)
2. Requests
- 개발자 도구의 network에서 원하는 iframe 주소를 찾아 접근
# 2. Requests approach: hit the iframe's own URL (found via the DevTools
#    Network tab) directly — no browser needed.
url = 'https://finance.naver.com/marketindex/exchangeList.naver'
response = requests.get(url)

# Inspect the raw response, then parse it.
print(response)
print(response.content)
soup = BeautifulSoup(response.content, 'html.parser')

print(soup.prettify())
# First element with class "sale" holds the base exchange rate.
print(soup.select_one('.sale').text)
◾headless
- 셀레니움은 화면에 보여야 작동하는 경우가 있기 때문에 최대화, 스크롤 이동 등을 진행
- 웹 브라우저가 없는 경우 또는 웹 브라우저 실행 없이 동작 : webdriver.ChromeOptions() 사용
# Headless run: this section is about running without a visible browser window,
# but the original code never enabled it — configure ChromeOptions here.
url = 'https://finance.naver.com/marketindex/'
options = webdriver.ChromeOptions()
options.add_argument('--headless=new')  # run without opening a browser window
driver = webdriver.Chrome(options=options)
driver.get(url)
driver.maximize_window()
time.sleep(3)

# Scroll to the bottom so lazily rendered content loads.
driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
iframe = driver.find_element(By.ID, 'frame_ex1')
driver.switch_to.frame(iframe)  # Selenium 4 API (switch_to_frame was removed)
time.sleep(3)

tradings = driver.find_elements(By.CSS_SELECTOR, 'body > div > table > tbody > tr')
time.sleep(2)

# Collect title / base-rate / link for the first three rows.
datas = []
for trading in tradings[:3]:
    # Query the anchor once; it supplies both the title and the href.
    anchor = trading.find_element(By.CSS_SELECTOR, '.tit > a')
    datas.append({
        'title': anchor.text,
        'sale': trading.find_element(By.CSS_SELECTOR, '.sale').text,
        'link': anchor.get_attribute('href'),
    })
time.sleep(3)
driver.quit()

df = pd.DataFrame(datas)
print(df)
◾wait
wait
: 페이지 로딩을 기다리고 로딩이 완료되면 바로 다음 코드 실행
- time.sleep(10) : 물리적으로 10초동안 멈추어 기다림
- implicitly_wait(10) : 전체 페이지 로딩을 10초동안 기다리고, 10초 안에 페이지 로딩이 완료되면 다음 코드 실행
- 전역(global)으로 실행이 가능해 한 번만 실행한다.
- explicit wait (WebDriverWait + expected_conditions) : 지정한 태그만 로딩이 완료되면, 다음 코드 실행
- 무신사 스토어
# Musinsa store: demonstrate implicit wait + login-form interaction.
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

url = 'https://store.musinsa.com/app/'
driver = webdriver.Chrome()
# Implicit wait is global: set once, applies to every subsequent find_* call.
driver.implicitly_wait(10)
driver.get(url)

# Open the login form.
driver.find_element(By.CSS_SELECTOR, '#default_top > div.header-member > button').click()

# Fill the id field (locate once; clear first so repeated runs don't append text).
id_input = driver.find_element(
    By.CSS_SELECTOR,
    'body > div.musinsa-wrapper.wrapper-member.devicePC > div > form > input:nth-child(2)')
id_input.clear()
id_input.send_keys('test')

# Fill the password field the same way.
pw_input = driver.find_element(
    By.CSS_SELECTOR,
    'body > div.musinsa-wrapper.wrapper-member.devicePC > div > form > input:nth-child(3)')
pw_input.clear()
pw_input.send_keys('test')

# Submit the form.
driver.find_element(
    By.CSS_SELECTOR,
    'body > div.musinsa-wrapper.wrapper-member.devicePC > div > form > button').click()

# Accept the alert the site raises (presumably a failed-login notice for the
# dummy 'test' credentials — confirm against the live site).
alert = driver.switch_to.alert
alert.accept()
로그인 버튼 부분을 최대 5초동안 기다리며 로딩이 되면 클릭
# Explicit waits: each call blocks up to 5 s for the element to appear, then
# acts on it immediately — per-element, unlike the global implicit wait.
# Click the login button once it is present.
WebDriverWait(driver, 5).\
until(EC.presence_of_element_located((By.CSS_SELECTOR, "#default_top > div.header-member > button"))).click()
# Type into the first form input (id field).
WebDriverWait(driver, 5).\
until(EC.presence_of_element_located((By.CSS_SELECTOR, "body > div.musinsa-wrapper.wrapper-member.devicePC > div > form > input:nth-child(2)"))).send_keys('test')
# Type into the second form input (password field).
WebDriverWait(driver, 5).\
until(EC.presence_of_element_located((By.CSS_SELECTOR, "body > div.musinsa-wrapper.wrapper-member.devicePC > div > form > input:nth-child(3)"))).send_keys('test')
# Submit the form.
WebDriverWait(driver, 5).\
until(EC.presence_of_element_located((By.CSS_SELECTOR, "body > div.musinsa-wrapper.wrapper-member.devicePC > div > form > button"))).click()
# NOTE(review): `best_link` is never defined in this file — it must be assigned
# (e.g. a product URL scraped earlier) before this line runs, or it raises NameError.
driver.execute_script("window.open('{}');".format(best_link))
# Switch focus to the newly opened tab (index 1).
driver.switch_to.window(driver.window_handles[1])
# Apply search filters via the page controls (element ids suggest: exclusive
# items, sale items, then a 10,000–100,000 price range — verify on the live page).
WebDriverWait(driver, 5).\
until(EC.presence_of_element_located((By.CSS_SELECTOR, "#btn_exclusive"))).click()
WebDriverWait(driver, 5).\
until(EC.presence_of_element_located((By.CSS_SELECTOR, "#btn_sale"))).click()
WebDriverWait(driver, 5).\
until(EC.presence_of_element_located((By.CSS_SELECTOR, "#minPrice"))).send_keys('10000')
WebDriverWait(driver, 5).\
until(EC.presence_of_element_located((By.CSS_SELECTOR, "#maxPrice"))).send_keys('100000')
# Run the filtered search.
WebDriverWait(driver, 5).\
until(EC.presence_of_element_located((By.CSS_SELECTOR, "#btn_price_search"))).click()
import os

# Collect the filtered product cards and save the first item's thumbnail image.
outers = driver.find_elements(By.CSS_SELECTOR, '#searchList > li')
# The original bare tuple expression only displays in a notebook; print explicitly.
print(len(outers), outers[0].text)

first = outers[0]
# The info anchor supplies both the product title and its link — query it once.
info_anchor = first.find_element(By.CSS_SELECTOR, 'p.list_info > a')
print(info_anchor.get_attribute('title'))
print(first.find_element(By.CSS_SELECTOR, 'p.price').text.split(' ')[1][:-1])
print(first.find_element(By.CSS_SELECTOR, '.icon_new').text.split(' ')[1][:-1])
print(info_anchor.get_attribute('href'))

# Thumbnail URL lives in the lazy-load attribute 'data-original'; fetch it once.
img_url = first.find_element(By.CSS_SELECTOR, 'div.list_img > a > img').get_attribute('data-original')
print(img_url)
res = requests.get(img_url)
os.makedirs('./musinsa', exist_ok=True)  # avoid FileNotFoundError if dir is missing
with open('./musinsa/outer.png', 'wb') as f:
    f.write(res.content)