Just using bs4, some HTML elements sometimes don't get scraped.
I'm not sure, but I think it's because certain HTML elements are loaded by JS,
which means we have to wait until that HTML has been rendered by JS
and then scrape again.
import requests
from bs4 import BeautifulSoup

# link: URL of the news article to scrape
response = requests.get(link)
dom = BeautifulSoup(response.content, "html.parser")
# dom holds the HTML code,
# and from it we need to pull out the element that contains the article body.
elements = dom.select_one("#harmonyContainer")
# select_one returns a single Tag (or None), not a list like select does
print(elements.text.strip().replace("\n", ""))
# 4. Wrap it in a function
def get_content(link):
    response = requests.get(link)
    dom = BeautifulSoup(response.content, "html.parser")
    elements = dom.select_one("#harmonyContainer")
    return elements.text.strip().replace("\n", "")
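For example, calling it on a Daum news article (the URL below is just a placeholder, not a real article link):

content = get_content("https://v.daum.net/v/...")  # placeholder: any Daum article whose body lives in #harmonyContainer
print(content)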
But elements comes back as None (so calling .text raises an AttributeError),
and the reason is that the element with id "harmonyContainer" is not in the HTML that bs4 receives,
even though it appears in the browser.
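A quick way to confirm this (a minimal sketch; link is the same article URL as above) is to check whether the id shows up in the raw HTML at all:

import requests

raw = requests.get(link).text
# if the container is injected by JS after the page loads, it never appears here
print("harmonyContainer" in raw)

So the page has to be rendered by a real browser first; two ways to do that: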
1) Selenium
2) Selenium_WebDriverWait
# -*- encoding: utf-8 -*-
# how to use chrome driver : https://emessell.tistory.com/148
# how to implicitly wait for certain element to be loaded : https://aonee.tistory.com/40
import sys
import io
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
# https://www.aladin.co.kr/m/msearch.aspx?SearchTarget=Book&KeyWord=%EC%8B%A4%EC%A1%B4%EC%A3%BC%EC%9D%98&KeyRecentPublish=0&OutStock=0&ViewType=Detail&CustReviewCount=0&CustReviewRank=0&KeyFullWord=%EC%8B%A4%EC%A1%B4%EC%A3%BC%EC%9D%98&KeyLastWord=%EC%8B%A4%EC%A1%B4%EC%A3%BC%EC%9D%98&CategorySearch=&MViewType=&PriceFilterMax=
sys.stdout = io.TextIOWrapper(sys.stdout.detach(), encoding = 'utf-8')
sys.stderr = io.TextIOWrapper(sys.stderr.detach(), encoding = 'utf-8')
books = {}
pages = [1,2]
driver = webdriver.Chrome(r"C:\Users\user.DESKTOP-3NN2QR0\Desktop\CodingStudyStuff\web_study-master\JS\Inflearn\React\Node_React_Psyche\Crawling\chromedriver.exe")
def get_content_in_each_link(link):
    driver.set_window_size(1000, 1000)
    driver.get(link)
    try:
        # explicitly wait (up to 100 s) until the JS-rendered element exists in the DOM
        element = WebDriverWait(driver, 100).until(
            EC.presence_of_element_located((By.ID, "Publisher_all"))
        )
    except Exception as e:
        print("error occurred", e)
    # driver.implicitly_wait(10)
    html = driver.page_source
    soup = BeautifulSoup(html, "html.parser")
    element = soup.select_one("#Publisher_all")
    try:
        print("element", element)
        if element:
            print("text", element.text)
            return element.text
    except Exception as e:
        print("failed")
    return ""
books["실존주의"] = []
print("books", books)
for page in pages:
    url = "https://www.aladin.co.kr/m/msearch.aspx?SearchTarget=Book&KeyWord=%EC%8B%A4%EC%A1%B4%EC%A3%BC%EC%9D%98&KeyRecentPublish=0&OutStock=0&ViewType=Detail&SortOrder=11&CustReviewCount=0&CustReviewRank=0&KeyFullWord=%EC%8B%A4%EC%A1%B4%EC%A3%BC%EC%9D%98&KeyLastWord=%EC%8B%A4%EC%A1%B4%EC%A3%BC%EC%9D%98&CategorySearch=&MViewType=&page={}".format(page)
    response = requests.get(url)
    dom = BeautifulSoup(response.content, "html.parser")
    elements = dom.select(".browse_list_box")
    print("page ", page)
    for idx, element in enumerate(elements):
        print("%d th element" % (idx + 1))
        title = element.select_one('table > tr > td:nth-child(2) > ul > li:first-child > span')
        author = element.select_one('table > tr > td:nth-child(2) > ul > li:nth-child(2) > a.nm_book_title_a')
        # the author selector can miss, so guard before touching .text
        if author is not None:
            author = author.text
        image = element.select_one('table > tr > td:first-child > div > div > a > img')['src']
        link = element.select_one('table > tr > td:first-child > div > div > a').get("href")
        link_description = get_content_in_each_link(link)
        books["실존주의"].append({
            "title": title.text,
            "author": author,
            "img": image,
            "description": link_description
        })
        if idx == 0:
            break  # only take the first book on each page (keeps the test run short)

driver.quit()
print(books)
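Optional tweak: opening a visible Chrome window for every link is slow. Running headless is a small change (a sketch, assuming the Selenium 3-style API used above; chromedriver_path stands for the same driver path as in the script):

options = webdriver.ChromeOptions()
options.add_argument("--headless")  # run Chrome without opening a window
driver = webdriver.Chrome(chromedriver_path, options=options)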