# 준비 (Setup)
# One-time environment setup: run this part only once per runtime.
# It installs Selenium and a Chromium driver so the page can be browsed the
# way a person would, then configures a Korean-capable Matplotlib font and
# imports the libraries used by the rest of the script.
#
# The lines starting with "!" or "%" are notebook shell/IPython magics. They
# are not valid Python in a plain .py file, so they are kept here as comments;
# run them in a notebook cell (or the equivalent shell commands) beforehand.
#
#   !pip install selenium
#   !apt-get update
#   !apt install chromium-chromedriver   # lets us simulate mouse/keyboard input
#   !cp /usr/lib/chromium-browser/chromedriver /usr/bin

import matplotlib as mpl
import matplotlib.pyplot as plt

#   %config InlineBackend.figure_format = 'retina'
#   !apt -qq -y install fonts-nanum

import matplotlib.font_manager as fm

fontpath = '/usr/share/fonts/truetype/nanum/NanumBarunGothic.ttf'
font = fm.FontProperties(fname=fontpath, size=9)
plt.rc('font', family='NanumBarunGothic')
# _rebuild is a private helper that is absent from recent Matplotlib releases,
# so only call it when the installed version still provides it.
# NOTE(review): on newer Matplotlib, fm.fontManager.addfont(fontpath) is the
# public way to register the font - confirm against the installed version.
if hasattr(mpl.font_manager, '_rebuild'):
    mpl.font_manager._rebuild()

# Library imports
from selenium import webdriver
import time
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
import pandas as pd
import warnings

warnings.filterwarnings('ignore')
# 이해안되므로 공부하자 (Not understood yet - study this.)
from prompt_toolkit.formatted_text.html import html_escape
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
# Build a headless Chrome session; the extra flags are the usual ones for
# running Chrome inside a container without a display.
options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')

# Do not pass the driver path positionally: current Selenium releases removed
# that parameter, so Chrome('chromedriver', options=...) raises
# "TypeError: got multiple values for argument 'options'". 'chromedriver' was
# the default path anyway, so the driver is still resolved from PATH on older
# releases.
driver = webdriver.Chrome(options=options)
# Open the target Shorts page, open its comment panel, and keep a handle to
# the element that is scrolled later to load more comments.
driver.get('https://youtube.com/shorts/-srhx6uyWx0?feature=share') # open the link
# Element lookups from here on wait up to 3 seconds before failing.
driver.implicitly_wait(3)
time.sleep(1.5)
driver.execute_script('window.scrollTo(0,800)') # scroll the window once
time.sleep(3)
driver.find_element(By.CSS_SELECTOR, "#comments-button").click() # click the comments button
# First element matching "#contents".
# NOTE(review): presumably the comment list container - confirm on the live page.
fBody = driver.find_element(By.CSS_SELECTOR, "#contents")
# Bare expression: displays the element in a notebook, no effect in a script.
fBody
# Keep scrolling the comment container until its scroll height stops growing,
# i.e. until no further comments are loaded.
scroll = 0  # NOTE(review): not read anywhere in this section

# Scroll height (in pixels) of the container right after it was opened.
last_height = driver.execute_script("return arguments[0].scrollHeight", fBody)

while True:
    # Move the container's scroll position down by one full scroll height.
    driver.execute_script('arguments[0].scrollTop = arguments[0].scrollTop + arguments[0].scrollHeight;', fBody)
    # Earlier whole-window variant, kept for reference. It scrolls by the
    # maximum scroll height of the page currently shown; the trailing ";"
    # terminates the JavaScript statement.
    #driver.execute_script("window.scrollTo(0, document.documentElement.scrollHeight);", fBody)
    time.sleep(5)  # give the page time to load the next batch

    # Scroll height after the scroll; it grows when more content was added.
    new_height = driver.execute_script("return arguments[0].scrollHeight", fBody)
    if new_height == last_height:
        # Nothing new was loaded: the end of the list has been reached.
        break
    # More content appeared: remember the new height and scroll again.
    last_height = new_height
# Pause once more, then close the YouTube "one month free" promo popup if it
# is present. This is best effort: a missing popup must not stop the crawl.
from selenium.common.exceptions import WebDriverException

time.sleep(5)

try:
    # Use find_element(By...) like the lookups above. The legacy
    # find_element_by_css_selector helper was removed in Selenium 4, and the
    # previous bare "except" silently hid the resulting AttributeError.
    driver.find_element(By.CSS_SELECTOR, '#dismiss-button > a').click()
except WebDriverException:
    # Popup not shown (or not clickable) - nothing to dismiss.
    pass
# Crawl the comments: parse the fully scrolled page and collect each
# comment's author id and text into two parallel lists.
html_source = driver.page_source
soup = BeautifulSoup(html_source, 'html.parser')

id_list = soup.select('div#header-author > h3 > #author-text > span')  # author id nodes
comment_list = soup.select('yt-formatted-string#content-text')  # comment text nodes

# Cleaned values that end up in the final table.
id_final = []
comment_final = []

# Pair the nodes with zip instead of indexing by position: it stops at the
# shorter list, so a page with fewer author nodes than comment nodes no longer
# raises IndexError.
for author_node, comment_node in zip(id_list, comment_list):
    # Author id: drop newlines, tabs and every space.
    temp_id = author_node.text
    temp_id = temp_id.replace('\n', '').replace('\t', '').replace(' ', '').strip()
    id_final.append(temp_id)

    # Comment text: drop line breaks and tabs but keep inner spaces.
    temp_comment = comment_node.text
    temp_comment = temp_comment.replace('\n', '').replace('\t', '').replace('\r', '').strip()
    comment_final.append(temp_comment)
# Assemble the result table: lists -> dictionary -> DataFrame.
# Each dictionary key becomes a column header; insertion order is column order.
youtube_dic = {}
youtube_dic["μμ΄λ"] = id_final            # author id column
youtube_dic["λκΈ λ΄μ©"] = comment_final    # comment text column

# Turn the column dictionary into a DataFrame.
youtube_pd = pd.DataFrame(data=youtube_dic)

# Preview the first rows (displayed when run as the last line of a notebook cell).
youtube_pd.head()