(이것도 셀레니움을 사용해서 크롤링을 할 것이기 때문에 크롤링 전에 꼭 설치 해줘야 됨!!)
# Import libraries
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
import time
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
# Open the target YouTube video in headless Chrome and scroll to the bottom of
# the page so that the lazily loaded comments are present in the DOM.

# Selenium names needed only by this section (selenium is already a dependency
# of this script).
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.by import By

options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')

# Prepare the Chrome browser. The chromedriver binary is resolved from PATH;
# passing the executable path as a positional argument is not accepted by
# current Selenium 4 releases.
driver = webdriver.Chrome(options=options)
driver.get('https://www.youtube.com/watch?v=ycEtLNlX_ss')  # open the video page
driver.implicitly_wait(3)
time.sleep(1.5)

# Scroll down 800px first, then give the page time to react.
driver.execute_script("window.scrollTo(0,800)")
time.sleep(3)

# Scroll down repeatedly to collect comments.
# Scroll height right after the initial load; used to detect "no more content".
last_height = driver.execute_script("return document.documentElement.scrollHeight")

# Keep scrolling until the page height stops growing.
while True:
    driver.execute_script("window.scrollTo(0, document.documentElement.scrollHeight);")
    time.sleep(2)
    new_height = driver.execute_script("return document.documentElement.scrollHeight")
    if new_height == last_height:
        break
    last_height = new_height

time.sleep(2)

# Close the YouTube "1 month free" popup if it is present.
# NOTE(review): the original indentation was lost, so whether this step ran
# inside or after the scroll loop cannot be told from the source; it is placed
# after the loop here - confirm.
try:
    driver.find_element(By.CSS_SELECTOR, '#dismiss-button > a').click()
    time.sleep(1.5)
except WebDriverException:
    # Best effort: the popup is usually absent, which is not an error.
    pass
🔼 크롤링을 위해 코딩을 통해 액션 부여하는 작업들이다.
# Crawl the comments: parse the scrolled page and collect, in parallel lists,
# each comment's author (id_final) and text (comment_final).
html_source = driver.page_source
# The HTML snapshot is all that is needed from here on, so release the browser
# process instead of leaving headless Chrome running.
driver.quit()

soup = BeautifulSoup(html_source, 'html.parser')
id_list = soup.select('div#header-author > h3 > #author-text > span')
comment_list = soup.select('yt-formatted-string#content-text')

id_final = []
comment_final = []
# zip() pairs authors with comments and stops at the shorter list, so a
# mismatch between the two selectors cannot raise IndexError.
for author_tag, comment_tag in zip(id_list, comment_list):
    # Comment author: drop newlines, tabs and every space.
    author = author_tag.text
    author = author.replace('\n', '').replace('\t', '').replace(' ', '').strip()
    id_final.append(author)

    # Comment body: drop line breaks and tabs but keep the spaces between words.
    comment = comment_tag.text
    comment = comment.replace('\n', '').replace('\t', '').replace('\r', '').strip()
    comment_final.append(comment)
🔼 댓글 작성자와 댓글 내용 크롤링 작업
# Build the DataFrame (lists -> dict -> DataFrame): one column holding the
# collected author ids and one holding the collected comment texts.
youtube_dic = {
    "μμ΄λ": id_final,
    "λκΈ λ΄μ©": comment_final,
}
youtube_pd = pd.DataFrame(data=youtube_dic)
🔼 크롤링한 것을 데이터프레임 형태로 저장
# Write the crawled comments to a CSV file in the working directory, encoded as
# UTF-8 with a BOM (utf-8-sig) and without the DataFrame's row index.
output_csv = 'μ νλΈλκΈ_ν¬λ‘€λ§_μ€ν_20220909.csv'
youtube_pd.to_csv(output_csv, index=False, encoding='utf-8-sig')
🔼 파일로 저장하는 것도 잊지마요 ..
# Load the saved comments back and merge them into one space-separated string
# (the input for the word-cloud visualisation).
# The file is read from the same relative path it was written to, rather than a
# hard-coded '/content/' prefix that only exists in some environments.
df = pd.read_csv('μ νλΈλκΈ_ν¬λ‘€λ§_μ€ν_20220909.csv')
# Empty comments come back from the CSV as NaN; drop them so they do not end up
# in the text as the literal word "nan".
text = " ".join(df['λκΈ λ΄μ©'].dropna().astype(str))
데이터프레임형태로 불러온 다음 워드 클라우드 시각화를 위해 텍스트들을 join을 통해 모두 붙여준다.
워드 클라우드 시각화 코드는 늘 하던대로 ...
하면!!
이렇게 나옵니다
친구한테
댓글 1만개 이하인 영상 아무거나 보내보라고 한거라
이게 먼 영상이길래 사이토가 가장 크게 나타난건지는 모르겠네요 ...
영상에 나온 사람 이름인가??