🎡 유튜브 shorts 크둀링 🎡

parkeu·2022년 9월 19일
0

ABC부트캠프

목록 보기
23/55

🐼 준비

#이 뢀뢄은 처음 ν•œλ²ˆλ§Œ μ‹€ν–‰ν•˜λ©΄ 됨. μ‚¬λžŒμ΄ ν•˜λŠ” κ²ƒμ²˜λŸΌ κ²€μƒ‰ν•˜λŠ” νŠΉμ§• νƒ‘μž¬
!pip install selenium
!apt-get update
!apt install chromium-chromedriver  # 마우슀, ν‚€λ³΄λ“œ μž…λ ₯ 효과 쀄 수 μžˆλ‹€.
!cp /usr/lib/chromium-browser/chromedriver /usr/bin

import matplotlib as mpl
import matplotlib.pyplot as plt

%config InlineBackend.figure_format = 'retina'

!apt -qq -y install fonts-nanum

import matplotlib.font_manager as fm
fontpath = '/usr/share/fonts/truetype/nanum/NanumBarunGothic.ttf'
font = fm.FontProperties(fname=fontpath, size=9)
plt.rc('font', family='NanumBarunGothic') 
mpl.font_manager._rebuild()

# 라이브러리 μž„ν¬νŠΈ
from selenium import webdriver
import time
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup

import pandas as pd

import warnings
warnings.filterwarnings('ignore')

⌨️ λŒ“κΈ€ 크둀링

μ΄ν•΄μ•ˆλ˜λ―€λ‘œ κ³΅λΆ€ν•„μš”

from prompt_toolkit.formatted_text.html import html_escape
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

# Start a headless Chrome session, open the Shorts page and open its
# comments panel. ``fBody`` ends up holding the first "#contents" element,
# which the scrolling code that follows operates on.
options = webdriver.ChromeOptions()
options.add_argument('--headless')  # no visible browser window
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')

# Pass only ``options``: recent Selenium releases reject a positional
# driver path, and chromedriver has already been copied to /usr/bin, so it
# is found on PATH by both old and new Selenium versions.
driver = webdriver.Chrome(options=options)

driver.get('https://youtube.com/shorts/-srhx6uyWx0?feature=share')  # open the link
driver.implicitly_wait(3)  # element lookups below wait up to 3 s

time.sleep(1.5)

driver.execute_script('window.scrollTo(0,800)')  # scroll once
time.sleep(3)

driver.find_element(By.CSS_SELECTOR, "#comments-button").click()  # click the comments button
fBody = driver.find_element(By.CSS_SELECTOR, "#contents")

# Notebook-style echo of the located element.
fBody

from selenium.common.exceptions import WebDriverException

# Seconds to wait after each scroll so newly requested comments can load.
SCROLL_PAUSE_SEC = 5

scroll = 0

# Scroll height of the comments container right after it was opened.
last_height = driver.execute_script("return arguments[0].scrollHeight", fBody)

# Keep scrolling the comments container until its scroll height stops
# growing, i.e. no further comments were loaded by the last scroll.
while True:
    # Scroll the container itself (not the window) down by its full height.
    driver.execute_script('arguments[0].scrollTop = arguments[0].scrollTop + arguments[0].scrollHeight;', fBody)

    time.sleep(SCROLL_PAUSE_SEC)

    # Scroll height after the scroll; it grows when more content was added.
    new_height = driver.execute_script("return arguments[0].scrollHeight", fBody)

    if new_height == last_height:  # nothing new was loaded: stop
        break

    last_height = new_height  # more content appeared: remember and go on
    time.sleep(SCROLL_PAUSE_SEC)

    # Best-effort: close the YouTube "1 month free" popup if it is showing.
    # Uses the same ``find_element(By...)`` API as the rest of the script
    # (the ``find_element_by_*`` helpers are gone from current Selenium).
    # Only Selenium-side failures (popup absent, not clickable, ...) are
    # ignored, so programming errors are no longer silently swallowed.
    try:
        driver.find_element(By.CSS_SELECTOR, '#dismiss-button > a').click()
    except WebDriverException:
        pass

  
# Comment scraping: parse the loaded page and collect author / comment
# text pairs into a pandas DataFrame.
html_source = driver.page_source
soup = BeautifulSoup(html_source, 'html.parser')

id_list = soup.select('div#header-author > h3 > #author-text > span')  # author elements
comment_list = soup.select('yt-formatted-string#content-text')  # comment-body elements

# The two selectors are matched independently, so their lengths can differ.
# Pairing with zip() stops at the shorter list instead of raising
# IndexError; report the mismatch so the truncation is not silent.
if len(id_list) != len(comment_list):
    print('warning: matched', len(id_list), 'authors but', len(comment_list),
          'comments; unpaired entries are skipped')

# Cleaned values that go into the DataFrame.
id_final = []
comment_final = []

for author_tag, comment_tag in zip(id_list, comment_list):
    # Author: drop newlines, tabs and spaces.
    temp_id = author_tag.text
    temp_id = temp_id.replace('\n', '').replace('\t', '').replace(' ', '').strip()
    id_final.append(temp_id)

    # Comment body: drop newlines, tabs and carriage returns, keep spaces.
    temp_comment = comment_tag.text
    temp_comment = temp_comment.replace('\n', '').replace('\t', '').replace('\r', '').strip()
    comment_final.append(temp_comment)

# Build the DataFrame (lists -> dictionary -> DataFrame).
# Column names: "아이디" = author ID, "댓글 내용" = comment text.
youtube_dic = {"아이디": id_final, "댓글 내용": comment_final}
youtube_pd = pd.DataFrame(youtube_dic)

youtube_pd.head()
profile
배고파용.

0개의 λŒ“κΈ€