Crawling - Youtube data collection

화이티 ·2023년 12월 18일

Crawling

목록 보기
6/7

New formula:

find and rfind in text

→ used to find the index (position) of a substring in text; `find` searches from the left, `rfind` from the right. Ex: label.rfind('조회수')

1. import library

from selenium import webdriver as wb
from bs4 import BeautifulSoup as bs
import requests as req
import os
import pandas as pd
from urllib.request import urlretrieve
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
import time

2. Get link

# URL of the YouTube channel's "videos" page that the crawler opens.
yt_url = "https://www.youtube.com/@hoseobiiiiiii._.0410/videos"

3. Process link

# Start a Chrome session through Selenium's webdriver and load the channel page.
driver = wb.Chrome()
driver.get(yt_url)

4. Get title, hyper link, view

# Locate the elements holding each video's title, hyperlink and view count.
# `By.ID` must be passed as-is; the "[By.ID](http://by.id/)" form was markdown
# link residue and is not valid Python.
title = driver.find_elements(By.ID, 'video-title')
link = driver.find_elements(By.ID, 'video-title-link')
# The view count is the <span> that directly follows the div with id "separator".
view = driver.find_elements(By.CSS_SELECTOR, 'div#separator+span')

5. Get list

  • Scroll to the bottom repeatedly so the whole page, from beginning to end, gets loaded
# Keep pressing END until the page height stops growing, i.e. no more videos
# are being loaded. The height (not the width) is what changes on vertical
# scroll, so scrollHeight is the value to compare.
last_height = driver.execute_script("return document.documentElement.scrollHeight")

while True:
    body = driver.find_element(By.TAG_NAME, 'body')
    body.send_keys(Keys.END)
    # Give the page a moment to load the next batch before measuring again.
    time.sleep(1)

    current_height = driver.execute_script("return document.documentElement.scrollHeight")
    # Print before updating last_height so the two values can actually differ.
    print(last_height, current_height)

    if current_height == last_height:
        break

    last_height = current_height
# Collect title, link and view count for every video link on the page.
yt_video_link = driver.find_elements(By.ID, 'video-title-link')

# Accumulators used later to build the DataFrame.
titles = []
links = []
views = []

for a_link in yt_video_link:
    title = a_link.text
    href = a_link.get_attribute('href')
    # get_attribute returns None when the attribute is missing.
    label = a_link.get_attribute('aria-label') or ''

    # The view count sits between '조회수 ' and the last '회' in the label.
    marker_index = label.rfind('조회수')
    if marker_index == -1:
        view = ''
    else:
        start_index = marker_index + 4  # skip '조회수' plus the following space
        end_index = label.rfind('회')
        view = label[start_index:end_index]

    titles.append(title)
    links.append(href)
    views.append(view)

    print(title)
    print(label)
    print(view)

6. Make dataframe

# Pair up the collected values row by row, then build the table from the rows.
rows = list(zip(titles, links, views))
data = pd.DataFrame(rows, columns=["Title", "Link", "View"])
data

7. Export to excel

# Write the table to an Excel file without the row index column.
data.to_excel(excel_writer='name.xlsx', index=False)

🍏Collect comment data

# Open a single video page and collect the text of its comments.
url = "https://www.youtube.com/watch?v=bRrUbFPcygA"
driver = wb.Chrome()
driver.get(url)

# NOTE(review): comments on the page may only be loaded after scrolling or
# waiting; confirm that the elements exist at this point.
cmt = driver.find_elements(By.ID, 'content-text')

# Text of every comment, with leading/trailing newlines removed.
comment = [element.text.strip('\n') for element in cmt]
comment

# Save the comments as a one-column Excel sheet.
comment = pd.DataFrame(comment)
comment.to_excel('Yt_comment.xlsx', index=False)

# Also save the raw comment text, one comment per line. The context manager
# closes the file even if a write fails.
with open('yt_review.txt', 'w', encoding='utf-8') as f:
    for element in cmt:
        f.write(element.text + '\n')
profile
열심히 공부합시다! The best is yet to come! 💜

0개의 댓글