유튜브 크롤링

데비드·2021년 5월 28일

연습장

목록 보기

2/2

이것도 심심해서 만들어 봤다.
검색어를 입력하면 검색 결과에 나온 유튜브 영상들의 정보를 CSV 파일로 저장하고,
출력된 표에서 원하는 영상의 번호를 입력하면 해당 URL로 연결하여 영상을 재생하는 코드이다.

페이지 다운(스크롤)은 귀찮아서 구현하지 않았다. 따라서 처음 로딩된 검색 결과만 수집된다.

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import requests
from bs4 import BeautifulSoup
import pandas as pd
import urllib.request
import time
from IPython.display import display
import warnings
# Install a blanket 'ignore' filter so no warnings are printed while the
# script runs.
warnings.filterwarnings(action='ignore')

# Placeholder value: replace it with the location of the installed browser
# driver. get_video() passes this string to webdriver.Chrome().
path = 'path 값은 이용자의 브라우저 드라이버가 설치된 장소로 지정'

def _show_progress(message, seconds):
    """Print *message* once per second for *seconds* seconds.

    The trailing dots shrink on every tick, giving a simple countdown while
    the browser loads the page.
    """
    for remaining in range(seconds, 0, -1):
        print(message + '..' * remaining)
        time.sleep(1)


def _parse_search_results(html):
    """Extract the video rows from a YouTube search-result page.

    Args:
        html: Page source of the search-result page.

    Returns:
        A DataFrame with the columns '영상제목', '채널명', '영상url', '조회수'
        and '영상등록날짜', one row per video found.
    """
    from urllib.parse import urljoin

    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = BeautifulSoup(html, 'html.parser')

    # Query the document once per element kind instead of once per row.
    title_tags = soup.find_all('a', {'id': 'video-title'})
    channel_tags = soup.find_all(
        'ytd-channel-name', 'long-byline style-scope ytd-video-renderer')
    meta_tags = soup.find_all(
        'ytd-video-meta-block',
        'style-scope ytd-video-renderer byline-separated')

    rows = []
    # NOTE(review): the three lists are paired by position, which assumes
    # every result contributes exactly one element to each -- confirm
    # against the live page. zip() stops at the shortest list rather than
    # raising IndexError when they differ in length.
    for title_tag, channel_tag, meta_tag in zip(
            title_tags, channel_tags, meta_tags):
        title = title_tag.text.replace('\n', '')
        # The href is site-relative; urljoin avoids the doubled slash that
        # plain concatenation with a trailing-slash base would produce.
        link = urljoin('https://www.youtube.com/', title_tag.get('href', ''))
        writer = channel_tag.text.replace('\n', '').split(' ')[0]

        try:
            meta_fields = meta_tag.text.split('•')[1].split('\n')
            view, date = meta_fields[3], meta_fields[4]
        except IndexError:
            # A result with an unexpected metadata layout must not abort
            # the whole crawl; keep the row with empty fields.
            view, date = '', ''

        rows.append((title, writer, link, view, date))

    return pd.DataFrame(
        rows, columns=['영상제목', '채널명', '영상url', '조회수', '영상등록날짜'])


def get_video():
    """Search YouTube, save the results to CSV and optionally play a video.

    Prompts for a search term, drives Chrome to run the search on
    youtube.com, scrapes the result list and writes it to
    '../data/df_just_video.csv'. The user may then display the table and
    choose a row number whose URL is opened in a new browser window.

    Returns:
        The string '프로그램을 종료합니다.' when the user declines to view the
        table or to play a video; None after a video has been opened.

    Raises:
        ValueError: If the entered row number is not an integer.
        KeyError: If the entered row number is not in the table.
    """
    import os

    feature = input('검색어를 입력하시오 : ')

    # NOTE(review): passing the driver location positionally is the
    # Selenium 3 calling convention; newer Selenium releases expect a
    # Service object instead -- confirm the installed version.
    driver = webdriver.Chrome(path)
    try:
        driver.get('https://www.youtube.com')
        _show_progress('웹페이지를 불러오는 중입니다..', 3)

        # find_element(by, value) is available in both Selenium 3 and 4,
        # unlike the removed find_element_by_xpath shortcut.
        src = driver.find_element('xpath', '//*[@id="search"]')
        src.send_keys(feature)
        src.send_keys(Keys.RETURN)
        _show_progress('검색 결과를 불러오는 중입니다..', 2)

        print('데이터 수집 중입니다....')
        html = driver.page_source
    finally:
        # Always end the browser session, even when the crawl fails.
        driver.quit()

    df_just_video = _parse_search_results(html)

    csv_path = '../data/df_just_video.csv'
    # Create the target directory so the crawl result is not lost to a
    # missing folder.
    os.makedirs(os.path.dirname(csv_path), exist_ok=True)
    df_just_video.to_csv(csv_path, encoding='utf-8-sig', index=False)

    result = input('데이터프레임 저장이 완료되었습니다! 데이터프레임을 조회하시겠습니까? (y/n)')
    if result != 'y':
        return '프로그램을 종료합니다.'

    display(df_just_video)
    question = input('원하는 영상을 재생하시겠습니까? (y/n)')
    if question != 'y':
        return '프로그램을 종료합니다.'

    button = int(input('재생하고자 하는 영상의 번호(출력된 표 가장 왼쪽의 번호)를 입력해주세요.'))
    # Resolve the URL before launching the browser so an invalid row number
    # does not leave a stray window behind.
    video_url = df_just_video['영상url'][button]

    # This browser is intentionally left open so the video keeps playing.
    driver = webdriver.Chrome(path)
    driver.get(video_url)

실행 결과

대략 이런 식으로 실행된다.
재생할 영상의 번호까지 입력하면 해당 영상의 URL로 브라우저가 자동으로 연결된다.

데비드

이전 포스트

유튜브 크롤링

연습장

실행 결과

네이버 뉴스기사 크롤링

0개의 댓글