🥸 유튜브 댓글 크롤링 🥸

parkeu·2022년 9월 9일
1

ABC๋ถ€ํŠธ์บ ํ”„

목록 보기
21/55

๐Ÿผ ์ค€๋น„

# Run this part only once, the first time. It installs the tooling that lets
# the notebook browse/search the way a person would.
!pip install selenium
!apt-get update
!apt install chromium-chromedriver  # makes it possible to simulate mouse and keyboard input
# Copy the chromedriver binary into /usr/bin.
!cp /usr/lib/chromium-browser/chromedriver /usr/bin

# ํ•œ๊ธ€๊นจ์ง ๋ฐฉ์ง€
import matplotlib as mpl
import matplotlib.pyplot as plt

%config InlineBackend.figure_format = 'retina'

!apt -qq -y install fonts-nanum

import matplotlib.font_manager as fm
fontpath = '/usr/share/fonts/truetype/nanum/NanumBarunGothic.ttf'
font = fm.FontProperties(fname=fontpath, size=9)
plt.rc('font', family='NanumBarunGothic') 
mpl.font_manager._rebuild()

# Library imports
from selenium import webdriver
import time
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup

import pandas as pd

import warnings
warnings.filterwarnings('ignore')

🎆 댓글 크롤링

from prompt_toolkit.formatted_text.html import html_escape

# Configure Chrome to run without a visible window.
options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')

# Pass only `options`. The positional driver-path argument used previously is
# rejected by current Selenium 4 releases, and older releases already default
# to looking up "chromedriver" on PATH, so the behaviour there is unchanged.
driver = webdriver.Chrome(options=options)

# Imported here so this cell is self-contained.
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.by import By

driver.get('https://www.youtube.com/watch?v=75kySTFaBQQ&t=2980s')  # open the video link
driver.implicitly_wait(3)

time.sleep(1.5)

# Initial scroll (presumably to trigger loading of the comment section).
driver.execute_script('window.scrollTo(0, 800)')
time.sleep(3)

# Total scroll height of the document before the loop starts.
last_height = driver.execute_script('return document.documentElement.scrollHeight')

while True:
    # Scroll to the current bottom of the page.
    driver.execute_script('window.scrollTo(0, document.documentElement.scrollHeight);')
    time.sleep(1.5)

    # Scroll height after scrolling down.
    new_height = driver.execute_script('return document.documentElement.scrollHeight')

    # The height stopped growing: treat this as the last page of comments.
    if new_height == last_height:
        break

    last_height = new_height
    time.sleep(1.5)

    # Best-effort dismissal of the "1 month free" YouTube popup.
    # `find_element_by_css_selector` no longer exists in Selenium 4, and the
    # former bare `except` hid the resulting AttributeError, so the popup was
    # never closed. Only Selenium's own errors (element missing, not
    # clickable, ...) are ignored now.
    try:
        driver.find_element(By.CSS_SELECTOR, '#dismiss-button > a').click()
    except WebDriverException:
        pass

# Parse the loaded page and collect comment authors and comment bodies.
html_source = driver.page_source
soup = BeautifulSoup(html_source, 'html.parser')

id_list = soup.select('div#header-author > h3 > #author-text > span')  # author (id) elements
comment_list = soup.select('yt-formatted-string#content-text')  # comment body elements

# Cleaned values that become the DataFrame columns.
id_final = []
comment_final = []

# zip() pairs each author element with its comment element and stops at the
# shorter list. The previous index loop raised IndexError when fewer author
# elements than comments were matched; zip also keeps both columns the same
# length, which pandas requires.
for id_tag, comment_tag in zip(id_list, comment_list):
    # Author: drop newlines, tabs and every space.
    temp_id = id_tag.text
    temp_id = temp_id.replace('\n', '').replace('\t', '').replace(' ', '').strip()
    id_final.append(temp_id)

    # Comment body: drop newlines, tabs and carriage returns, keep inner spaces.
    temp_comment = comment_tag.text
    temp_comment = temp_comment.replace('\n', '').replace('\t', '').replace('\r', '').strip()
    comment_final.append(temp_comment)

# Build the DataFrame (list -> dictionary -> DataFrame).
# NOTE(review): the two column labels look mis-encoded (probably Korean for
# "ID" and "comment text"). They are left untouched because the word-cloud
# cell selects the comment column by exactly this label.
youtube_dic = {"์•„์ด๋””": id_final, "๋Œ“๊ธ€ ๋‚ด์šฉ": comment_final}
youtube_pd = pd.DataFrame(youtube_dic)

youtube_pd.head()

📨 결과

👀 이런식으로 수집됩니당


✉️ id_list

📧 comment_list


👁️ 워드클라우드 시각화

import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

# Join every collected comment into one space-separated string.
text = " ".join(youtube_pd['๋Œ“๊ธ€ ๋‚ด์šฉ'].astype(str))

# Word-cloud settings; `fontpath` is the font file set up earlier in the notebook.
cloud_options = dict(
    background_color='black',
    width=1000,
    height=700,
    font_path=fontpath,
)

plt.subplots(figsize=(25, 15))
wordcloud = WordCloud(**cloud_options).generate(text)
plt.axis('off')  # hide the axes around the image
plt.imshow(wordcloud, interpolation='bilinear')
plt.show()


๐Ÿผ ๋‹ค๋ฅธ์˜์ƒ ์‹œ๊ฐํ™” ํ•ด๋ณด๊ธฐ

๋งํฌ : https://www.youtube.com/watch?v=zH9n4777x80
๐Ÿ‘€ ๋งˆ์Šคํฌ ๋ชจ์–‘ ์ ์šฉํ•˜๋Š” ๋ฒ•์€ ์ œ ๋ฒจ๋กœ๊ทธ '๋„ค์ด๋ฒ„ ๊ธฐ์‚ฌํฌ๋กค๋ง' ์— ์žˆ์Šต๋‹ˆ๋‹น ์ฐธ๊ณ ~~~ ๊ท€์ฐฎ์•„์„œ ์•ˆ์ ๋Š”๊ฑฐ ์•„๋‹˜ ์•”ํŠผ์•„๋‹˜ . ..


🚩 소감 🚩

오늘 쇼츠 영상 크롤링 하는 것 까지 올리려고 했는데. . 오늘 세시부터 벨로그 작성을 다짐하고 잤지만 오후 네시 반에 눈을 떴고,,, 밥먹고 환승연애보고 씻고 과자좀 씹어먹다보니까 벌써 열한시. 내일 집가려면 짐을 싸야되고 옷도 골라야 되기 때문에 쇼츠 영상은 집가서 한 일요일 쯤 다시 시도해봐야지 피쓰 -🐼

profile
배고파용.

0개의 댓글