🐼 준비
# --- One-time environment setup: this part only needs to run once. ---
#
# Lines shown as "# !..." / "# %..." are Colab/IPython shell and magic
# commands, not Python. Run them in a notebook cell without the leading "# ".
#
# Selenium drives a real browser, so it can browse the way a person does and
# can simulate mouse and keyboard input.
# !pip install selenium
# !apt-get update
# !apt install chromium-chromedriver
# !cp /usr/lib/chromium-browser/chromedriver /usr/bin

# Prevent Korean text from rendering as broken glyphs in matplotlib.
# %config InlineBackend.figure_format = 'retina'
# !apt -qq -y install fonts-nanum
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm

fontpath = '/usr/share/fonts/truetype/nanum/NanumBarunGothic.ttf'
font = fm.FontProperties(fname=fontpath, size=9)
# Register the font file through the public API. The original called the
# private helper mpl.font_manager._rebuild(), which is not available in
# recent matplotlib releases.
fm.fontManager.addfont(fontpath)
plt.rc('font', family='NanumBarunGothic')

# Library imports used by the scraping code below.
from selenium import webdriver
import time
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
import pandas as pd
import warnings

warnings.filterwarnings('ignore')
from prompt_toolkit.formatted_text.html import html_escape
# Start a headless Chrome session and open the target video page.
options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
# Pass only `options`: the chromedriver binary is resolved from PATH (the
# setup step copies it to /usr/bin). Passing the path as the first positional
# argument collides with `options` in current Selenium 4 releases.
driver = webdriver.Chrome(options=options)
driver.get('https://www.youtube.com/watch?v=75kySTFaBQQ&t=2980s')  # open the link
driver.implicitly_wait(3)
time.sleep(1.5)
# Scroll once first, then keep scrolling to the bottom until the page stops
# growing.
driver.execute_script('window.scrollTo(0, 800)')  # scroll once
time.sleep(3)

# Total scroll height before the loop starts.
last_height = driver.execute_script('return document.documentElement.scrollHeight')
while True:
    # Scroll down to the current bottom of the document.
    driver.execute_script('window.scrollTo(0, document.documentElement.scrollHeight);')
    time.sleep(1.5)
    # Scroll height after scrolling down.
    new_height = driver.execute_script('return document.documentElement.scrollHeight')
    if new_height == last_height:
        # Height did not change: this is the last page of comments.
        break
    last_height = new_height

# NOTE(review): the flattened source does not show whether this pause was
# inside or after the loop; it is placed after the loop here. Confirm.
time.sleep(1.5)
# Best-effort: close the YouTube "1 month free" promotion popup.
# Imported here so this step is self-contained within the script.
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.by import By

try:
    # find_element(By, selector) works on both Selenium 3 and Selenium 4;
    # the find_element_by_* helpers are no longer available in Selenium 4.
    driver.find_element(By.CSS_SELECTOR, '#dismiss-button > a').click()
except WebDriverException:
    # The popup is absent or not clickable; scraping can proceed without it.
    pass
# Crawl the comments: parse the rendered page and extract author names and
# comment bodies into two parallel lists.
html_source = driver.page_source
soup = BeautifulSoup(html_source, 'html.parser')
id_list = soup.select('div#header-author > h3 > #author-text > span')  # author elements
comment_list = soup.select('yt-formatted-string#content-text')  # comment elements

# Cleaned values that will go into the final table.
id_final = []
comment_final = []
# zip() pairs authors with comments and stops at the shorter list, so a
# mismatch between the two selector results no longer raises IndexError.
for id_tag, comment_tag in zip(id_list, comment_list):
    temp_id = id_tag.text
    temp_id = temp_id.replace('\n', '').replace('\t', '').replace(' ', '').strip()
    id_final.append(temp_id)  # comment author

    temp_comment = comment_tag.text
    temp_comment = temp_comment.replace('\n', '').replace('\t', '').replace('\r', '').strip()
    comment_final.append(temp_comment)  # comment text
# Build the result table: lists -> dictionary -> DataFrame.
# The two column labels are the author-id column and the comment-text column.
_column_labels = ("์์ด๋", "๋๊ธ ๋ด์ฉ")
youtube_dic = dict(zip(_column_labels, (id_final, comment_final)))
youtube_pd = pd.DataFrame(data=youtube_dic)
# Notebook-style preview of the first rows.
youtube_pd.head()
이런식으로 수집됩니당
โ๏ธ id_list
๐ง comment_list
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

# Join every scraped comment into one space-separated string.
_comment_column = youtube_pd['๋๊ธ ๋ด์ฉ'].astype(str)
text = " ".join(_comment_column)

# Render the word cloud on a large figure with the axes hidden.
plt.subplots(figsize=(25, 15))
_cloud_settings = dict(
    background_color='black',
    width=1000,
    height=700,
    font_path=fontpath,  # font file path assigned in the setup cell
)
wordcloud = WordCloud(**_cloud_settings).generate(text)
plt.axis('off')
plt.imshow(wordcloud, interpolation='bilinear')
plt.show()
링크 : https://www.youtube.com/watch?v=zH9n4777x80
๐ ๋ง์คํฌ ๋ชจ์ ์ ์ฉํ๋ ๋ฒ์ ์ ๋ฒจ๋ก๊ทธ '๋ค์ด๋ฒ ๊ธฐ์ฌํฌ๋กค๋ง' ์ ์์ต๋๋น ์ฐธ๊ณ ~~~ ๊ท์ฐฎ์์ ์์ ๋๊ฑฐ ์๋ ์ํผ์๋ . ..
์ค๋ ์ผ์ธ ์์ ํฌ๋กค๋ง ํ๋ ๊ฒ ๊น์ง ์ฌ๋ฆฌ๋ ค๊ณ ํ๋๋ฐ. . ์ค๋ ์ธ์๋ถํฐ ๋ฒจ๋ก๊ทธ ์์ฑ์ ๋ค์งํ๊ณ ์ค์ง๋ง ์คํ ๋ค์ ๋ฐ์ ๋์ ๋ด๊ณ ,,, ๋ฐฅ๋จน๊ณ ํ์น์ฐ์ ๋ณด๊ณ ์ป๊ณ ๊ณผ์์ข ์น์ด๋จน๋ค๋ณด๋๊น ๋ฒ์จ ์ดํ์. ๋ด์ผ ์ง๊ฐ๋ ค๋ฉด ์ง์ ์ธ์ผ๋๊ณ ์ท๋ ๊ณจ๋ผ์ผ ๋๊ธฐ ๋๋ฌธ์ ์ผ์ธ ์์์ ์ง๊ฐ์ ํ ์ผ์์ผ ์ฏค ๋ค์ ์๋ํด๋ด์ผ์ง ํผ์ฐ -๐ผ