2022-09-01


📌 Crawling Naver Ranking News

✅ Most-viewed news

from urllib.request import urlopen
from bs4 import BeautifulSoup
from pytz import timezone

import pandas as pd
import datetime

๐Ÿ”ผ ํฌ๋กค๋ง์— ํ•„์š”ํ•œ ๋ผ์ด๋ธŒ๋Ÿฌ๋ฆฌ(url, BeautifulSoup ๋“ฑ)์™€ ์ˆ˜์ง‘์ผ์ž๋ฅผ ์ถœ๋ ฅํ•˜๊ธฐ ์œ„ํ•œ ๋ผ์ด๋ธŒ๋Ÿฌ๋ฆฌ ์ž„ํฌํŠธ

๐Ÿ”ผ ํฌ๋กค๋งํ•  ๋žญํ‚น ๋‰ด์Šค ํ™”๋ฉด

Viewed in developer mode 🔽

It looks like this. These are the HTML tags we need to check while crawling.


# 1) Create the DataFrame (columns: press name, rank, article title, article link, collection date)
data = pd.DataFrame(columns=['언론사명', '순위', '기사제목', '기사링크', '수집일자'])

# 2) Prepare the Naver ranking news URL : https://news.naver.com/main/ranking/popularDay.naver
url = 'https://news.naver.com/main/ranking/popularDay.naver'

# 3) Fetch the HTML from the URL
html = urlopen(url)

# 4) Convert the HTML into an object that can be parsed
bsObject = BeautifulSoup(html, 'html.parser', from_encoding='UTF-8')

🔼 First, create the DataFrame that will hold the crawled data, set the URL, and open it with urlopen. Then convert the response with BeautifulSoup so it can be parsed.

# 5) Get only the divs that contain the ranking news
div = bsObject.find_all('div', {'class': 'rankingnews_box'})

๐Ÿ”ผ ์œ„์˜ ์บก์ณ๋ณธ์„ ๋‹ค์‹œ ๋ณด๋ฉด ํ™•์ธํ•  ์ˆ˜ ์žˆ๋“ฏ ๋žญํ‚น ๋‰ด์Šค ์ •๋ณด๊ฐ€ div ํƒœ๊ทธ ์•ˆ์— rankingnews_box ๋ผ๋Š” ํด๋ž˜์Šค๋กœ ์„ ์–ธ๋˜์–ด ์žˆ๋Š” ๊ฒƒ์„ ๋ณผ ์ˆ˜ ์žˆ๋‹ค.
find_all์„ ์‚ฌ์šฉํ•˜์—ฌ ์ด๊ฒƒ๋“ค์„ ๊ฐ€์ ธ์˜จ๋‹ค.

๐Ÿ”ผ ์œ„์˜ ์บก์ณ๋ณธ์„ ํ™•์ธํ•ด๋ณด๋ฉด ์–ธ๋ก ์‚ฌ๋ช…์ด
<strong class="rankingnews_name">์•„์‹œ์•„๊ฒฝ์ œ</strong>
๋กœ ์„ ์–ธ๋˜์–ด ์žˆ๋Š” ๊ฒƒ์„ ๋ณผ ์ˆ˜ ์žˆ๋‹ค.

# 6) Extract the detailed ranking news information
for index_div in range(0, len(div)):
  # 6-1) Extract the press name
  strong = div[index_div].find('strong', {'class': 'rankingnews_name'})
  press = strong.text

🔼 So we use find on 'strong' with the class name rankingnews_name to extract it, then store the result in a variable called press.
(โœ๏ธ find์™€ find_all์˜ ์ฐจ์ด๋Š”?

🔼 Looking above, you can see the ranking news items are listed one by one as li tags inside a ul tag declared with class="rankingnews_list".

  # 6-2) Extract the ranking news details in this box
  ul = div[index_div].find_all('ul', {'class': 'rankingnews_list'})
  for index_r in range(0, len(ul)):
    li = ul[index_r].find_all('li')
    for index_l in range(0, len(li)):
      try:  # skip items that are missing one of the fields
        # rank
        rank = li[index_l].find('em', {'class': 'list_ranking_num'}).text
        # news title
        title = li[index_l].find('a').text
        # news link
        link = li[index_l].find('a').attrs['href']
        # 7) store the row in the DataFrame (append)
        # (DataFrame.append still works on the pandas available in 2022, but it was removed in pandas 2.0)
        data = data.append({'언론사명': press,
                            '순위': rank,
                            '기사제목': title,
                            '기사링크': link,
                            '수집일자': datetime.datetime.now(timezone('Asia/Seoul')).strftime('%Y-%m-%d %H:%M:%S')}, ignore_index=True)
        print('Completed ' + rank + ' : ', title)
      except Exception:
        pass

print('----------------------------------------')
print(data)

🔼 So a for loop is used to go through the li tags inside each ul tag one by one.
Then, with another for loop and a try block, the rank, news title, and news link from each li tag are appended to the DataFrame.
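One note: DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0, so the loop above fails on newer pandas. A minimal alternative sketch (my own, reusing the same column names): collect each row as a dict in a plain list and build the DataFrame once after the loops.

rows = []   # collected instead of calling data.append inside the loop

# ... inside the innermost loop, in place of the data.append(...) call:
rows.append({'언론사명': press, '순위': rank, '기사제목': title, '기사링크': link,
             '수집일자': datetime.datetime.now(timezone('Asia/Seoul')).strftime('%Y-%m-%d %H:%M:%S')})

# ... after all loops have finished, build the DataFrame in one go:
data = pd.DataFrame(rows, columns=['언론사명', '순위', '기사제목', '기사링크', '수집일자'])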

๐Ÿ”ผ ์ด๋ฅผ ๋ณด๋ฉด ์ˆœ์œ„๋Š” <em class="list_ranking_num">1</em>,
๊ธฐ์‚ฌ ์ œ๋ชฉ๊ณผ ๋งํฌ๋Š”
<a href="https://n.news.naver.com/article/052/0001788082?ntype=RANKING" class="list_title nclicks('RBP.rnknws')">ํœด๊ฐ€์ฒ ยท๋ช…์ ˆ ๋‘๋ ค์šด ๋ฐ˜๋ ค๊ฒฌ๋“ค..."์ถ”์„์€ ๊ฐ€์กฑ๊ณผ ํ•จ๊ป˜ํ•˜๊ณ  ์‹ถ์–ด์š”"</a>
(a ํƒœ๊ทธ ์•ˆ์— ๊ฐ™์ด ์žˆ์ง€๋งŒ ๋งํฌ๋Š” href๋กœ ์„ ์–ธ๋˜์–ด์žˆ์–ด์„œ attrs['href']๋กœ ์ถ”์ถœํ•˜๊ณ  ์ œ๋ชฉ์€ ํ…์ŠคํŠธ๋ฏ€๋กœ .text๋กœ ์ถ”์ถœ)
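A minimal illustration of those two extraction methods on a throwaway tag (my own sketch; the URL is just an example):

from bs4 import BeautifulSoup

a_tag = BeautifulSoup('<a href="https://example.com/1">기사 제목</a>', 'html.parser').find('a')
print(a_tag.text)             # 기사 제목              -> the visible text
print(a_tag.attrs['href'])    # https://example.com/1  -> the value of the href attribute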

The collection date uses the libraries imported at the top to convert the current time to Korean time (Asia/Seoul) before formatting it for output.
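For reference, the timestamp part on its own (same pytz timezone and strftime call as in the loop; the printed value is only an example):

import datetime
from pytz import timezone

now_kst = datetime.datetime.now(timezone('Asia/Seoul'))
print(now_kst.strftime('%Y-%m-%d %H:%M:%S'))   # e.g. '2022-09-01 14:30:00'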


data.to_csv('네이버랭킹뉴스_많이본뉴스_크롤링_20220901.csv', encoding='utf-8-sig', index=False)

๐Ÿ”ผ ํฌ๋กค๋งํ•œ ๋ฐ์ดํ„ฐ๋Š” ์ €์žฅ!!


day_df = pd.read_csv('/content/네이버랭킹뉴스_많이본뉴스_크롤링_20220901.csv')

🔼 Then load it back in as a DataFrame.


day_df['기사제목'] = day_df['기사제목'].replace(r'[^\w]', ' ', regex=True)

🔼 Next, preprocess the article titles to strip out the special characters.
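Because \w matches Unicode word characters in Python 3, the Korean text and digits survive this step and only punctuation-like characters turn into spaces. A quick check on a made-up title (my own example):

import pandas as pd

sample = pd.Series(['[속보] "추석 연휴" 시작…'])
print(sample.replace(r'[^\w]', ' ', regex=True)[0])   # brackets, quotes and the ellipsis become spaces; the words stay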


import matplotlib.pyplot as plt

from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

🔼 Import the libraries for the word cloud.
On Colab you also have to run the code that keeps Korean text from breaking (a sketch follows below)!!
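That setup isn't shown in the post, so here is a minimal sketch of the usual Colab recipe (assumed, not from the original; the font path is where the fonts-nanum package installs NanumBarunGothic, and fontpath is reused by the WordCloud call below):

# Install a Korean font and register it with matplotlib (Colab; a runtime restart may be needed afterwards)
!sudo apt-get -qq -y install fonts-nanum

import matplotlib.pyplot as plt
import matplotlib.font_manager as fm

fontpath = '/usr/share/fonts/truetype/nanum/NanumBarunGothic.ttf'   # assumed path installed by fonts-nanum
fm.fontManager.addfont(fontpath)              # make the font visible to matplotlib
plt.rc('font', family='NanumBarunGothic')     # use it as the default plot font
plt.rc('axes', unicode_minus=False)           # avoid broken minus signs with a non-default font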

# The WordCloud library needs its input as a single string
# Preprocess the 391 article titles into one text

day_text = " ".join(li for li in day_df.기사제목.astype(str))
day_text

🔼 Data preprocessing for the word cloud

plt.subplots(figsize=(25,15))
# font_path must point to a Korean font; fontpath here is the one set up in the Colab font step above
wordcloud = WordCloud(background_color='white', width=1000, height=700, font_path=fontpath).generate(day_text)
plt.axis('off')
plt.imshow(wordcloud, interpolation='bilinear')
plt.show()

🔼 Run the word cloud and...

๐Ÿ”ผ ์ด๋Ÿฐ ๊ฒฐ๊ณผ๊ฐ€ ๋‚˜์˜จ๋‹ค. ์ด๋Š” 9์›” 12์ผ์ž ๊ธฐ์ค€์œผ๋กœ ํฌ๋กค๋ง๋œ ๋ฐ์ดํ„ฐ๋ฅผ ๊ฐ€์ง€๊ณ  ํ•œ ์›Œ๋“œ ํด๋ผ์šฐ๋“œ์ด๋‹ค.


✅ Most-commented news

For the most-commented news, you only need to change the link; the rest of the code is the same as above (see the reusable function sketch below).
(Link: https://news.naver.com/main/ranking/popularMemo.naver)

data.to_csv('네이버랭킹뉴스_댓글많은뉴스_크롤링_20220901.csv', encoding='utf-8-sig', index=False)
memo_df = pd.read_csv('/content/네이버랭킹뉴스_댓글많은뉴스_크롤링_20220901.csv')

๐Ÿ”ผ ๋ฐ์ดํ„ฐ ์ €์žฅ ํ›„ ๋ฐ์ดํ„ฐ๋ฅผ ๋ถˆ๋Ÿฌ์˜จ ๋‹ค์Œ

import re

def clean_text(inputString):
  text_rmv = re.sub(r'[-=+,#/\?:^.@*\"※~ㆍ!、‘|\(\)\[\]`\'…》\”\“\’·]', ' ', inputString)
  return text_rmv

# Preprocess while joining the titles into a single text
memo_text = " ".join(clean_text(li) for li in memo_df.기사제목.astype(str))  # df.기사제목 is the same as df['기사제목']

๋งŽ์ด ๋ณธ ๋‰ด์Šค์™€ ๋‹ค๋ฅธ ๋ฐฉ์‹์œผ๋กœ ๋ฐ์ดํ„ฐ ์ „์ฒ˜๋ฆฌ. re๋ผ๋Š” ๊ฒƒ์„ ์ž„ํฌํŠธ ํ•œ ํ›„ ํŠน์ˆ˜๊ธฐํ˜ธ๋ฅผ ์‚ญ์ œํ•˜๋Š” ํ•จ์ˆ˜๋ฅผ ์„ ์–ธํ•œ๋‹ค. ๊ทธํ›„ join์„ ์ด์šฉํ•ด ํ…์ŠคํŠธ๋ฅผ ๋ถ™์ผ ๋•Œ ๋ฐ์ดํ„ฐ ์ „์ฒ˜๋ฆฌ๊นŒ์ง€ ํ•œ๋ฒˆ์— ํ•˜๋Š” ๋ฐฉ์‹.

Then run the word cloud (the code is the same as above) and...


๐Ÿ”ผ ์ด๋Ÿฐ ๊ฒฐ๊ณผ๊ฐ€ ๋‚˜์˜จ๋‹ค(์ด ์—ญ์‹œ 9์›” 12์ผ์ž ๋žญํ‚น๋‰ด์Šค ๊ธฐ์ค€ ์‹œ๊ฐํ™”)
