📌 Nationwide Medical Institution Data Analysis


✅ Import libraries and prepare the data

import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px

df = pd.read_csv('/content/data.csv', encoding='EUC-KR')

# Check for NaN values
df.isna().sum()  # -> NaN values appear in: 소재지전화, 의료기관종별명, 의료인수, 입원실수, 병상수, 진료과목내용명

# Check the values in each column -> df.column_name.unique()
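As a concrete version of these two checks, a small sketch (상세영업상태명 is one of the dataset's actual columns and is used again below):

# Columns that contain missing values and how many rows are affected
na_counts = df.isna().sum()
print(na_counts[na_counts > 0])

# Distinct values of a single column, e.g. the detailed business status
print(df['상세영업상태명'].unique())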

✅ Analyze the data

# Number of medical institutions by detailed business status
df['상세영업상태명'].value_counts()

gb_df = df.groupby(by=['상세영업상태명']).size().reset_index(name='의료기관수')

🔼 Printing gb_df shows the result as a DataFrame.
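For reference, the groupby(...).size().reset_index(name='의료기관수') pattern guarantees that gb_df has exactly two columns; a quick way to confirm the structure (a sketch, no output values shown):

print(gb_df.columns.tolist())   # ['상세영업상태명', '의료기관수']
print(len(gb_df))               # one row per business status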


# Number of medical institutions by business status
px.histogram(gb_df, x='상세영업상태명', y='의료기관수')

🔼 Result


# Pie chart
px.pie(gb_df, names='상세영업상태명', values='의료기관수')

🔼 Result


# Number of medical institutions by region
df['지역명'].value_counts()
lgb_df = df.groupby(by=['지역명', '도시명', '상세영업상태명', '의료기관종별명']).size().reset_index(name='의료기관수')

Printing this variable likewise confirms that the result is a DataFrame.


# Number of medical institutions by region
px.histogram(lgb_df, x='지역명', y='의료기관수')

🔼 Result


# Number of medical institutions by business status within each region
px.histogram(lgb_df, x='지역명', y='의료기관수', color='상세영업상태명')

🔼 Result


# Number of medical institutions by city/district within each region
px.histogram(lgb_df, x='지역명', y='의료기관수', color='도시명')

🔼 Result


# Number of medical institutions by institution type within each region
px.histogram(lgb_df, x='지역명', y='의료기관수', color='의료기관종별명')

🔼 Result


# Pie chart of the number of medical institutions by region
px.pie(lgb_df, names='지역명', values='의료기관수')

🔼 Result


# ์˜๋ฃŒ๊ธฐ๊ด€์ข…๋ณ„ ์˜๋ฃŒ๊ธฐ๊ด€์ˆ˜ ํ˜„ํ™ฉ
px.histogram(lgb_df, x='์˜๋ฃŒ๊ธฐ๊ด€์ข…๋ณ„๋ช…', y='์˜๋ฃŒ๊ธฐ๊ด€์ˆ˜')

๐Ÿ”ผ ๊ฒฐ๊ณผ


# ์˜๋ฃŒ๊ธฐ๊ด€์ข…๋ณ„ ์˜๋ฃŒ๊ธฐ๊ด€์ˆ˜ ํ™•์ธํ•˜๋Š” ํŒŒ์ด ์ฐจํŠธ
px.pie(lgb_df, names='์˜๋ฃŒ๊ธฐ๊ด€์ข…๋ณ„๋ช…', values='์˜๋ฃŒ๊ธฐ๊ด€์ˆ˜')

๐Ÿ”ผ ๊ฒฐ๊ณผ


✅ Word cloud visualization

from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

# Join the treatment-department names into a single text string
text = " ".join(cont for cont in df.진료과목내용명.astype(str))

🔼 Preprocessing step for the word cloud visualization
(the word cloud is built from the 진료과목내용명 column)
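The plotting code below references a fontpath variable that is never defined in this post; the word cloud needs a Korean-capable font. A minimal sketch of one way to supply it, assuming a Colab/Ubuntu environment where the Nanum fonts can be installed via apt (the exact .ttf path is an assumption about that package's layout):

# Install Korean fonts (run once); assumes an apt-based environment such as Colab
!apt-get install -y fonts-nanum

# Assumed install location of the Nanum fonts on Ubuntu/Colab
fontpath = '/usr/share/fonts/truetype/nanum/NanumBarunGothic.ttf'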


plt.subplots(figsize=(25, 15))

# font_path must point to a Korean-capable font file (see the snippet above)
wordcloud = WordCloud(background_color='black', width=1000, height=700, font_path=fontpath).generate(text)

plt.axis('off')
plt.imshow(wordcloud, interpolation='bilinear')
plt.show()

Running this produces the word cloud.


✅ Find the medical institutions closest to my location

✏️ Only hospitals that are currently open (영업중) should be returned
✏️ My location is entered as a road-name address; extract only the 5 nearest medical institutions and show them on a map
✏️ Show a marker for my location and a marker for each hospital; clicking a hospital marker should display the hospital name

✔️ Import libraries and prepare the data

# 1) Import libraries
import folium
import pandas as pd

# 2) Load the file
df = pd.read_csv('/content/drive/MyDrive/data.csv', encoding='EUC-KR')

# Check for NaN values
# df.isna().sum()

✔️ Enter my address, then extract and save only the 5 nearest hospitals

# 3) Prepare a function that converts an address into coordinates
from geopy.geocoders import Nominatim

def geocoding(address):
  geolocoder = Nominatim(user_agent='South Korea', timeout=None)
  geo = geolocoder.geocode(address)
  crd = {"lat": float(geo.latitude), "lng": float(geo.longitude)}

  return crd

# 4) Ask the user for an address
address = input("Enter your address: ")
crd = geocoding(address)

# 5) Store my coordinates as a tuple; this is the reference point for the distance calculations
from geopy.distance import geodesic

myhome = (crd['lat'], crd['lng'])

# 6) Compute the distance from my location to each hospital and collect the rows in a DataFrame
hpt = pd.DataFrame(columns=['사업장명', '도시명', '의료기관종별명', '상세영업상태명', '위도', '경도', '거리'])

# Keep only rows whose 지역명 contains the first word of the address (e.g. the province/city)
adr_s = address.split(' ')[0]
df = df.loc[df.지역명.str.contains(adr_s)]

for n in df.index:
  hpt_loc = (df.loc[n, '위도'], df.loc[n, '경도'])  # (lat, lng) tuple for this hospital

  # Add one row per hospital, including its distance from my location in km
  hpt.loc[n] = [df.loc[n, '사업장명'],
                df.loc[n, '도시명'],
                df.loc[n, '의료기관종별명'],
                df.loc[n, '상세영업상태명'],
                df.loc[n, '위도'], df.loc[n, '경도'],
                geodesic(myhome, hpt_loc).kilometers]

# 7) Pick the 5 open hospitals closest to my location
my_hpt = hpt.loc[hpt['상세영업상태명'] == '영업중']
my_hpt = my_hpt.sort_values(by=['거리']).head(5)

✔️ Map visualization

# 8) Build the map
my_map = folium.Map(location=[crd['lat'], crd['lng']], zoom_start=14)

# One marker per nearby hospital; the popup shows the hospital name
for n in my_hpt.index:
  folium.Marker([my_hpt.loc[n, '위도'], my_hpt.loc[n, '경도']],
                popup='<pre>'+my_hpt.loc[n, '사업장명']+'</pre>',
                icon=folium.Icon(icon='hospital-o', prefix='fa')).add_to(my_map)

# My location marker, added once with a red home icon
folium.Marker([crd['lat'], crd['lng']], icon=folium.Icon(color='red', icon='glyphicon glyphicon-home')).add_to(my_map)

my_map

🔼 Map visualization result



📌 Crawling Naver entertainment news ranked by reader reactions (공감)

# This part only needs to be run once
!pip install selenium
!apt-get update
!apt install chromium-chromedriver
!cp /usr/lib/chromium-browser/chromedriver /usr/bin

Installation steps for using the Selenium library.
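One caveat (my note, not in the original post): the webdriver.Chrome('chromedriver', options=options) call used further below follows the Selenium 3 API. If pip installs Selenium 4 or later, the positional driver path is no longer accepted; a minimal sketch of the Selenium 4 style, assuming the chromedriver binary copied above lives at /usr/bin/chromedriver:

from selenium import webdriver
from selenium.webdriver.chrome.service import Service

options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')

# Selenium 4: the driver path goes through a Service object
driver = webdriver.Chrome(service=Service('/usr/bin/chromedriver'), options=options)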

# 1) Import libraries
from selenium import webdriver
from bs4 import BeautifulSoup

import re
import time
from pytz import timezone
import datetime

import pandas as pd

import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator


# 2) Create an empty DataFrame to hold the crawled articles
data = pd.DataFrame(columns=['순위', '공감종류', '기사제목', '기사링크', '기사내용', '공감수', '수집일자'])


options = webdriver.ChromeOptions()
options.add_argument('--headless')   # headless -> run without opening a browser window
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage') # do not use the /dev/shm directory
driver = webdriver.Chrome('chromedriver', options=options)

url_list = ['https://entertain.naver.com/ranking/sympathy/love',
            'https://entertain.naver.com/ranking/sympathy/cheer',
            'https://entertain.naver.com/ranking/sympathy/congrats',
            'https://entertain.naver.com/ranking/sympathy/expect',
            'https://entertain.naver.com/ranking/sympathy/surprise',
            'https://entertain.naver.com/ranking/sympathy/sad']



for i in range(len(url_list)):
  driver.get(url_list[i])

  driver.implicitly_wait(3)

  time.sleep(1.5)

  driver.execute_script('window.scrollTo(0,800)')
  time.sleep(3)

  html_source = driver.page_source
  soup = BeautifulSoup(html_source, 'html.parser')

  li = soup.select('li._inc_news_lst3_rank_reply') #ul.news_lst news_lst3 count_info > li

  # Reaction type, taken from the last path segment of the URL
  sym = url_list[i].split('.')[2].split('/')[3]

  for index_l in range(0, len(li)):
    try:
      # Rank
      rank = li[index_l].find('em', {'class': 'blind'}).text.replace('\n', '').replace('\t', '').strip()

      # Article title
      title = li[index_l].find('a', {'class': 'tit'}).text.replace('\n', '').replace('\t', '').strip()

      # Article summary
      summary = li[index_l].find('p', {'class': 'summary'}).text.replace('\n', '').replace('\t', '').strip()

      # Article link
      link = li[index_l].find('a').attrs['href']

      # Reaction count (note: the class name hard-codes 'cheer')
      sym_s = li[index_l].find('a', {'class', 'likeitnews_item_likeit cheer'}).text.replace('\n','').replace('\t','').strip().split('수')[1]

      # Save the row into the DataFrame (append)
      data = data.append({'순위' : rank,
                          '공감종류' : sym,
                          '기사제목' : title,
                          '기사링크' : 'http://entertain.naver.com' + link,
                          '기사내용' : summary,
                          '공감수' : sym_s,
                          '수집일자' : datetime.datetime.now(timezone('Asia/Seoul')).strftime('%Y-%m-%d %H:%M:%S')}, ignore_index=True)

    except:
      # Items that fail to parse are silently skipped
      pass

    print('Completed ' + rank + ' : ' + title)

print('---------------------------------')
print(data)

That is how I wrote the code, but it turns out it doesn't work as intended!
print('Completed ' + rank + ' : ' + title)
This line prints fine for every ranking page, but something seems to go wrong around the append step: only the cheer articles end up in the DataFrame.
So for now I'm just giving up 👍
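A plausible cause, though I have not verified it against the live page markup: the reaction-count lookup hard-codes the cheer class (find('a', {'class', 'likeitnews_item_likeit cheer'})), so on the other ranking pages it returns None, the resulting AttributeError is swallowed by the bare except, and the row is never appended. Below is a minimal sketch of a fix that matches the count element by its shared likeitnews_item_likeit class instead (an assumption about the markup) and avoids DataFrame.append, which was removed in pandas 2.x; it reuses url_list, driver, and the imports from above.

rows = []   # collect dicts, then build the DataFrame once at the end

for url in url_list:
  driver.get(url)
  time.sleep(3)
  soup = BeautifulSoup(driver.page_source, 'html.parser')

  sym = url.rstrip('/').split('/')[-1]            # reaction type = last URL path segment
  li = soup.select('li._inc_news_lst3_rank_reply')

  for item in li:
    try:
      rank = item.find('em', {'class': 'blind'}).text.strip()
      title = item.find('a', {'class': 'tit'}).text.strip()
      summary = item.find('p', {'class': 'summary'}).text.strip()
      link = item.find('a').attrs['href']

      # Assumption: the count element always carries the shared class,
      # with the reaction type ('cheer', 'love', ...) as a second class
      likeit = item.select_one('a.likeitnews_item_likeit')
      sym_s = likeit.text.replace('\n', '').replace('\t', '').strip().split('수')[1]

      rows.append({'순위': rank, '공감종류': sym, '기사제목': title,
                   '기사링크': 'http://entertain.naver.com' + link,
                   '기사내용': summary, '공감수': sym_s,
                   '수집일자': datetime.datetime.now(timezone('Asia/Seoul')).strftime('%Y-%m-%d %H:%M:%S')})
    except AttributeError:
      # Skip items that are missing one of the elements, but keep processing the page
      pass

data = pd.DataFrame(rows, columns=['순위', '공감종류', '기사제목', '기사링크', '기사내용', '공감수', '수집일자'])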
