import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
df = pd.read_csv('/content/data.csv', encoding='EUC-KR')
# Check for NaN values
df.isna().sum()  # columns with NaN: 소재지전화, 의료기관종별명, 의료인수, 입원실수, 병상수, 진료과목내용명
# Check the values of each column -> df.컬럼명.unique()
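Whether those NaNs matter depends on the step. For the word cloud further down, .astype(str) would quietly turn NaN in 진료과목내용명 into the literal string 'nan', so one hedge is to drop those rows up front:
df = df.dropna(subset=['진료과목내용명'])  # avoid 'nan' tokens in the word cloud later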
# Number of medical institutions per 상세영업상태명 (detailed business status)
df['상세영업상태명'].value_counts()
gb_df = df.groupby(by=['상세영업상태명']).size().reset_index(name='의료기관수')
🖼️ Printing gb_df shows the result as a DataFrame.
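One optional touch: sorting the aggregated frame first makes the bar chart below read from most to least common, since plotly keeps the row order of the data for categorical axes by default:
gb_df = gb_df.sort_values(by='의료기관수', ascending=False)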
# Number of medical institutions by business status
px.histogram(gb_df, x='상세영업상태명', y='의료기관수')
🖼️ Result
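Since gb_df is already aggregated, px.bar draws the same picture without plotly re-summing anything; either call works here:
px.bar(gb_df, x='상세영업상태명', y='의료기관수')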
# Pie chart
px.pie(gb_df, names='상세영업상태명', values='의료기관수')
🖼️ Result
# Number of medical institutions per 지역명 (region)
df['지역명'].value_counts()
lgb_df = df.groupby(by=['지역명', '도시명', '상세영업상태명', '의료기관종별명']).size().reset_index(name='의료기관수')
Printing this variable likewise confirms that it, too, is a DataFrame.
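Since the map section below keeps only institutions whose 상세영업상태명 is '영업중', the same filter can be applied here too if only open institutions are of interest (open_df is a hypothetical name):
open_df = lgb_df[lgb_df['상세영업상태명'] == '영업중']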
# Number of medical institutions per region
px.histogram(lgb_df, x='지역명', y='의료기관수')
🖼️ Result
# Number of medical institutions per region, broken down by business status
px.histogram(lgb_df, x='지역명', y='의료기관수', color='상세영업상태명')
🖼️ Result
# Number of medical institutions per region, broken down by city (시군구)
px.histogram(lgb_df, x='지역명', y='의료기관수', color='도시명')
🖼️ Result
# Number of medical institutions per region, broken down by institution type
px.histogram(lgb_df, x='지역명', y='의료기관수', color='의료기관종별명')
🖼️ Result
# Pie chart of medical institutions per region
px.pie(lgb_df, names='지역명', values='의료기관수')
🖼️ Result
# Number of medical institutions per institution type
px.histogram(lgb_df, x='의료기관종별명', y='의료기관수')
🖼️ Result
# Pie chart of medical institutions per institution type
px.pie(lgb_df, names='의료기관종별명', values='의료기관수')
🖼️ Result
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
# Join the 진료과목내용명 column into a single text string
text = " ".join(cont for cont in df.진료과목내용명.astype(str))
🖼️ Preprocessing for the word cloud (visualizing 진료과목내용명 as a word cloud)
# fontpath must point to a font that supports Hangul; on Colab, for example (an assumption, adjust to your environment):
# fontpath = '/usr/share/fonts/truetype/nanum/NanumBarunGothic.ttf'
plt.subplots(figsize=(25, 15))
wordcloud = WordCloud(background_color='black', width=1000, height=700, font_path=fontpath).generate(text)
plt.axis('off')
plt.imshow(wordcloud, interpolation='bilinear')
plt.show()
This produces the word cloud shown above.
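An alternative preprocessing sketch, assuming the entries in 진료과목내용명 are comma-separated department lists (worth checking against the actual data): counting the departments explicitly and feeding the counts to generate_from_frequencies keeps multi-word department names intact instead of letting WordCloud split them on whitespace.
from collections import Counter
# one token per department name, NaN rows dropped up front
tokens = [t.strip() for cell in df['진료과목내용명'].dropna().astype(str) for t in cell.split(',')]
freq = Counter(tokens)
wordcloud = WordCloud(background_color='black', width=1000, height=700, font_path=fontpath).generate_from_frequencies(freq)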
✔️ Only hospitals that are currently open (영업중) should be included
✔️ Take my location as a road-name address (도로명 주소), extract only the 5 nearest medical institutions, and visualize them on a map
✔️ Show a marker for my location and markers for the hospitals; clicking a hospital marker should show the hospital name
# 1) Import libraries
import folium
import pandas as pd
# 2) Load the file
df = pd.read_csv('/content/drive/MyDrive/data.csv', encoding='EUC-KR')
# Check for NaN values
# df.isna().sum()
# 3) Prepare a function that converts an address to coordinates
from geopy.geocoders import Nominatim
def geocoding(address):
    geolocoder = Nominatim(user_agent='South Korea', timeout=None)
    geo = geolocoder.geocode(address)
    if geo is None:  # Nominatim returns None for addresses it cannot resolve
        raise ValueError('Address not found: ' + address)
    crd = {"lat": float(geo.latitude), "lng": float(geo.longitude)}
    return crd
# 4) Ask the user for an address
address = input("Enter your address: ")
crd = geocoding(address)
# 5) Store the converted coordinates in a tuple
from geopy.distance import geodesic
myhome = (crd['lat'], crd['lng'])
# 6) Compute the distance to each hospital and collect the results in a dataframe
hpt = pd.DataFrame(columns=['사업장명', '도시명', '의료기관종별명', '상세영업상태명', '위도', '경도', '거리'])
adr_s = address.split(' ')[0]  # first token of the address, i.e. the province/city part
df = df.loc[df.지역명.str.contains(adr_s)]  # keep only hospitals in my region
for n in df.index:
    hpt_loc = (df.loc[n, '위도'], df.loc[n, '경도'])  # hospital coordinates as a tuple
    # append the row to the hpt dataframe
    hpt.loc[n] = [df.loc[n, '사업장명'],
                  df.loc[n, '도시명'],
                  df.loc[n, '의료기관종별명'],
                  df.loc[n, '상세영업상태명'],
                  df.loc[n, '위도'], df.loc[n, '경도'],
                  geodesic(myhome, hpt_loc).kilometers]
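As an aside, the same table can be built without the explicit loop, using a column selection plus apply; a sketch under the assumption that 위도/경도 contain no NaN (hpt_alt is a hypothetical name):
cols = ['사업장명', '도시명', '의료기관종별명', '상세영업상태명', '위도', '경도']
hpt_alt = df[cols].copy()
# geodesic distance from my location to each hospital, in kilometers
hpt_alt['거리'] = hpt_alt.apply(lambda r: geodesic(myhome, (r['위도'], r['경도'])).kilometers, axis=1)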
# 7) Pick the 5 nearest hospitals that are currently open (영업중)
my_hpt = hpt.loc[hpt['상세영업상태명'] == '영업중']
my_hpt = my_hpt.sort_values(by=['거리']).head(5)
# 8) Prepare the map
my_map = folium.Map(location=[crd['lat'], crd['lng']], zoom_start=14)
for n in my_hpt.index:
    folium.Marker([my_hpt.loc[n, '위도'], my_hpt.loc[n, '경도']],
                  popup='<pre>' + my_hpt.loc[n, '사업장명'] + '</pre>',
                  icon=folium.Icon(icon='hospital-o', prefix='fa')).add_to(my_map)
folium.Marker([crd['lat'], crd['lng']], icon=folium.Icon(color='red', icon='glyphicon glyphicon-home')).add_to(my_map)
my_map
🖼️ Map visualization result
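If the map doesn't render inline in the notebook, folium can also write it out as a standalone HTML file:
my_map.save('my_map.html')  # open the file in a browser to view the map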
# This part only needs to be run once
!pip install selenium
!apt-get update
!apt install chromium-chromedriver
!cp /usr/lib/chromium-browser/chromedriver /usr/bin
Installation required to use the Selenium library.
# 1) Import libraries
from selenium import webdriver
from bs4 import BeautifulSoup
import re
import time
from pytz import timezone
import datetime
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
# 2) Create the dataframe to collect results
data = pd.DataFrame(columns=['순위', '공감종류', '기사제목', '기사링크', '기사내용', '공감수', '수집일시'])
options = webdriver.ChromeOptions()
options.add_argument('--headless')  # headless -> run Chrome without opening a window
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')  # do not use the /dev/shm directory
driver = webdriver.Chrome('chromedriver', options=options)
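One caveat: webdriver.Chrome('chromedriver', options=options) is the old Selenium 3 call signature. If the pip install above pulls in Selenium 4, the driver path goes through a Service object instead:
from selenium.webdriver.chrome.service import Service
driver = webdriver.Chrome(service=Service('/usr/bin/chromedriver'), options=options)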
url_list = ['https://entertain.naver.com/ranking/sympathy/love',
'https://entertain.naver.com/ranking/sympathy/cheer',
'https://entertain.naver.com/ranking/sympathy/congrats',
'https://entertain.naver.com/ranking/sympathy/expect',
'https://entertain.naver.com/ranking/sympathy/surprise',
'https://entertain.naver.com/ranking/sympathy/sad']
for i in range(len(url_list)):
    driver.get(url_list[i])
    driver.implicitly_wait(3)
    time.sleep(1.5)
    driver.execute_script('window.scrollTo(0,800)')
    time.sleep(3)
    html_source = driver.page_source
    soup = BeautifulSoup(html_source, 'html.parser')
    li = soup.select('li._inc_news_lst3_rank_reply')  # ul.news_lst news_lst3 count_info > li
    # sympathy category, taken from the URL ('love', 'cheer', ...)
    sym = url_list[i].split('.')[2].split('/')[3]
    for index_l in range(0, len(li)):
        try:
            # rank
            rank = li[index_l].find('em', {'class': 'blind'}).text.replace('\n', '').replace('\t', '').strip()
            # article title
            title = li[index_l].find('a', {'class': 'tit'}).text.replace('\n', '').replace('\t', '').strip()
            # article summary
            summary = li[index_l].find('p', {'class': 'summary'}).text.replace('\n', '').replace('\t', '').strip()
            # article link
            link = li[index_l].find('a').attrs['href']
            # sympathy count: keep the number after '공감수'
            sym_s = li[index_l].find('a', {'class': 'likeitnews_item_likeit cheer'}).text.replace('\n', '').replace('\t', '').strip().split('수')[1]
            # save into the dataframe (append; note: DataFrame.append was removed in pandas 2.0, pd.concat is the modern equivalent)
            data = data.append({'순위': rank,
                                '공감종류': sym,
                                '기사제목': title,
                                '기사링크': 'http://entertain.naver.com' + link,
                                '기사내용': summary,
                                '공감수': sym_s,
                                '수집일시': datetime.datetime.now(timezone('Asia/Seoul')).strftime('%Y-%m-%d %H:%M:%S')}, ignore_index=True)
        except:
            pass
        print('Complete ' + rank + ' : ' + title)
    print('---------------------------------')
print(data)
I wrote the code like this, but it turns out it doesn't actually work!!!!!
print('Complete ' + rank + ' : ' + title)
This line prints fine for every article, so the problem seemed to be somewhere in the data append step..
Only the cheer articles end up saved in the data frame.
So for now I'm just giving up 😂
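For what it's worth, the selector looks like a more likely culprit than append itself: the sympathy-count anchor is searched with the hard-coded class 'likeitnews_item_likeit cheer'. That second class presumably changes with the category (love, sad, ...), so on every page except cheer, find() returns None, the .text access raises AttributeError, and the bare except: pass swallows it; that is exactly the "only cheer gets saved" symptom. A minimal sketch of a fix, assuming 'likeitnews_item_likeit' is the stable class present on all six pages:
# BeautifulSoup matches a multi-valued class attribute one class at a time,
# so matching only the stable class should work for every category
sym_tag = li[index_l].find('a', {'class': 'likeitnews_item_likeit'})
sym_s = sym_tag.text.replace('\n', '').replace('\t', '').strip().split('수')[1]
Narrowing the bare except to except AttributeError and printing the failure would also have made this much easier to spot.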