import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
df = pd.read_csv('/content/data.csv', encoding='EUC-KR')
# Check for NaN values
df.isna().sum() -> NaN appears in 소재지전화, 의료기관종별명, 의료인수, 입원실수, 병상수, 진료과목내용명
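As a quick follow-up (not in the original notes), the share of missing values per column can make the isna() output easier to read:
# Hedged extra step: fraction of missing values per column, largest first
df.isna().mean().sort_values(ascending=False)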
# Check the unique values of each column -> df.컬럼명.unique()
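For example, applied to one of the columns above (a hedged illustration, assuming that column name):
# Distinct business-status values
df['상세영업상태명'].unique()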
# Number of medical institutions per business status (상세영업상태명)
df['상세영업상태명'].value_counts()
gb_df = df.groupby(by=['상세영업상태명']).size().reset_index(name='의료기관수')
🔼 Typing gb_df and printing it shows the result in DataFrame form like this.
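As an aside (not in the original), an equivalent table (row order aside) can also be built from the value_counts() call shown earlier; a minimal sketch assuming the column names used here:
# Equivalent to the groupby().size() above
gb_df = (df['상세영업상태명'].value_counts()
           .rename_axis('상세영업상태명')
           .reset_index(name='의료기관수'))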
# Number of medical institutions per business status
px.histogram(gb_df, x='상세영업상태명', y='의료기관수')  # with y given, px.histogram sums y per x category
🔼 Result
# Pie chart
px.pie(gb_df, names='상세영업상태명', values='의료기관수')
🔼 Result
# Number of medical institutions per region (지역명)
df['지역명'].value_counts()
lgb_df = df.groupby(by=['지역명', '도시명', '상세영업상태명', '의료기관종별명']).size().reset_index(name='의료기관수')
Here too, typing the variable name and printing it confirms that the result is a DataFrame.
# Number of medical institutions by region
px.histogram(lgb_df, x='지역명', y='의료기관수')
🔼 Result
# Number of medical institutions by business status within each region
px.histogram(lgb_df, x='지역명', y='의료기관수', color='상세영업상태명')
🔼 Result
# Number of medical institutions by city/district within each region
px.histogram(lgb_df, x='지역명', y='의료기관수', color='도시명')
🔼 Result
# Number of medical institutions by institution type within each region
px.histogram(lgb_df, x='지역명', y='의료기관수', color='의료기관종별명')
🔼 Result
# Pie chart of medical institutions by region
px.pie(lgb_df, names='지역명', values='의료기관수')
🔼 Result
# Number of medical institutions by institution type
px.histogram(lgb_df, x='의료기관종별명', y='의료기관수')
🔼 Result
# Pie chart of medical institutions by institution type
px.pie(lgb_df, names='의료기관종별명', values='의료기관수')
🔼 Result
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
# Join the 진료과목내용명 column into a single text string
text = " ".join(cont for cont in df.진료과목내용명.astype(str))
🔼 Preprocessing step for the word cloud visualization (a word cloud built from 진료과목내용명)
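The WordCloud call below uses a variable fontpath that is not defined in this snippet; it must point to a font that can render Korean, or the words show up as boxes. One common setup in Colab (an assumption, not part of the original notes) is to install the Nanum fonts and point fontpath at one of them:
# Assumed setup: install a Korean font and use it for the word cloud
!apt-get install -y fonts-nanum > /dev/null
fontpath = '/usr/share/fonts/truetype/nanum/NanumGothic.ttf'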
plt.subplots(figsize=(25, 15))
wordcloud = WordCloud(background_color='black', width=1000, height=700, font_path=fontpath).generate(text)
plt.axis('off')
plt.imshow(wordcloud, interpolation='bilinear')
plt.show()
This produces a result like the image above.
✔️ Only hospitals that are currently open (영업중) should be included in the search
✔️ Take my location as a road-name address, extract only the 5 nearest medical institutions, and visualize them on a map
✔️ Show a marker for my location and markers for the hospitals, and make the hospital name appear when a hospital marker is clicked
# 1) Import libraries
import folium
import pandas as pd
# 2) Load the file
df = pd.read_csv('/content/drive/MyDrive/data.csv', encoding='EUC-KR')
# Check for NaN values
# df.isna().sum()
# 3) Prepare a function that converts an address into coordinates
from geopy.geocoders import Nominatim
def geocoding(address):
    geolocoder = Nominatim(user_agent='South Korea', timeout=None)
    geo = geolocoder.geocode(address)
    crd = {"lat": float(geo.latitude), "lng": float(geo.longitude)}
    return crd
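One caveat (my own note, not in the original): if Nominatim cannot resolve the address, geocode() returns None and the function above fails on geo.latitude with an AttributeError. A hedged variant that makes the failure explicit could look like this (the function name is mine):
def geocoding_checked(address):
    geolocoder = Nominatim(user_agent='South Korea', timeout=None)
    geo = geolocoder.geocode(address)
    if geo is None:
        # Fail with a readable message instead of an AttributeError
        raise ValueError('Nominatim could not resolve the address: ' + address)
    return {"lat": float(geo.latitude), "lng": float(geo.longitude)}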
# 4) Ask the user for an address
address = input("Enter your address: ")
crd = geocoding(address)
# 5) Store the converted coordinates in a variable as a tuple
from geopy.distance import geodesic
myhome = (crd['lat'], crd['lng'])
# 6) Compute the distance to each hospital and store it in a DataFrame
hpt = pd.DataFrame(columns=['사업장명', '도시명', '의료기관종별명', '상세영업상태명', '위도', '경도', '거리'])
adr_s = address.split(' ')[0]  # first token of the address, e.g. the province/city
df = df.loc[df.지역명.str.contains(adr_s)]
for n in df.index:
    hpt_loc = (df.loc[n, '위도'], df.loc[n, '경도'])  # tuple form
    # Append this hospital to the hpt DataFrame
    hpt.loc[n] = [df.loc[n, '사업장명'],
                  df.loc[n, '도시명'],
                  df.loc[n, '의료기관종별명'],
                  df.loc[n, '상세영업상태명'],
                  df.loc[n, '위도'], df.loc[n, '경도'],
                  geodesic(myhome, hpt_loc).kilometers]
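As an aside (not part of the original write-up), the same table can be built without the explicit index loop; a hedged sketch assuming the column names above:
# Hedged alternative: compute the distance column with apply instead of a loop
hpt = df.copy()
hpt['거리'] = hpt.apply(lambda r: geodesic(myhome, (r['위도'], r['경도'])).kilometers, axis=1)
hpt = hpt[['사업장명', '도시명', '의료기관종별명', '상세영업상태명', '위도', '경도', '거리']]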
# 7) Pick the 5 open (영업중) hospitals closest to my location
my_hpt = hpt.loc[hpt['상세영업상태명'] == '영업중']
my_hpt = my_hpt.sort_values(by=['거리']).head(5)
# 8) Prepare the map
my_map = folium.Map(location=[crd['lat'], crd['lng']], zoom_start=14)
for n in my_hpt.index:
    folium.Marker([my_hpt.loc[n, '위도'], my_hpt.loc[n, '경도']],
                  popup='<pre>' + my_hpt.loc[n, '사업장명'] + '</pre>',
                  icon=folium.Icon(icon='hospital-o', prefix='fa')).add_to(my_map)
folium.Marker([crd['lat'], crd['lng']], icon=folium.Icon(color='red', icon='glyphicon glyphicon-home')).add_to(my_map)
my_map
🔼 Map visualization result
# This part only needs to be run once
!pip install selenium
!apt-get update
!apt install chromium-chromedriver
!cp /usr/lib/chromium-browser/chromedriver /usr/bin
Installation required to use the Selenium library.
# 1) Import libraries
from selenium import webdriver
from bs4 import BeautifulSoup
import re
import time
from pytz import timezone
import datetime
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
# 2) Create the DataFrame
data = pd.DataFrame(columns=['순위', '공감종류', '기사제목', '기사링크', '기사내용', '공감수', '수집일시'])
options = webdriver.ChromeOptions()
options.add_argument('--headless')  # headless -> run the browser without opening a window
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')  # do not use the /dev/shm directory
driver = webdriver.Chrome('chromedriver', options=options)
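One hedged caveat: recent Selenium 4 releases removed the positional driver-path argument, so webdriver.Chrome('chromedriver', options=options) fails there. If that happens, an assumed-equivalent setup (driver path taken from the cp command above) would be:
# Selenium 4 style: pass the driver path through a Service object
from selenium.webdriver.chrome.service import Service
driver = webdriver.Chrome(service=Service('/usr/bin/chromedriver'), options=options)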
url_list = ['https://entertain.naver.com/ranking/sympathy/love',
'https://entertain.naver.com/ranking/sympathy/cheer',
'https://entertain.naver.com/ranking/sympathy/congrats',
'https://entertain.naver.com/ranking/sympathy/expect',
'https://entertain.naver.com/ranking/sympathy/surprise',
'https://entertain.naver.com/ranking/sympathy/sad']
for i in range(len(url_list)):
    driver.get(url_list[i])
    driver.implicitly_wait(3)
    time.sleep(1.5)
    driver.execute_script('window.scrollTo(0,800)')
    time.sleep(3)
    html_source = driver.page_source
    soup = BeautifulSoup(html_source, 'html.parser')
    li = soup.select('li._inc_news_lst3_rank_reply')  # ul.news_lst news_lst3 count_info > li
    # Sympathy type, taken from the URL
    sym = url_list[i].split('.')[2].split('/')[3]
    for index_l in range(0, len(li)):
        try:
            # Rank
            rank = li[index_l].find('em', {'class', 'blind'}).text.replace('\n', '').replace('\t', '').strip()
            # Article title
            title = li[index_l].find('a', {'class', 'tit'}).text.replace('\n', '').replace('\t', '').strip()
            # Article summary
            summary = li[index_l].find('p', {'class', 'summary'}).text.replace('\n', '').replace('\t', '').strip()
            # Article link
            link = li[index_l].find('a').attrs['href']
            # Reaction count, taken from the like-button text
            sym_s = li[index_l].find('a', {'class', 'likeitnews_item_likeit cheer'}).text.replace('\n', '').replace('\t', '').strip().split('요')[1]
            # Save into the DataFrame (append)
            data = data.append({'순위': rank,
                                '공감종류': sym,
                                '기사제목': title,
                                '기사링크': 'http://entertain.naver.com' + link,
                                '기사내용': summary,
                                '공감수': sym_s,
                                '수집일시': datetime.datetime.now(timezone('Asia/Seoul')).strftime('%Y-%m-%d %H:%M:%S')}, ignore_index=True)
        except:
            pass
        print('Completed ' + rank + ' : ' + title)
    print('---------------------------------')
print(data)
I wrote the code like this, but it turns out this approach does not actually work!!!!!
print('Completed ' + rank + ' : ' + title)
This line prints fine, so the problem seems to be somewhere in the data append step..
Only the cheer category ends up stored in the data frame.
For now I have just given up on this one.
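A hedged guess at the cause, based only on the code above: the reaction count is looked up with the hard-coded class 'likeitnews_item_likeit cheer', which presumably only exists on the cheer ranking page; on the other pages find() would return None, the bare except would swallow the resulting AttributeError, and append would never run. If that is indeed the problem, matching the class prefix instead of the full 'cheer' class might fix it, for example:
# Hedged fix sketch: match any likeit button, not just the 'cheer' one
like_btn = li[index_l].find('a', class_=re.compile('likeitnews_item_likeit'))
sym_s = like_btn.text.replace('\n', '').replace('\t', '').strip().split('요')[1]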