2022-09-06

jm·2022년 9월 14일
0

TIL

목록 보기
18/22

📌 전국 의료 기관 데이터 분석


✅ 라이브러리 임포트 및 데이터 준비하기

import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px

df = pd.read_csv('/content/data.csv', encoding='EUC-KR')

# NaN 데이터 ν™•μΈν•˜κΈ°
df.isna().sum()  -> μ†Œμž¬μ§€μ „ν™”, μ˜λ£ŒκΈ°κ΄€μ’…λ³„λͺ…, 의료인수, μž…μ›μ‹€μˆ˜, λ³‘μƒμˆ˜, μ§„λ£Œκ³Όλͺ©λ‚΄μš©λͺ…

# μ»¬λŸΌλ³„ 데이터 확인 -> df.컬럼λͺ….unique()

✅ 데이터 분석하기

# μƒμ„Έμ˜μ—…μƒνƒœλͺ… 별 의료 κΈ°κ΄€ 수
df['μƒμ„Έμ˜μ—…μƒνƒœλͺ…'].value_counts()

gb_df = df.groupby(by=['μƒμ„Έμ˜μ—…μƒνƒœλͺ…']).size().reset_index(name='μ˜λ£ŒκΈ°κ΄€μˆ˜')

🔼 gb_df를 입력해 출력해보면 이렇게 데이터프레임 형식으로 나타난다.


# μ˜μ—…μƒνƒœλ³„ μ˜λ£ŒκΈ°κ΄€μˆ˜ 확인
px.histogram(gb_df, x='μƒμ„Έμ˜μ—…μƒνƒœλͺ…', y='μ˜λ£ŒκΈ°κ΄€μˆ˜')

🔼 결과


# 파이차트
px.pie(gb_df, names='μƒμ„Έμ˜μ—…μƒνƒœλͺ…', values='μ˜λ£ŒκΈ°κ΄€μˆ˜')

🔼 결과


# 지역λͺ… 별 의료 κΈ°κ΄€ 수
df['지역λͺ…'].value_counts()
lgb_df = df.groupby(by=['지역λͺ…', 'λ„μ‹œλͺ…', 'μƒμ„Έμ˜μ—…μƒνƒœλͺ…', 'μ˜λ£ŒκΈ°κ΄€μ’…λ³„λͺ…']).size().reset_index(name='μ˜λ£ŒκΈ°κ΄€μˆ˜')

이 역시 변수명을 입력하여 출력할 시 데이터프레임 형태로 되어있는 것을 확인할 수 있다.


# 지역별 μ˜λ£ŒκΈ°κ΄€μˆ˜ ν˜„ν™©
px.histogram(lgb_df, x='지역λͺ…', y='μ˜λ£ŒκΈ°κ΄€μˆ˜')

🔼 결과


# 지역에 λ”°λ₯Έ μ˜μ—…μƒνƒœλ³„ μ˜λ£ŒκΈ°κ΄€ 수 확인
px.histogram(lgb_df, x='지역λͺ…', y='μ˜λ£ŒκΈ°κ΄€μˆ˜', color='μƒμ„Έμ˜μ—…μƒνƒœλͺ…')

🔼 결과


# 지역에 λ”°λ₯Έ μ‹œκ΅°κ΅¬λ³„ μ˜λ£ŒκΈ°κ΄€ 수 확인
px.histogram(lgb_df, x='지역λͺ…', y='μ˜λ£ŒκΈ°κ΄€μˆ˜', color='λ„μ‹œλͺ…')

🔼 결과


# 지역에 λ”°λ₯Έ μ˜λ£ŒκΈ°κ΄€μ’…λ₯˜λ³„ μ˜λ£ŒκΈ°κ΄€ 수 확인
px.histogram(lgb_df, x='지역λͺ…', y='μ˜λ£ŒκΈ°κ΄€μˆ˜', color='μ˜λ£ŒκΈ°κ΄€μ’…λ³„λͺ…')

🔼 결과


# 지역별 μ˜λ£ŒκΈ°κ΄€μˆ˜ ν™•μΈν•˜λŠ” 파이 차트
px.pie(lgb_df, names='지역λͺ…', values='μ˜λ£ŒκΈ°κ΄€μˆ˜')

🔼 결과


# μ˜λ£ŒκΈ°κ΄€μ’…λ³„ μ˜λ£ŒκΈ°κ΄€μˆ˜ ν˜„ν™©
px.histogram(lgb_df, x='μ˜λ£ŒκΈ°κ΄€μ’…λ³„λͺ…', y='μ˜λ£ŒκΈ°κ΄€μˆ˜')

🔼 결과


# μ˜λ£ŒκΈ°κ΄€μ’…λ³„ μ˜λ£ŒκΈ°κ΄€μˆ˜ ν™•μΈν•˜λŠ” 파이 차트
px.pie(lgb_df, names='μ˜λ£ŒκΈ°κ΄€μ’…λ³„λͺ…', values='μ˜λ£ŒκΈ°κ΄€μˆ˜')

🔼 결과


✅ 워드 클라우드 시각화

from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

# Join every department-description value into one space-separated string
# to feed the word cloud.
# Missing values are dropped first: astype(str) would otherwise turn each NaN
# into the literal token "nan", which would then show up in the word cloud.
# Bracket access is used because the column name is not a valid attribute name.
text = " ".join(df['μ§„λ£Œκ³Όλͺ©λ‚΄μš©λͺ…'].dropna().astype(str))

🔼 워드 클라우드 시각화를 위한 전처리 과정
(진료과목내용명으로 워드 클라우드 시각화)


# Draw the word cloud on a large figure with the axes hidden.
fig, ax = plt.subplots(figsize=(25, 15))

# NOTE(review): `fontpath` is not defined anywhere in the visible code —
# presumably the path to a Korean-capable font file set in another cell; confirm.
wordcloud = WordCloud(
    background_color='black',
    width=1000,
    height=700,
    font_path=fontpath,
).generate(text)

ax.axis('off')
ax.imshow(wordcloud, interpolation='bilinear')
plt.show()

위와 같은 결과가 나온다.


✅ 내 위치와 가장 가까운 의료기관 찾기

✏️ 단, 영업중인 병원만 검색되도록 하기
✏️ 내 위치는 도로명 주소로 입력 받고 가장 가까운 의료기관은 5개만 추출하여 지도 시각화
✏️ 내 위치 마커와 병원 마커를 표시하고 병원 마커를 클릭하면 병원명이 나오도록

βœ”οΈ 라이브러리 μž„ν¬νŠΈ 및 데이터 μ€€λΉ„ν•˜κΈ°

# 1) 라이브러리 μž„ν¬νŠΈ
import folium
import pandas as pd

# 2) 파일 μ—…λ‘œλ“œ
df = pd.read_csv('/content/drive/MyDrive/data.csv', encoding='EUC-KR')

# NaN 데이터 ν™•μΈν•˜κΈ°
# df.isna().sum()

βœ”οΈ λ‚΄ μœ„μΉ˜ μ£Όμ†Œλ₯Ό μž…λ ₯ν•˜κ³  κ°€κΉŒμš΄ 병원 5개만 μΆ”μΆœν•˜μ—¬ μ €μž₯ν•˜κΈ°

# 3) μ£Όμ†Œλ₯Ό μ’Œν‘œλ‘œ λ³€ν™˜ν•  ν•¨μˆ˜ μ€€λΉ„
from geopy.geocoders import Nominatim

def geocoding(address):
  """Convert an address string into latitude/longitude coordinates.

  The address is sent to the Nominatim geocoder and the match it returns
  is converted to plain floats.

  Args:
    address: Address text to look up.

  Returns:
    A dict with float values under the keys "lat" and "lng".

  Raises:
    ValueError: If the geocoder finds no match for the address.
  """
  geolocator = Nominatim(user_agent='South Korea', timeout=None)
  location = geolocator.geocode(address)

  # geocode() returns None when nothing matches; fail with a clear message
  # instead of an AttributeError on `None.latitude`.
  if location is None:
    raise ValueError(f"Could not geocode address: {address!r}")

  return {"lat": float(location.latitude), "lng": float(location.longitude)}

# 4) μ‚¬μš©μžμ—κ²Œ μ£Όμ†Œλ₯Ό μž…λ ₯λ°›κΈ°
address = input("λ‹Ήμ‹ μ˜ μ£Όμ†Œλ₯Ό μž…λ ₯ν•˜μ‹œμ˜€.")
crd = geocoding(address)

# 5) Keep the user's coordinates as a (lat, lng) tuple for distance calculations.
from geopy.distance import geodesic

myhome = (crd['lat'], crd['lng'])

# 6) Narrow the data to the user's region, then compute each hospital's distance.
# The first whitespace-separated token of the address is matched against the
# region-name column. regex=False treats the user's text literally, and
# na=False keeps the mask strictly boolean if a region name is missing.
adr_s = address.split()[0]
df = df.loc[df['지역λͺ…'].str.contains(adr_s, regex=False, na=False)]

# Copy the needed columns in one step (the original row index is preserved)
# instead of growing the frame one row at a time.
hpt = df[['사업μž₯λͺ…', 'λ„μ‹œλͺ…', 'μ˜λ£ŒκΈ°κ΄€μ’…λ³„λͺ…', 'μƒμ„Έμ˜μ—…μƒνƒœλͺ…', 'μœ„λ„', '경도']].copy()
hpt['거리'] = [
    geodesic(myhome, (lat, lng)).kilometers
    for lat, lng in zip(hpt['μœ„λ„'], hpt['경도'])
]

# 7) Keep only rows whose status equals the "open" value and take the 5 nearest.
my_hpt = hpt.loc[hpt['μƒμ„Έμ˜μ—…μƒνƒœλͺ…'] == 'μ˜μ—…μ€‘']
my_hpt = my_hpt.sort_values(by=['거리']).head(5)

βœ”οΈ 지도 μ‹œκ°ν™”

# 8) Build the map centred on the user's location.
my_map = folium.Map(location=[crd['lat'], crd['lng']], zoom_start=14)

# One marker per selected hospital; the popup shows the facility name.
for _, hospital in my_hpt.iterrows():
  folium.Marker([hospital['μœ„λ„'], hospital['경도']],
                popup=f"<pre>{hospital['사업μž₯λͺ…']}</pre>",
                icon=folium.Icon(icon='hospital-o', prefix='fa')).add_to(my_map)

# The user's own marker is added exactly once, outside the loop. Inside the
# loop it was drawn once per hospital and never when no hospital was found.
folium.Marker([crd['lat'], crd['lng']],
              icon=folium.Icon(color='red', icon='glyphicon glyphicon-home')).add_to(my_map)

my_map

🔼 지도 시각화 결과



📌 네이버 공감뉴스 크롤링

# 이 부분은 처음 한 번만 실행하면 됨
!pip install selenium
!apt-get update
!apt install chromium-chromedriver
!cp /usr/lib/chromium-browser/chromedriver /usr/bin

셀레니움 라이브러리 사용을 위한 설치

# 1) 라이브러리 μž„ν¬νŠΈ
from selenium import webdriver
from bs4 import BeautifulSoup

import re
import time
from pytz import timezone
import datetime

import pandas as pd

import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator


# 2) 데이터 ν”„λ ˆμž„ 생성
data = pd.DataFrame(columns=['μˆœμœ„', '곡감쒅λ₯˜', 'κΈ°μ‚¬μ œλͺ©', '기사링크', 'κΈ°μ‚¬λ‚΄μš©', '곡감수', 'μˆ˜μ§‘μΌμž'])


# Chrome flags:
#   --headless               run without opening a browser window
#   --no-sandbox             disable the Chrome sandbox
#   --disable-dev-shm-usage  do not use the /dev/shm directory
options = webdriver.ChromeOptions()
for flag in ('--headless', '--no-sandbox', '--disable-dev-shm-usage'):
  options.add_argument(flag)

# The driver path is no longer passed positionally: Selenium 4.10+ rejects it.
# Older releases already default to the 'chromedriver' executable on PATH,
# so omitting it keeps the same behaviour there.
driver = webdriver.Chrome(options=options)

# One ranking page per reaction type.
url_list = [
    f'https://entertain.naver.com/ranking/sympathy/{reaction}'
    for reaction in ('love', 'cheer', 'congrats', 'expect', 'surprise', 'sad')
]

# https://entertain.naver.com/ranking/sympathy/cheer
# https://entertain.naver.com/ranking/sympathy/congrats
# https://entertain.naver.com/ranking/sympathy/expect
# https://entertain.naver.com/ranking/sympathy/surprise
# https://entertain.naver.com/ranking/sympathy/sad


def _clean_text(node):
  """Return the node's text with newlines and tabs removed and outer whitespace stripped."""
  return node.text.replace('\n', '').replace('\t', '').strip()


# Scrape each reaction-ranking page and collect one row per article.
# Rows are gathered in a list and added to `data` once at the end, because
# DataFrame.append was removed in pandas 2.0.
rows = []
try:
  for url in url_list:
    driver.get(url)
    driver.implicitly_wait(3)
    time.sleep(1.5)

    # Scroll down and wait before reading the page source
    # (presumably so the ranking list has finished rendering — confirm).
    driver.execute_script('window.scrollTo(0,800)')
    time.sleep(3)

    soup = BeautifulSoup(driver.page_source, 'html.parser')
    items = soup.select('li._inc_news_lst3_rank_reply')

    # Reaction type is the last path segment of the URL, e.g. ".../sympathy/love" -> "love".
    sym = url.rsplit('/', 1)[-1]

    for item in items:
      try:
        rank = _clean_text(item.find('em', class_='blind'))        # rank
        title = _clean_text(item.find('a', class_='tit'))          # article title
        summary = _clean_text(item.find('p', class_='summary'))    # article summary
        link = item.find('a').attrs['href']                        # article link

        # Reaction count. The original selector hard-coded the "cheer" class,
        # so every other category failed here and was silently dropped.
        # Assumes the second class equals the reaction type — TODO confirm;
        # otherwise fall back to the class shared by all of them.
        like_link = (item.select_one(f'a.likeitnews_item_likeit.{sym}')
                     or item.select_one('a.likeitnews_item_likeit'))
        sym_s = _clean_text(like_link).split('수')[1]
      except (AttributeError, IndexError, KeyError) as err:
        # A missing element or an unexpected text format: report and skip this item.
        print(f'Skipped one item on {sym}: {err!r}')
        continue

      rows.append({'μˆœμœ„' : rank,
                   '곡감쒅λ₯˜' : sym,
                   'κΈ°μ‚¬μ œλͺ©' : title,
                   '기사링크' : 'http://entertain.naver.com' + link,
                   'κΈ°μ‚¬λ‚΄μš©' : summary,
                   '곡감수' : sym_s,
                   'μˆ˜μ§‘μΌμž' : datetime.datetime.now(timezone('Asia/Seoul')).strftime('%Y-%m-%d %H:%M:%S')})

      # Printed only for items that were actually stored.
      print('Complets of ' + rank + ' : ' + title)
finally:
  # Always release the browser process, even if a page fails to load.
  driver.quit()

if rows:
  new_rows = pd.DataFrame(rows, columns=data.columns)
  data = new_rows if data.empty else pd.concat([data, new_rows], ignore_index=True)

print('---------------------------------')
print(data)

코드는 이렇게 짜봤었는데 사실 이렇게 하면 안된다!!!!!
print('Complets of ' + rank + ' : ' + title)
요 부분은 잘 출력되는데 데이터 append 과정에서 문제가 있는 듯하다..
cheer만 데이터 프레임에 저장됨
글서 일단은 걍 포기하겟습니당👍

0개의 댓글