[Week 6] EDA

xktm-woonge · June 13, 2023

BeautifulSoup

from bs4 import BeautifulSoup

page = open('zerobase.html', 'r').read()  # read the local HTML file
soup = BeautifulSoup(page, 'html.parser')
soup.prettify()  # re-indented view of the parsed document

# inspect the head
soup.head

# inspect the body
soup.body

# inspect p tags
# returns only the first p tag found
# find()
soup.p
soup.find('p')

# narrowing by attributes
soup.find('p', class_='inner-text first-item')

soup.find('p', {'class': 'inner-text first-item', 'id': 'first'})

# return every matching tag
# returned as a list
# find_all()
soup.find_all('p')

soup.find_all(class_='outer-text')
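Since zerobase.html itself isn't shown, here is a self-contained sketch with a stand-in HTML string; the markup is hypothetical, reconstructed from the class/id names used above:

from bs4 import BeautifulSoup

# hypothetical stand-in for zerobase.html, built from the class/id names above
html = '''
<html><body>
  <p class="inner-text first-item" id="first">first inner paragraph</p>
  <p class="inner-text second-item">second inner paragraph</p>
  <p class="outer-text">outer paragraph</p>
</body></html>
'''
soup = BeautifulSoup(html, 'html.parser')
soup.find('p')                                  # first <p> only
soup.find('p', class_='inner-text first-item')  # exact class-string match
soup.find_all(class_='outer-text')              # list of every match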

BeautifulSoup Example 1-1: Naver Finance

from urllib.request import urlopen
from bs4 import BeautifulSoup

url = 'https://finance.naver.com/marketindex/'
page = urlopen(url)
soup = BeautifulSoup(page, 'html.parser')
soup

# equivalent form 1
soup.find_all('span', 'value'), len(soup.find_all('span', 'value'))

# equivalent form 2
soup.find_all('span', class_='value'), len(soup.find_all('span', class_='value'))

# equivalent form 3
soup.find_all('span', {'class': 'value'}), len(soup.find_all('span', {'class': 'value'}))

# three ways to read a tag's text
first_value = soup.find_all('span', {'class': 'value'})[0]
first_value.text, first_value.string, first_value.get_text()
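The three accessors are not fully interchangeable: .text and .get_text() concatenate all descendant strings, while .string is None whenever a tag has more than one child. A self-contained check (sketch):

from bs4 import BeautifulSoup

demo = BeautifulSoup('<p>won<b>!</b></p>', 'html.parser').p
demo.get_text(), demo.string  # ('won!', None): .string is None on multi-child tags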

from urllib.request import urlopen
from bs4 import BeautifulSoup

url = 'https://finance.naver.com/marketindex/'
response = urlopen(url)
# soup = BeautifulSoup(page, 'html.parser')
# soup
response.status  # HTTP status code
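Note that urlopen raises an exception rather than returning an error status when a request fails; a minimal sketch of handling both cases:

from urllib.request import urlopen
from urllib.error import HTTPError, URLError

try:
    response = urlopen('https://finance.naver.com/marketindex/')
    print(response.status)           # 200 on success
except HTTPError as e:
    print('HTTP error:', e.code)     # server answered with 4xx/5xx
except URLError as e:
    print('connection failed:', e.reason)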

BeautifulSoup Example 1-2: Naver Finance

  • requests
  • find / find_all have CSS-selector counterparts: select_one / select (an equivalence sketch follows the selection code below)
  • find, select_one: single selection
  • find_all, select: multiple selection
import requests
# from urllib.request.Request
from bs4 import BeautifulSoup
import pandas as pd

url = 'https://finance.naver.com/marketindex/'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
soup

# soup.find_all('li', 'on')
# CSS selector syntax: id => #, class => .
exchangeList = soup.select('#exchangeList > li')
len(exchangeList), exchangeList
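As noted in the list above, the same selection can be written without CSS selectors. A minimal sketch of the find/find_all equivalent, assuming #exchangeList is a single container element whose direct children are the li items:

# equivalent to soup.select('#exchangeList > li')
container = soup.find(id='exchangeList')
exchangeList_alt = container.find_all('li', recursive=False)  # recursive=False mimics the '>' child combinator
len(exchangeList_alt) == len(exchangeList)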

title = exchangeList[0].select_one('.h_lst').text
exchange = exchangeList[0].select_one('.value').text
change = exchangeList[0].select_one('.change').text
# caveat: 'point_dn' matches only when the rate moved down; upward items carry 'point_up' instead
updown = exchangeList[0].select_one('div.head_info.point_dn > .blind').text
title, exchange, change, updown

exchange_datas = []
baseUrl = 'https://finance.naver.com'
for item in exchangeList:
    data = {
        'title': item.select_one('.h_lst').text,
        'exchange': item.select_one('.value').text,
        'change': item.select_one('.change').text,
        'updown': item.select_one('div.head_info.point_dn > .blind').text,
        'link': baseUrl + item.select_one('a').get('href')
    }
    exchange_datas.append(data)

exchange_datas
df = pd.DataFrame(exchange_datas)
df.to_excel('naverfinance.xlsx')  # note: recent pandas versions removed to_excel's encoding argument
df

Fetching Wikipedia Article Content

from urllib.request import urlopen, Request
from bs4 import BeautifulSoup
import urllib.parse

html = 'https://ko.wikipedia.org/wiki/{search_words}'

req = Request(html.format(search_words=urllib.parse.quote('여명의_눈동자')))

response = urlopen(req)
soup = BeautifulSoup(response, 'html.parser')
print(soup.prettify())

n = 0

for each in soup.find_all('ul'):
    print('=>' + str(n) + '===================')
    print(each.get_text())
    
    n += 1

soup.find_all('ul')[32].text.strip().replace('\xa0', '').replace('\n', '')
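Index 32 is position-dependent and breaks if the page layout changes; a sketch of a content-based lookup instead (the anchor string is an assumption about what the target list contains):

# hypothetical anchor text: any string known to appear in the target <ul>
for each in soup.find_all('ul'):
    if '채시라' in each.get_text():
        print(each.get_text().strip().replace('\xa0', '').replace('\n', ' '))
        break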

Chicago Restaurants

from urllib.request import Request, urlopen
from bs4 import BeautifulSoup


url_base = 'https://www.chicagomag.com/'
url_sub = 'chicago-magazine/november-2012/best-sandwiches-chicago/'

url = url_base + url_sub

req = Request(url, headers={'User-Agent': 'Chrome'})
html = urlopen(req)

soup = BeautifulSoup(html, 'html.parser')
print(soup.prettify())

soup.find_all('div', 'sammy')

temp_one = soup.find_all('div', 'sammy')[0]
temp_one

temp_one.find('div', {'class' : 'sammyListing'}).get_text()

import re

tmp_string = temp_one.find(class_='sammyListing').get_text()
re.split(r'\n|\r\n', tmp_string)

from urllib.parse import urljoin

url_base = 'https://www.chicagomag.com/'

rank = []
main_menu = []
cafe_name = []
url_add = []

list_soup = soup.find_all('div', 'sammy')

for item in list_soup:
    rank.append(item.find(class_='sammyRank').get_text())
    tmp_string = item.find(class_='sammyListing').get_text()
    main_menu.append(re.split(r'\n|\r\n', tmp_string)[0])
    cafe_name.append(re.split(r'\n|\r\n', tmp_string)[1])
    url_add.append(urljoin(url_base, item.find('a')['href']))

import pandas as pd

data = {
    'Rank' : rank,
    'Menu' : main_menu,
    'Cafe' : cafe_name,
    'URL' : url_add
}
df = pd.DataFrame(data, columns=['Rank', 'Cafe', 'Menu', 'URL'])
df
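The next section reads this table back from chicago.csv, so the frame presumably gets saved in between; a minimal sketch of that step:

# save for the subpage crawl in the next section (filename taken from the read_csv call below)
df.to_csv('chicago.csv', encoding='utf-8')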

Chicago Restaurant Data: Subpages

import pandas as pd
from urllib.request import urlopen, Request
from bs4 import BeautifulSoup

df = pd.read_csv('chicago.csv', index_col=0)
df

req = Request(df['URL'][0], headers={'User-Agent': 'Chrome'})
html = urlopen(req).read()
soup_tmp = BeautifulSoup(html, 'html.parser')
soup_tmp.find('p', 'addy')

price_tmp = soup_tmp.find('p', 'addy').text
price_tmp

import re

# '.' is the regex wildcard, so '.,' splits at any character followed by a comma
# (consuming that character; see the sketch below)
re.split('.,', price_tmp)

price_tmp = re.split('.,', price_tmp)[0]
price_tmp
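On a string in the page's apparent format, the split behaves as follows; the sample, including its leading newline, is an assumption reconstructed from the offsets used below:

sample = '\n$10. 2109 W. Chicago Ave., 773-772-0406, theoldoaktap.com'  # hypothetical addy text
re.split('.,', sample)
# -> ['\n$10. 2109 W. Chicago Ave', ' 773-772-040', ' theoldoaktap.com']
# each match consumes the character before the comma; the first piece keeps
# exactly the price plus the street address, which the loop below slices apart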

from tqdm import tqdm

price = []
address = []

for idx, row in tqdm(df.iterrows()):
    req = Request(row['URL'], headers={'User-Agent': 'Chrome'})
    html = urlopen(req).read()
    soup_tmp = BeautifulSoup(html, 'html.parser')
    gettings = soup_tmp.find('p', 'addy').get_text()
    price_tmp = re.split('.,', gettings)[0]
    tmp = re.search(r'\$\d+\.(\d+)?', price_tmp).group()  # e.g. '$10.'
    price.append(tmp)
    address.append(price_tmp[len(tmp) + 2:])  # skip the leading newline and the space after the price

df['Price'] = price
df['Address'] = address

df = df.loc[:, ["Rank", "Cafe", "Menu", "Price", "Address"]]

df.set_index('Rank', inplace=True)
df.head()

Chicago Restaurant Data: Map Visualization

import folium
import pandas as pd
import numpy as np
import googlemaps
from tqdm import tqdm

# a Google Maps client is required for the geocoding below; the key here is a placeholder
gmaps = googlemaps.Client(key='YOUR_API_KEY')

lat = []
lng = []

for idx, row in tqdm(df.iterrows()):
    if not row["Address"] == "Multiple location":
        target_name = row["Address"] + ", " + "Chicago"
        gmaps_output = gmaps.geocode(target_name)
        location_output = gmaps_output[0].get("geometry")
        lat.append(location_output["location"]["lat"])
        lng.append(location_output["location"]["lng"])
    else:
        lat.append(np.nan)
        lng.append(np.nan)

df["lat"] = lat
df["lng"] = lng
df.tail()

mapping = folium.Map(location=[41.8781136, -87.6297982], zoom_start=11)

for idx, row in df.iterrows():
    if not row["Address"] == "Multiple location":
        folium.Marker(
            location=[row["lat"], row["lng"]],
            popup=row["Cafe"],
            tooltip=row["Menu"],
            icon=folium.Icon(
                icon="coffee",
                prefix="fa"
            )
        ).add_to(mapping)

mapping
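In a notebook, mapping renders inline; outside one, the map can be written to an HTML file instead (sketch):

mapping.save('chicago_sandwiches.html')  # hypothetical filename; open the file in a browser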
