Web data analysis 1

J·2024년 8월 23일

Beautiful Soup EDA KOR Web data Analysis pandas python

EDA (Exploratory Data Analysis)

목록 보기

3/17

1.시카고 맛집 데이터 분석 - 개요

Beautiful Soup을 통한 웹 스크래핑

최종목표
총 51개 페이지에서 각 가게의 정보를 가져온다

가게이름
대표메뉴
대표메뉴의 가격
가게주소

자료출처 : https://www.chicagomag.com/Chicago-Magazine/November-2012/Best-Sandwiches-Chicago/

2.시카고 맛집 데이터 분석 - 메인 페이지

from urllib.request import Request, urlopen 
from fake_useragent import UserAgent
from bs4 import BeautifulSoup

# Build the URL of the main ranking page.
url_base = "https://www.chicagomag.com/"
url_sub = "Chicago-Magazine/November-2012/Best-Sandwiches-Chicago/"
url = url_base + url_sub

# The request carries a user-agent string supplied by fake_useragent.
ua = UserAgent()
req = Request(url, headers={"user-agent": ua.ie})

# Read the body inside a context manager so the HTTP response is closed
# right away instead of staying open for the rest of the session.
with urlopen(req) as response:
    html = response.read()

soup = BeautifulSoup(html, "html.parser")
print(soup.prettify())

div의 sammy 클래스가 해당 BLT Old Oak Tap의 내용을 표시한다

div의 sammy 클래스를 읽자

# List every <div> whose class is "sammy".
soup.find_all("div", class_="sammy")

총 50개의 데이터가 들어왔는지 확인

# Count the matched "sammy" <div> elements.
len(soup.find_all("div", attrs={"class": "sammy"}))

원하는 정보인 랭킹, 가게 이름, 메뉴가 있는지 확인

find 명령어 사용 가능 확인 (type = bs4.element.Tag)

# Keep the first "sammy" element as a sample and inspect its type.
tmp_one = soup.find_all("div", class_="sammy")[0]
type(tmp_one)

랭킹 데이터 각기 출력

# The element carrying the "sammyRank" class ...
tmp_one.find(attrs={"class": "sammyRank"})

# ... and only its text content.
tmp_one.find(attrs={"class": "sammyRank"}).text

현 상태는 가게 이름과 메뉴 데이터가 같이 있음

# Text of the "sammyListing" <div> of the sample element.
tmp_one.find("div", class_="sammyListing").text

또 연결되는 홈페이지 주소가 상대경로

# href of the first <a> tag inside the sample element.
tmp_one.a["href"]

방법은 re 모듈의 split으로 쉽게 구분 가능

import re 

# Text of the listing element of the sample.
tmp_string = tmp_one.find(class_="sammyListing").get_text()

# Split on line breaks once and reuse the result.  The pattern treats "\r\n"
# as a single separator; with "\n|\r\n" the "\n" alternative always wins, so
# the "\r\n" branch never matches and a stray "\r" is left on the preceding
# piece.
listing_parts = re.split(r"\r?\n", tmp_string)
listing_parts

print(listing_parts[0])
print(listing_parts[1])

반복문을 통해 50개 전체의 데이터를 가져오자

from urllib.parse import urljoin 

# Base used to resolve the links found in the listing.
url_base = "http://www.chicagomag.com"

# One list per output column; all four are filled in lockstep below.
rank = []
main_menu = []
cafe_name = []
url_add = []

list_soup = soup.find_all("div", "sammy")  # same elements as soup.select(".sammy")

for item in list_soup:
    rank.append(item.find(class_="sammyRank").get_text())

    # Split the listing text on line breaks once.  "\r?\n" consumes "\r\n" as
    # one separator; the former "\n|\r\n" could never reach its "\r\n" branch
    # and left a trailing "\r" on the preceding piece.
    tmp_string = item.find(class_="sammyListing").get_text()
    listing_parts = re.split(r"\r?\n", tmp_string)
    main_menu.append(listing_parts[0])
    cafe_name.append(listing_parts[1])

    # urljoin resolves a relative href against url_base and returns an
    # already-absolute href unchanged.
    url_add.append(urljoin(url_base, item.find("a")["href"]))

확인

# Lengths of the four scraped lists, in order: rank, menu, cafe, URL.
tuple(len(column) for column in (rank, main_menu, cafe_name, url_add))

# First five entries of each list.
rank[0:5]

main_menu[0:5]

cafe_name[0:5]

url_add[0:5]

Dataframe으로 합치기

import pandas as pd 

# Column name -> scraped list; `data` is reused when the columns are reordered.
data = dict(Rank=rank, Menu=main_menu, Cafe=cafe_name, URL=url_add)

df = pd.DataFrame(data)
# Show the last two rows.
df.iloc[-2:]

columns 순서 변경

# Rebuild the frame with the columns in the desired order.
column_order = ["Rank", "Cafe", "Menu", "URL"]
df = pd.DataFrame(data, columns=column_order)
df.tail(5)

데이터 저장

# Write the main-page table as a comma-separated UTF-8 file.
df.to_csv(
    "../data/03. best_sandwiches_list_chicago.csv",
    sep=",",
    encoding="utf-8",
)

3.시카고 맛집 데이터 분석 - 하위페이지 작업

URL정보를 따라 각각의 가격과 주소 추출 (1개 케이스 먼저 확인)

	df["URL"][0]

원하는 정보가 있는 부분의 태그 확인 (p tag "addy" class)

# Fetch the detail page of the first row.
req = Request(df["URL"][0], headers={"user-agent": ua.ie})

# Read inside a context manager so the HTTP response is closed immediately.
with urlopen(req) as response:
    html = response.read()

soup_tmp = BeautifulSoup(html, "html.parser")
# The <p class="addy"> element is the one parsed for price and address below.
soup_tmp.find("p", "addy")

print(soup_tmp.find("p", "addy"))

Regular Expression으로 가격과 주소 분리 (먼저 텍스트 변환 후 진행)

# Plain text of the <p class="addy"> element.
price_tmp = soup_tmp.find("p", class_="addy").get_text()
price_tmp

정규식 `.,`(임의의 한 글자 뒤에 쉼표)를 기준으로 분리

import re 
# The "." in the pattern is unescaped, so it matches any single character:
# the split happens at each comma and also consumes the character before it.
split_result = re.split(".,", price_tmp)
split_result

# Keep only the part before the first such separator.
price_tmp = split_result[0]
price_tmp

# Raw string: "\$", "\d" and "\." are invalid escapes in a normal string
# literal and trigger a SyntaxWarning on recent Python versions.
# Pattern: "$", one or more digits, a mandatory ".", then optional digits.
tmp = re.search(r"\$\d+\.(\d+)?", price_tmp).group()

가격이 끝나는 지점 위치부터 그 뒤는 나머지 주소

# Raw string avoids the invalid-escape warning for "\$", "\d" and "\.".
price_match = re.search(r"\$\d+\.(\d+)?", price_tmp)
tmp = price_match.group()

# Everything after the end of the price match is the address.  Using the
# match position instead of len(tmp) + 2 no longer depends on the price
# starting at index 1 and being followed by exactly one separator character.
price_tmp[price_match.end():].lstrip()

반복문을 이용해 나머지 데이터 추출

from tqdm import tqdm 

# Filled in row order so they can be attached to df as columns afterwards.
price = []
address = []

# Compiled once outside the loop.  Raw string: "$", digits, a mandatory ".",
# then optional digits.
price_pattern = re.compile(r"\$\d+\.(\d+)?")

# total= lets tqdm draw a real progress bar; iterrows() has no length.
# The per-row print(idx) was dropped because it breaks the bar's rendering.
for idx, row in tqdm(df.iterrows(), total=len(df)):
    req = Request(row["URL"], headers={"user-agent": "Chrome"})
    # Close each response as soon as its body has been read.
    with urlopen(req) as response:
        html = response.read()

    soup_tmp = BeautifulSoup(html, "html.parser")

    gettings = soup_tmp.find("p", "addy").get_text()

    # The unescaped "." matches any character, so this cuts at the first
    # comma and also drops the character right before it.
    price_tmp = re.split(".,", gettings)[0]
    price_match = price_pattern.search(price_tmp)
    tmp = price_match.group()

    price.append(tmp)
    # The address is whatever follows the price; slicing from the match end
    # replaces the fixed len(tmp) + 2 offset, which assumed the price started
    # at index 1 and was followed by exactly one separator character.
    address.append(price_tmp[price_match.end():].lstrip())

확인

# Lengths of the two scraped lists.
tuple(len(column) for column in (price, address))

# First five entries of each list.
price[0:5]

address[0:5]

URL대신 추출한 가격과 주소 넣기

# Attach the scraped values as new columns.
df["Price"] = price
df["Address"] = address

# Keep the listed columns (URL is left out) and index the table by Rank.
kept_columns = ["Rank", "Cafe", "Menu", "Price", "Address"]
df = df.loc[:, kept_columns].set_index("Rank")
df.head(5)

다시 저장

# Write the table with price and address as a comma-separated file.
df.to_csv(
    "../data/03. best_sandwiches_list_chicago2.csv",
    sep=",",
    encoding="UTF-8",
)

4.시카고 맛집 데이터 분석 - 지도 시각화 (Folium)

필요 모듈 임포트 및 데이터 불러오기

import folium
import pandas as pd 
import numpy as np 
import googlemaps
from tqdm import tqdm 

# Load the saved table; the first CSV column becomes the index.
df = pd.read_csv(
    "../data/03. best_sandwiches_list_chicago2.csv",
    index_col=0,
)
# Show the last ten rows.
df.iloc[-10:]

Google maps키 등록

import os

# Take the API key from the GOOGLE_MAPS_API_KEY environment variable so the
# secret does not have to be pasted into the source.  When the variable is
# unset, the original placeholder is used unchanged and must be replaced
# with a personal key.
gmaps_key = os.environ.get("GOOGLE_MAPS_API_KEY", "개인마다 다름")
gmaps = googlemaps.Client(key=gmaps_key)

각 가게의 좌표 담기

# One entry per row of df, in row order, so both can be attached as columns.
lat = []
lng = []

# total= lets tqdm draw a real progress bar; iterrows() has no length.
for idx, row in tqdm(df.iterrows(), total=len(df)):
    if row["Address"] == "Multiple location":
        # These rows are not geocoded; store NaN for both coordinates.
        lat.append(np.nan)
        lng.append(np.nan)
        continue

    target_name = row["Address"] + ", " + "Chicago"
    gmaps_output = gmaps.geocode(target_name)

    if not gmaps_output:
        # An empty result would raise IndexError on [0]; record NaN instead
        # so the lists stay aligned with df.
        lat.append(np.nan)
        lng.append(np.nan)
        continue

    location_output = gmaps_output[0].get("geometry")
    lat.append(location_output["location"]["lat"])
    lng.append(location_output["location"]["lng"])

확인

# Lengths of the two coordinate lists.
tuple(len(column) for column in (lat, lng))

기존 데이터에 위도와 경도 추가

# Append the coordinate lists as two new columns.
df = df.assign(lat=lat, lng=lng)
df.tail(5)

지도 위에 표시

# Base map centred on the given latitude/longitude.
mapping = folium.Map(location=[41.8781136, -87.6297982], zoom_start=11)

for idx, row in df.iterrows():
    # Rows flagged "Multiple location" get no marker.
    if row["Address"] == "Multiple location":
        continue
    # A marker cannot be placed at NaN coordinates; skip such rows instead
    # of letting folium raise.
    if pd.isna(row["lat"]) or pd.isna(row["lng"]):
        continue

    folium.Marker(
        location=[row["lat"], row["lng"]],
        popup=row["Cafe"],
        tooltip=row["Menu"],
        icon=folium.Icon(
            icon="coffee",
            prefix="fa",
        ),
    ).add_to(mapping)

mapping

Full of adventure

이전 포스트

Analysis Seoul Crime

다음 포스트