
# 【웹 데이터 분석】 시카고 맛집 데이터 수집 및 분석
# 시카고의 50개 맛집 샌드위치 가게에 대한 이름과 메뉴 등의 정보를 모아 정리한다.
import re
import ssl
from urllib.request import urlopen, Request
from urllib.parse import urljoin

import pandas as pd
from fake_useragent import UserAgent
from bs4 import BeautifulSoup
# Build a TLS context that trusts the certificate file stored next to the
# project (path is relative to the working directory).
ca_filepath = "../chicagomag.crt"
ctxt = ssl.create_default_context(cafile=ca_filepath)

# Landing page URL, assembled from the site root and the article path.
url_base = "https://www.chicagomag.com/"
url_sub = "Chicago-Magazine/November-2012/Best-Sandwiches-Chicago/"
url = url_base + url_sub

# Send fake_useragent's fallback string as the user-agent header.
ua = UserAgent()
req = Request(url, headers={"user-agent": ua.fallback})

# Parse inside the `with` block so the HTTP response is closed as soon as
# BeautifulSoup has consumed it.
with urlopen(req, context=ctxt) as html:
    soup = BeautifulSoup(html, "html.parser")

# Collect one entry per "sammy" block on the landing page: rank, menu item,
# cafe name and the URL of the linked detail page.
rank = []
main_menu = []
cafe_name = []
url_add = []

list_soup = soup.find_all("div", "sammy")  # alternative: soup.select(".sammy")
for item in list_soup:
    rank.append(item.find(class_="sammyRank").get_text())

    # Split the listing text once; its first line is stored as the menu name
    # and its second line as the cafe name.
    tmp_string = item.find(class_="sammyListing").get_text()
    listing_lines = re.split("\n|\r\n", tmp_string)
    main_menu.append(listing_lines[0])
    cafe_name.append(listing_lines[1])

    # urljoin resolves relative hrefs against url_base and leaves absolute
    # ones unchanged.
    url_add.append(urljoin(url_base, item.find("a")["href"]))

# Sanity check when run interactively; the author recorded (50, 50, 50, 50).
len(rank), len(main_menu), len(cafe_name), len(url_add)

data = {
    "Rank": rank,
    "Cafe": cafe_name,
    "Menu": main_menu,
    "URL": url_add,
}
df = pd.DataFrame(data)


# Visit every detail page listed in df["URL"] and extract the price and the
# address from the text of its <p class="addy"> element.
price = []
address = []

# Raw string avoids invalid-escape warnings; compiled once outside the loop.
price_pattern = re.compile(r"\$\d+\.(\d+)?")

for page_url in df["URL"]:
    req = Request(page_url, headers={"user-agent": ua.fallback})
    # Close each response as soon as its body has been read.
    with urlopen(req, context=ctxt) as response:
        html = response.read()
    soup_tmp = BeautifulSoup(html, "html.parser")
    gettings = soup_tmp.find("p", "addy").get_text()

    # Keep the text before the first "<any character>," sequence. The
    # unescaped "." also drops the character right before that comma.
    # NOTE(review): the later geocoding/map steps compare the address with
    # "Multiple location", which presumably relies on this truncation, so
    # the pattern is intentionally left unchanged.
    tmp = re.split(".,", gettings)[0]
    price_tmp = price_pattern.search(tmp).group()
    price.append(price_tmp)

    # The address is the remainder after the price plus two more characters.
    # NOTE(review): assumes one character precedes the price and one
    # separator follows it - confirm against the page markup.
    address.append(tmp[len(price_tmp) + 2:])

# Sanity check when run interactively; the author recorded (50, 50).
len(price), len(address)

df["Price"] = price
df["Address"] = address
df = df.loc[:, ["Rank", "Cafe", "Menu", "Price", "Address"]]

# Use the Rank column as the index.
df.set_index("Rank", inplace=True)

import pandas as pd
import numpy as np
import googlemaps
import folium
from tqdm import tqdm
# NOTE(review): "google_api_key" looks like a placeholder - supply a real
# Google Maps API key before running.
gmaps = googlemaps.Client(key="google_api_key")

# Two independent lists. Writing `lat = [], lng = []` on one line is a chained
# unpacking assignment that raises ValueError at runtime.
lat = []
lng = []

# Geocode every address; rows that cannot be geocoded get NaN so that both
# lists stay aligned with the DataFrame rows.
for idx, row in tqdm(df.iterrows(), total=len(df)):
    gmaps_output = None
    if row["Address"] != "Multiple location":
        target_name = row["Address"] + ", " + "Chicago"
        gmaps_output = gmaps.geocode(target_name)

    if gmaps_output:
        # Use the first geocoding result.
        location_output = gmaps_output[0].get("geometry")
        lat.append(location_output["location"]["lat"])
        lng.append(location_output["location"]["lng"])
    else:
        # Either a multi-location entry or the geocoder returned no result.
        lat.append(np.nan)
        lng.append(np.nan)

df["lat"] = lat
df["lng"] = lng

# Base map; the coordinates are presumably central Chicago.
chicago_map = folium.Map(location=[41.8781136, -87.6297982], zoom_start=11)

# Add one marker per cafe that has a single, geocoded location.
for idx, row in df.iterrows():
    if row["Address"] == "Multiple location":
        continue
    # Skip rows without coordinates; folium rejects NaN locations.
    if pd.isna(row["lat"]) or pd.isna(row["lng"]):
        continue
    folium.Marker(
        location=[row["lat"], row["lng"]],
        popup=row["Cafe"],
        tooltip=row["Menu"],
        icon=folium.Icon(icon="coffee", prefix="fa"),
    ).add_to(chicago_map)

# Bare expression so the map renders when run in a notebook cell.
chicago_map
