BeautifulSoup Basics
install
find(): 태그 추출
find_all(): 여러 개의 태그 추출
https://www.chicagomag.com/chicago-magazine/november-2012/best-sandwiches-chicago/
Chicago Magazine: The 50 Best Sandwiches
# !pip install fake-useragent
import re
from urllib.parse import urljoin
from urllib.request import Request, urlopen

import numpy as np
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
url_base = "https://www.chicagomag.com/"
url_sub = "chicago-magazine/november-2012/best-sandwiches-chicago/"

# Download and parse the ranking page so that `soup` exists before it is
# queried below. A browser-like User-Agent header is sent, matching what the
# per-restaurant requests later in this script do.
# NOTE(review): presumably the site rejects urllib's default agent - confirm.
req = Request(urljoin(url_base, url_sub), headers={"User-Agent": UserAgent().chrome})
with urlopen(req) as response:
    soup = BeautifulSoup(response.read(), "html.parser")

# One list per output column; they are combined into a DataFrame afterwards.
rank = []
main_menu = []
cafe_name = []
url_add = []

# Every ranked entry is a <div class="sammy">; soup.select(".sammy") is equivalent.
list_soup = soup.find_all("div", "sammy")

for item in list_soup:
    rank.append(item.find(class_="sammyRank").text)

    # Split the listing text into lines once (either newline style): the
    # first line is stored as the menu name, the second as the cafe name.
    tmp_string = item.find(class_="sammyListing").text
    listing_lines = re.split(r"\r\n|\n", tmp_string)
    main_menu.append(listing_lines[0])
    cafe_name.append(listing_lines[1])

    # urljoin resolves a relative href against url_base and leaves an
    # already-absolute href untouched.
    url_add.append(urljoin(url_base, item.find("a")["href"]))
import pandas as pd

# Map each output column name to the list collected for it.
data = dict(Rank=rank, Menu=main_menu, Cafe=cafe_name, URL=url_add)

# First pass: columns appear in the dict's insertion order.
df = pd.DataFrame(data)
df.tail(2)

# Second pass: rebuild the frame with an explicit column order.
df = pd.DataFrame(data=data, columns=["Rank", "Cafe", "Menu", "URL"])
df.tail()
from tqdm import tqdm

# Both patterns are loop-invariant, so compile them once up front.
# NOTE(review): the "." in ".," is an unescaped wildcard, so the split also
# swallows the single character just before the first comma. The
# "Multiple location" comparison in the geocoding step appears to rely on
# that truncation - confirm before escaping the dot.
head_pattern = re.compile(r".,")
# A dollar amount such as "$10." or "$9.50" (the fractional digits are optional).
price_pattern = re.compile(r"\$\d+\.(\d+)?")

price = []
address = []

# Visit every restaurant page and pull the price and street address out of
# its <p class="addy"> paragraph. total= lets tqdm show real progress, since
# iterrows() is a generator without a length.
for idx, row in tqdm(df.iterrows(), total=len(df)):
    req = Request(row["URL"], headers={"user-agent": "Chrome"})
    with urlopen(req) as response:
        html = response.read()
    soup_tmp = BeautifulSoup(html, "html.parser")

    gettings = soup_tmp.find("p", "addy").get_text()
    # Keep only the part before the first ",": price followed by address.
    price_tmp = head_pattern.split(gettings)[0]

    price_match = price_pattern.search(price_tmp)
    tmp = price_match.group()
    price.append(tmp)

    # The address is whatever follows the price. Slice from the end of the
    # match and strip the ". " separator instead of assuming fixed offsets,
    # so leading whitespace or a decimal price cannot shift the cut.
    address.append(price_tmp[price_match.end():].lstrip(". "))
# Attach the two scraped columns to the existing frame.
df["Price"], df["Address"] = price, address

# Keep only the columns needed from here on (all rows), then use the rank
# as the row index.
final_columns = ["Rank", "Cafe", "Menu", "Price", "Address"]
df = df[final_columns].set_index("Rank")
df.head()
import folium
import googlemaps

# Placeholder value - replace with your own Google Maps API key.
gmaps_key = "개인 API"
gmaps = googlemaps.Client(key=gmaps_key)

lat = []
lng = []

# Geocode every address; rows that cannot be located get NaN coordinates so
# the two lists stay aligned with the DataFrame rows.
for idx, row in tqdm(df.iterrows(), total=len(df)):
    location_output = None

    # Rows marked "Multiple location" have no single address to look up.
    if row["Address"] != "Multiple location":
        target_name = row["Address"] + ", " + "Chicago"
        gmaps_output = gmaps.geocode(target_name)
        # geocode() returns a list of candidates; it may be empty when
        # nothing matches, so only index into it when there is a result.
        if gmaps_output:
            location_output = gmaps_output[0].get("geometry")

    if location_output is None:
        lat.append(np.nan)
        lng.append(np.nan)
    else:
        lat.append(location_output["location"]["lat"])
        lng.append(location_output["location"]["lng"])

df["lat"] = lat
df["lng"] = lng
df.tail()
# Base map; NOTE(review): the coordinates are presumably central Chicago - confirm.
mapping = folium.Map(location=[41.8781136, -87.6297982], zoom_start=11)

# A marker needs real coordinates, so skip the "Multiple location" rows and
# also any row whose lat/lng is missing.
plottable = df[df["Address"] != "Multiple location"].dropna(subset=["lat", "lng"])

for idx, row in plottable.iterrows():
    marker = folium.Marker(
        location=[row["lat"], row["lng"]],
        popup=row["Cafe"],
        tooltip=row["Menu"],
        icon=folium.Icon(icon="coffee", prefix="fa"),
    )
    marker.add_to(mapping)

mapping