Ch4 웹데이터 분석 14-26 (웹데이터4-6)

김민지·2023년 4월 1일

Part 04. EDA/웹크롤링/파이썬프로그래밍

목록 보기

7/12

시카고 맛집 메인페이지 분석

# !pip install fake-useragent
from urllib.request import Request, urlopen
from fake_useragent import UserAgent
from bs4 import BeautifulSoup

url_base = "https://www.chicagomag.com/"
url_sub = "chicago-magazine/november-2012/best-sandwiches-chicago/"
url = url_base + url_sub

# Alternative: let fake_useragent generate a random browser string
#   ua = UserAgent()
#   req = Request(url, headers={"user-agent": ua.ie})
# A fixed User-Agent header is sent in case a header-less request causes problems.
req = Request(url, headers={"User-Agent": "Chrome"})

# Download the page exactly once; the same response serves the status check and the parser,
# and the context manager closes the connection afterwards.
with urlopen(req) as response:
    html = response.read()
response.status

soup = BeautifulSoup(html, "html.parser")
print(soup.prettify())

시카고 맛집 [0]번째 데이터부터 가져와보기

# All elements carrying the "sammy" class, and how many there are
# (find_all("div", "sammy") is the tag-based way to write the same lookup)
sammy_nodes = soup.select(".sammy")
sammy_nodes, len(sammy_nodes)

# Keep the first entry around for step-by-step inspection
sammy_divs = soup.find_all("div", "sammy")
tmp_one = sammy_divs[0]
type(tmp_one)

-> bs4.element.Tag : find함수 사용가능

# Rank of the first entry (same as tmp_one.find(class_="sammyRank").get_text())
rank_tag = tmp_one.select_one(".sammyRank")
rank_tag.text

-> '1'

# Menu / cafe text of the first entry
# (same as tmp_one.find("div", {"class": "sammyListing"}).text)
listing_tag = tmp_one.select_one(".sammyListing")
listing_tag.get_text()

-> 'BLT\nOld Oak Tap\nRead more '

# Link of the first entry (tmp_one.find("a")["href"] is the subscript form)
first_link = tmp_one.select_one("a")
first_link.get("href")

-> '/Chicago-Magazine/November-2012/Best-Sandwiches-in-Chicago-Old-Oak-Tap-BLT/'

import re

# The listing text holds the menu, the cafe name and a "Read more" label on separate lines
listing_tag = tmp_one.find(class_="sammyListing")
tmp_string = listing_tag.get_text()
re.split("\n|\r\n", tmp_string)

-> ['BLT', 'Old Oak Tap', 'Read more ']

# First piece is the menu, second piece is the cafe name
listing_parts = re.split("\n|\r\n", tmp_string)
print(listing_parts[0])
print(listing_parts[1])

-> BLT
Old Oak Tap

시카고 맛집 50개 데이터 반복문으로 가져오기

from urllib.parse import urljoin

url_base = "https://www.chicagomag.com"

# One list per future DataFrame column; they are combined into a DataFrame later
rank = []
main_menu = []
cafe_name = []
url_add = []

list_soup = soup.find_all("div", "sammy")

for item in list_soup:
    rank_text = item.find(class_="sammyRank").get_text()
    tmp_string = item.find(class_="sammyListing").get_text()
    listing_parts = re.split("\n|\r\n", tmp_string)
    # urljoin leaves an absolute link untouched and prefixes a relative one with url_base
    full_url = urljoin(url_base, item.find("a")["href"])

    rank.append(rank_text)
    main_menu.append(listing_parts[0])
    cafe_name.append(listing_parts[1])
    url_add.append(full_url)

len(rank), len(main_menu), len(cafe_name), len(url_add)

-> (50, 50, 50, 50) : 다 가져왔는지 확인해보기

# Peek at the first five values of every collected list
rank[0:5]

main_menu[0:5]

cafe_name[0:5]

url_add[0:5]

-> 상위 5개 데이터씩 제대로 가져왔나 확인해보기

데이터프레임 만들기

import pandas as pd

# Column name -> collected values
data = dict(Rank=rank, Menu=main_menu, Cafe=cafe_name, URL=url_add)

df = pd.DataFrame(data)
df.tail(2)

# Rebuild the frame with the columns in the desired order
column_order = ["Rank", "Cafe", "Menu", "URL"]
df = pd.DataFrame(data, columns=column_order)
df.tail()

# Save the data
csv_path = "../data/03. best_sandwiches_list_chicago.csv"
df.to_csv(csv_path, sep=",", encoding="utf-8")

시카고 맛집 데이터 하위 페이지 분석

URL 정보를 따라 50개 페이지 각각의 가격, 주소 가져오기

# requirements
import pandas as pd
from urllib.request import urlopen, Request
from fake_useragent import UserAgent
from bs4 import BeautifulSoup

df = pd.read_csv("../data/03. best_sandwiches_list_chicago.csv", index_col=0)
df.tail()

df["URL"][0]

# `ua` is never created in this post (its assignment only appears commented out),
# so the fixed "Chrome" User-Agent used by the other requests is sent instead.
req = Request(df["URL"][0], headers={"user-agent": "Chrome"})
with urlopen(req) as response:
    html = response.read()
soup_tmp = BeautifulSoup(html, "html.parser")
soup_tmp.find("p", "addy")  # CSS-selector form: soup_tmp.select_one(".addy")

$10. 2109 W. Chicago Ave., 773-772-0406, theoldoaktap.com

# Raw text of the <p class="addy"> element; the regular expressions below take it apart
addy_tag = soup_tmp.find("p", "addy")
price_tmp = addy_tag.get_text()
price_tmp

-> '\n$10. 2109 W. Chicago Ave., 773-772-0406, theoldoaktap.com'

import re
# NOTE(review): the dot is not escaped, so ".," matches ANY character followed by a comma
# (this is why the phone number in the output below loses its last digit).
# Later cells compare addresses against "Multiple location", which presumably relies on
# this behavior - confirm before changing the pattern to r"\.,".
re.split(".,", price_tmp)

-> ['\n$10. 2109 W. Chicago Ave', ' 773-772-040', ' theoldoaktap.com']

# Keep only the leading "price + street address" piece
price_tmp, *_ = re.split(".,", price_tmp)
price_tmp

-> '\n$10. 2109 W. Chicago Ave'

Regular Expression
- . : 임의의 한 문자를 의미함 (예: .x 는 임의의 한 문자 뒤에 x가 오는 패턴)
- x+ : x가 1번 이상 반복함
- x? : x가 존재하거나 존재하지 않음
- x* : x가 0번 이상 반복함
- x|y : x 또는 y를 찾음 (or연산자)

가격, 주소 가져오기

# Raw string: "\$" and "\d" are invalid escapes in a normal string literal and trigger warnings
tmp = re.search(r"\$\d+\.(\d+)?", price_tmp).group()  # price, e.g. "$10."
# Address: skip the leading newline, the price itself and the space that follows it
price_tmp[len(tmp) + 2:]

-> '2109 W. Chicago Ave'

반복문으로 50개 데이터(가격,주소) 가져오기

from tqdm import tqdm

price = []
address = []

# Compiled once, outside the loop; raw string avoids invalid-escape warnings
price_pattern = re.compile(r"\$\d+\.(\d+)?")

# tqdm shows progress over the rows (it replaces printing every index)
for idx, row in tqdm(df.iterrows(), total=len(df)):
    req = Request(row["URL"], headers={"user-agent": "Chrome"})
    with urlopen(req) as response:  # close each connection once the page is read
        html = response.read()

    soup_tmp = BeautifulSoup(html, "lxml")

    gettings = soup_tmp.find("p", "addy").get_text()
    # NOTE(review): the unescaped dot also swallows the character before the first comma;
    # the later comparison with "Multiple location" presumably depends on that - confirm
    # before tightening the pattern.
    price_tmp = re.split(".,", gettings)[0]

    price_match = price_pattern.search(price_tmp)
    if price_match is None:
        # Fail with the offending page instead of an opaque AttributeError on None
        raise ValueError(f"no price found at {row['URL']}: {gettings!r}")

    tmp = price_match.group()
    price.append(tmp)
    # Skip the leading newline, the price and the following space to reach the address
    address.append(price_tmp[len(tmp) + 2:])

# Attach the scraped columns
df["Price"] = price
df["Address"] = address

# Keep the columns of interest (URL is dropped) and index the table by rank
kept_columns = ["Rank", "Cafe", "Menu", "Price", "Address"]
df = df.loc[:, kept_columns].set_index("Rank")
df.head()

# Save, then read the file back to check the result
csv_path = "../data/03. best_sandwiches_list_chicago2.csv"
df.to_csv(csv_path, sep=",", encoding="utf-8")

pd.read_csv(csv_path, index_col=0)

시카고 맛집 데이터 지도 시각화

# requirements

import folium
import pandas as pd
import numpy as np
import googlemaps
from tqdm import tqdm  # 실행했을 때 진행률을 볼 수 있음

import os

df = pd.read_csv("../data/03. best_sandwiches_list_chicago2.csv", index_col=0)
df.tail(10)

# The API key is a secret and must not be published with the code:
# it is read from the GOOGLE_MAPS_API_KEY environment variable instead
# (a KeyError here means the variable is not set).
gmaps_key = os.environ["GOOGLE_MAPS_API_KEY"]
gmaps = googlemaps.Client(key=gmaps_key)

lat = []
lng = []

# total= lets tqdm draw a real progress bar (iterrows() has no length of its own)
for idx, row in tqdm(df.iterrows(), total=len(df)):
    location = None
    if row["Address"] != "Multiple location":
        target_name = row["Address"] + ", " + "Chicago"
        gmaps_output = gmaps.geocode(target_name)
        # An address that cannot be geocoded yields an empty result list;
        # indexing [0] on it would raise IndexError.
        if gmaps_output:
            location_output = gmaps_output[0].get("geometry")
            location = location_output["location"]

    if location is None:
        # No single location available: store NaN so the lists stay aligned with df
        lat.append(np.nan)
        lng.append(np.nan)
    else:
        lat.append(location["lat"])
        lng.append(location["lng"])

df["lat"] = lat
df["lng"] = lng
df.tail()

# Map centered on the coordinates used for Chicago
mapping = folium.Map(location=[41.8781136, -87.6297982], zoom_start=11)

for idx, row in df.iterrows():
    if row["Address"] == "Multiple location":
        continue
    # Rows without coordinates (NaN) cannot be placed on the map
    if pd.isna(row["lat"]) or pd.isna(row["lng"]):
        continue

    folium.Marker(
        location=[row["lat"], row["lng"]],
        popup=row["Cafe"],
        tooltip=row["Menu"],
        icon=folium.Icon(
            icon="coffee",
            prefix="fa"
        )
    ).add_to(mapping)

mapping

네이버 영화 평점 사이트 분석

영화랭킹 탭 이동
영화랭킹에서 평점순(현재상영영화) 선택
웹페이지 주소에는 많은 정보가 담겨있음
원하는 정보를 얻기 위해서 변화시켜줘야 하는 주소의 규칙을 찾을 수 있음
여기에서는 날짜 정보를 변경해주면 해당 페이지에 접근이 가능함

# requirements
import pandas as pd
from urllib.request import urlopen
from bs4 import BeautifulSoup

url = "https://movie.naver.com/movie/sdb/rank/rmovie.naver?sel=cur&date=20210914"
# The context manager closes the HTTP response once the page has been parsed
with urlopen(url) as response:
    # response.status can be inspected here if the request needs checking
    soup = BeautifulSoup(response, "html.parser")
print(soup.prettify())

# Tags that hold the movie titles (CSS-selector form: soup.select("div.tit5"))
title_divs = soup.find_all("div", "tit5")
title_divs

# Three ways to read the first movie title
# 1
title_divs[0].a.string

# 2
first_title = soup.select(".tit5")[0]
first_title.find("a").text

# 3
first_title.select_one("a").get_text()

# Tags that hold the movie ratings (CSS-selector form: soup.select(".point"))
point_cells = soup.find_all("td", "point")
point_cells

# Check that both lookups found the same number of tags
len(point_cells), len(title_divs)

# Look at a single rating (selector form: soup.select("td.point")[0].string)
soup.find_all("td", class_="point")[0].text

# Movie title list
# The tags are looked up once; re-running find_all for every index re-scans the whole page each time.
title_tags = soup.find_all("div", "tit5")
end = len(title_tags)

movie_name = [tag.a.string for tag in title_tags]

movie_name

# Movie rating list
point_tags = soup.find_all("td", "point")
end = len(point_tags)

movie_point = [tag.string for tag in point_tags]
movie_point

# Check the total number of entries
len(movie_name), len(movie_point)

네이버 영화 평점 데이터 확보

여러 날짜의 데이터 얻기
자동화를 위한 코드 만들기
-> 날짜만 변경하면 우리가 원하는 데이터를 얻을 수 있음

# 100 consecutive calendar days starting on 2021-01-01
date = pd.date_range(start="2021.01.01", periods=100, freq="D")
date

# Format the first day with dashes
first_day = date[0]
first_day.strftime("%Y-%m-%d")

-> '2021-01-01'

# Format the first day with dots
first_day = date[0]
first_day.strftime("%Y.%m.%d")

-> '2021.01.01'

# String formatting: the {name} placeholder is filled in when the template is formatted

test_string = "Hi, I'm {name}"
test_string.format_map({"name": "Zerobase"})
test_string.format_map({"name": "Pinkwink"})

import time
from tqdm import tqdm

movie_date = []
movie_name = []
movie_point = []

for today in tqdm(date):
    # Only the date query parameter changes from page to page
    url = "https://movie.naver.com/movie/sdb/rank/rmovie.naver?sel=cur&date={date}"
    with urlopen(url.format(date=today.strftime("%Y%m%d"))) as response:
        soup = BeautifulSoup(response, "html.parser")

    # Query the page once per tag type instead of once per index
    point_tags = soup.find_all("td", "point")
    title_tags = soup.select("div.tit5")
    end = len(point_tags)

    movie_date.extend([today] * end)
    movie_name.extend([title_tags[n].find("a").text for n in range(end)])
    movie_point.extend([tag.string for tag in point_tags])

    time.sleep(0.5)  # pause between requests so the crawl runs slowly

len(movie_date), len(movie_name), len(movie_point)

movie_point[:5]

movie_name[:5]

# Build the DataFrame from the three collected lists
movie = pd.DataFrame(dict(date=movie_date, name=movie_name, point=movie_point))
movie.tail()

movie.info()  # the rating column shows up as object dtype

# Store the ratings as float so they can be used in later calculations
movie = movie.astype({"point": float})

# Save the data, then load it back from the same file
movie_csv = "../data/03. naver_moive_data.csv"

movie.to_csv(movie_csv, sep=",", encoding="utf-8")

df = pd.read_csv(movie_csv, index_col=0)
df

네이버 영화 평점 데이터 정리 및 시각화

import numpy as np
import pandas as pd

# NOTE(review): this is a different file from the "03. naver_moive_data.csv" written earlier
# in the post - confirm which data set is intended.
raw_csv = "../data/04_naver_movie_raw_data.csv"
movie = pd.read_csv(raw_csv, index_col=0)
movie.tail()

영화 이름으로 인덱스를 잡기
점수의 합산 구하기
100일간 네이버 영화 평점 합산 기준으로 베스트&워스트 10 선정

# Pivot table: one row per movie name, values summed over all days.
# The string "sum" is used because passing the NumPy callable np.sum is deprecated in pandas.

movie_unique = pd.pivot_table(data=movie, index="name", aggfunc="sum")
movie_unique

movie_best = movie_unique.sort_values(by="point", ascending=False)  # descending order
movie_best.head()

# Daily rows of a single movie, used for the line chart
tmp = movie.query("name == ['러빙 빈센트']")
tmp

# 시각화

import matplotlib.pyplot as plt
from matplotlib import rc

rc("font", family="Malgun Gothic")
%matplotlib inline
# get_ipython().run_line_magic("matplotlib", "inline")

plt.figure(figsize=(20, 8))
plt.plot(tmp["date"], tmp["point"]) # 선 그래프 x축-날짜, y축-평점 -> 날짜에 따른 평점 변화를 선그래프로 표현
plt.title("날짜별 평점")
plt.xlabel("날짜")
plt.ylabel("평점")
plt.xticks(rotation="vertical")
plt.legend(labels=["평점 추이"], loc="best")
plt.grid=True
plt.show()

# Ten movies with the highest summed rating
movie_best.head(10)

# Ten movies with the lowest summed rating
movie_best.tail(10)

# One row per date and one column per movie, filled with the rating
movie_pivot = movie.pivot_table(index="date", columns="name", values="point")
movie_pivot.head()

# Save as an Excel file
excel_path = "../data/03. movie_pivot_practice.xlsx"
movie_pivot.to_excel(excel_path)

import platform
import seaborn as sns
from matplotlib import font_manager, rc

# Font file used on Windows
path = "C://Windows/Fonts/malgun.ttf"

# Choose the matplotlib font family according to the operating system
system_name = platform.system()
if system_name == "Windows":
    font_name = font_manager.FontProperties(fname=path).get_name()
    rc("font", family=font_name)
elif system_name == "Darwin":
    rc("font", family="Arial Unicode MS")
else:
    print("Unknown system. sorry")

# Movies to compare on a single chart
target_col = ["메이즈 러너", "강철비", "신과함께-죄와 벌", "위대한 쇼맨", "너의 췌장을 먹고 싶어", "러빙 빈센트"]
plt.figure(figsize=(20, 8))
plt.title("날짜별 평점")
plt.xlabel("날짜")
plt.ylabel("평점")
plt.xticks(rotation="vertical")
# NOTE(review): recent Matplotlib versions expect booleans here; the string "off" may not
# hide the bottom ticks/labels - confirm the intent before changing it.
plt.tick_params(bottom="off", labelbottom="off")
plt.plot(movie_pivot[target_col])
plt.legend(target_col, loc="best")
# Turn the grid on through the Axes object. `plt.grid = True` would replace the pyplot
# function with a bool, and this form still works if an earlier cell already did that.
plt.gca().grid(True)
plt.show()

<제로베이스 데이터 취업 스쿨>

김민지

이전 포스트

Ch4 웹데이터 분석 01-13 (웹데이터1-3)

다음 포스트

Ch4 웹데이터 분석 14-26 (웹데이터4-6)

Part 04. EDA/웹크롤링/파이썬프로그래밍

Ch4 웹데이터 분석 01-13 (웹데이터1-3)

Ch5 유가 분석 01-13 (유가분석1-5)

0개의 댓글