from bs4 import BeautifulSoup
import pandas as pd
from urllib.request import urlopen
url = 'https://www.imdb.com/chart/moviemeter/?ref_=nv_mv_mpm'
page = urlopen(url)
soup = BeautifulSoup(page, 'html.parser')
soup
⇊
Output exceeds the size limit. Open the full output data in a text editor
<!DOCTYPE html>
<html xmlns:fb="http://www.facebook.com/2008/fbml" xmlns:og="http://ogp.me/ns#">
<head>
<meta charset="utf-8"/>
<style>
body#styleguide-v2 {
background: no-repeat fixed center top #000;
}
</style>
<script type="text/javascript">var IMDbTimer={starttime: new Date().getTime(),pt:'java'};</script>
<script>
if (typeof uet == 'function') {
uet("bb", "LoadTitle", {wb: 1});
}
</script>
<script>(function(t){ (t.events = t.events || {})["csm_head_pre_title"] = new Date().getTime(); })(IMDbTimer);</script>
<title>Most Popular Movies - IMDb</title>
<script>(function(t){ (t.events = t.events || {})["csm_head_post_title"] = new Date().getTime(); })(IMDbTimer);</script>
<script>
if (typeof uet == 'function') {
uet("be", "LoadTitle", {wb: 1});
}
</script>
...
uet("be");
}
</script>
</body>
</html>
soup.find_all('td', 'titleColumn')
⇊
Output exceeds the size limit. Open the full output data in a text editor
[<td class="titleColumn">
<a href="/title/tt10366206/" title="Chad Stahelski (dir.), Keanu Reeves, Laurence Fishburne">John Wick: Chapter 4</a>
<span class="secondaryInfo">(2023)</span>
<div class="velocity">1
(no change)
</div>
</td>,
<td class="titleColumn">
<a href="/title/tt2906216/" title="John Francis Daley (dir.), Chris Pine, Michelle Rodriguez">Dungeons & Dragons: Honor Among Thieves</a>
<span class="secondaryInfo">(2023)</span>
<div class="velocity">2
<span class="secondaryInfo">(
<span class="global-sprite titlemeter up"></span>
6)</span>
</div>
</td>,
<td class="titleColumn">
<a href="/title/tt14230388/" title="Wes Anderson (dir.), Jason Schwartzman, Scarlett Johansson">Asteroid City</a>
<span class="secondaryInfo">(2023)</span>
<div class="velocity">3
<span class="secondaryInfo">(
<span class="global-sprite titlemeter up"></span>
506)</span>
</div>
</td>,
...
<span class="secondaryInfo">(
<span class="global-sprite titlemeter up"></span>
183)</span>
</div>
</td>]
영화 제목 찾기 (td 태그 안 titleColumn)
soup.find_all('td', 'titleColumn')[0].a
⇊
<a href="/title/tt10366206/" title="Chad Stahelski (dir.), Keanu Reeves, Laurence Fishburne">John Wick: Chapter 4</a>
soup.find_all('td', 'titleColumn')[0].a.string
⇊
'John Wick: Chapter 4'
a 태그 string으로 영화 제목만 가져오기
soup.find_all('td', 'ratingColumn imdbRating')
⇊
Output exceeds the size limit. Open the full output data in a text editor
[<td class="ratingColumn imdbRating">
<strong title="8.3 based on 87,940 user ratings">8.3</strong>
</td>,
<td class="ratingColumn imdbRating">
<strong title="7.6 based on 22,737 user ratings">7.6</strong>
</td>,
<td class="ratingColumn imdbRating">
</td>,
<td class="ratingColumn imdbRating">
<strong title="7.7 based on 338,971 user ratings">7.7</strong>
</td>,
<td class="ratingColumn imdbRating">
<strong title="7.9 based on 415,270 user ratings">7.9</strong>
</td>,
<td class="ratingColumn imdbRating">
<strong title="7.4 based on 668,852 user ratings">7.4</strong>
</td>,
<td class="ratingColumn imdbRating">
<strong title="6.8 based on 55,169 user ratings">6.8</strong>
</td>,
<td class="ratingColumn imdbRating">
<strong title="6.5 based on 27,759 user ratings">6.5</strong>
</td>,
<td class="ratingColumn imdbRating">
<strong title="5.7 based on 18,331 user ratings">5.7</strong>
...
<td class="ratingColumn imdbRating">
<strong title="9.0 based on 2,695,932 user ratings">9.0</strong>
</td>,
<td class="ratingColumn imdbRating">
</td>]
soup.find_all('td','ratingColumn imdbRating')[0].strong
⇊
<strong title="8.3 based on 87,940 user ratings">8.3</strong>
soup.find_all('td','ratingColumn imdbRating')[0].strong.string
⇊
'8.3'
end = len(soup.find_all('td', 'titleColumn'))
movie_name = [soup.find_all('td', 'titleColumn')[n].a.string for n in range(0, end)]
movie_name
⇊
['John Wick: Chapter 4',
'Dungeons & Dragons: Honor Among Thieves',
'Asteroid City',
'Avatar: The Way of Water',
'에브리씽 에브리웨어 올 앳 원스',
'존 윅',
'I See You',
'Shazam! Fury of the Gods',
'Murder Mystery 2',
'Scream VI',
'더 웨일',
'John Wick: Chapter 3 - Parabellum',
'Cocaine Bear',
'The Super Mario Bros. Movie',
'Air',
'Tetris',
'Creed III',
'John Wick: Chapter 2',
'Boston Strangler',
'Luther: The Fallen Sun',
'트라이앵글 오브 새드니스',
'탑건: 매버릭',
'Chor Nikal Ke Bhaga',
'Evil Dead Rise',
'Champions',
...
'해리포터와 마법사의 돌',
'더 블랙 폰',
'Pathaan',
'다크 나이트',
'John Wick: Chapter 5']
end변수에 한 페이지에 나오는 영화 제목갯수 넣기
0부터 그 갯수-1 만큼 for문 실행
end = len(soup.find_all('td', 'ratingColumn imdbRating'))
movie_rate = [rating.strong.string if rating.strong else 'No Rated' for rating in soup.find_all('td', 'ratingColumn imdbRating')]
movie_rate
⇊
['8.3',
'7.6',
'No Rated',
'7.7',
'7.9',
'7.4',
'6.8',
'6.5',
'5.7',
'7.1',
'7.7',
'7.4',
'6.0',
'7.6',
'7.8',
'7.4',
'7.1',
'7.4',
'6.5',
'6.4',
'7.4',
'8.3',
'7.8',
'8.1',
'6.8',
...
'7.6',
'6.9',
'6.1',
'9.0',
'No Rated']
date = pd.date_range('2023.03.01', periods=30, freq='D')
date
⇊
DatetimeIndex(['2023-03-01', '2023-03-02', '2023-03-03', '2023-03-04',
'2023-03-05', '2023-03-06', '2023-03-07', '2023-03-08',
'2023-03-09', '2023-03-10', '2023-03-11', '2023-03-12',
'2023-03-13', '2023-03-14', '2023-03-15', '2023-03-16',
'2023-03-17', '2023-03-18', '2023-03-19', '2023-03-20',
'2023-03-21', '2023-03-22', '2023-03-23', '2023-03-24',
'2023-03-25', '2023-03-26', '2023-03-27', '2023-03-28',
'2023-03-29', '2023-03-30'],
dtype='datetime64[ns]', freq='D')
from tqdm import tqdm_notebook
#from tqdm.notebook import tqdm
import time
movie_date = []
movie_name = []
for today in tqdm_notebook(date):
html = 'https://www.boxofficemojo.com/date/{date}/?ref_=bo_da_nav'
response = urlopen(html.format(date=today.strftime('%Y-%m-%d')))
soup = BeautifulSoup(response, 'html.parser')
end = len(soup.find_all('td', 'a-text-left mojo-field-type-release_studios'))
movie_date.extend([today for n in range(0, end)])
movie_name.extend([soup.find_all('td', 'a-text-left mojo-field-type-release mojo-cell-wide')[n].a.string for n in range(0 ,end)])
time.sleep(0.5)
⇊
100%
30/30 [01:54<00:00, 3.68s/it]
movie = pd.DataFrame({'date': movie_date, 'name': movie_name)
movie.head()
⇊
데이터 프레임 제작
.astype() : 괄호안 숫자 형태로 변환
`***
+사족
네이버 영화 평점 서비스가 2023년 3월31일부로 종료하여
IMDB 과 Box Offic mojo를 참고