Beautiful Soup은 HTML과 XML 파일에서 데이터를 추출하기 위한 파이썬 라이브러리입니다. 이 라이브러리는 파싱 작업을 간단하게 만들어 주며, 웹 스크레이핑과 데이터 추출에 매우 유용합니다. 주로 사용되는 기능으로는 파싱, 문서 탐색, 데이터 추출이 있습니다.
from bs4 import BeautifulSoup
import pandas as pd
# Read the sample HTML file and parse it. The context manager closes the
# file handle as soon as the contents have been read.
# NOTE(review): no explicit encoding is passed, so the platform default is
# used, exactly as before -- confirm the file's encoding before pinning one.
with open('data/03. test_first.html', 'r') as html_file:
    page = html_file.read()
soup = BeautifulSoup(page, 'html.parser')
print(soup.prettify())  # pretty-print the parsed HTML with indentation
<!DOCTYPE html> <html> <head> <title> Very Simple HTML Code by PinkWink </title> </head> <body> <div> <p class="inner-text first-item" id="first"> Happy PinkWink. <a href="http://www.pinkwink.kr" id="pw-link"> PinkWink </a> </p> <p class="inner-text second-item"> Happy Data Science. <a href="https://www.python.org" id="py-link"> Python </a> </p> </div> <p class="outer-text first-item" id="second"> <b> Data Science is funny. ... </p> </body> </html>
soup.find('body')  # show only the <body> element
<body> <div> <p class="inner-text first-item" id="first"> Happy PinkWink. <a href="http://www.pinkwink.kr" id="pw-link">PinkWink</a> </p> <p class="inner-text second-item"> Happy Data Science. <a href="https://www.python.org" id="py-link">Python</a> </p> </div> <p class="outer-text first-item" id="second"> <b> Data Science is funny. </b> </p> <p class="outer-text"> <b> All I need is Love. </b> </p> </body>
soup.find(name='p')  # return the first <p> tag in the document
<p class="inner-text first-item" id="first"> Happy PinkWink. <a href="http://www.pinkwink.kr" id="pw-link">PinkWink</a> </p>
soup.find_all(name='p')  # return every <p> tag in the document
[<p class="inner-text first-item" id="first"> Happy PinkWink. <a href="http://www.pinkwink.kr" id="pw-link">PinkWink</a> </p>, <p class="inner-text second-item"> Happy Data Science. <a href="https://www.python.org" id="py-link">Python</a> </p>, <p class="outer-text first-item" id="second"> <b> Data Science is funny. </b> </p>, <p class="outer-text"> <b> All I need is Love. </b> </p>]
soup.find_all(attrs={'class': 'outer-text'})  # only tags carrying this CSS class
[<p class="outer-text first-item" id="second"> <b> Data Science is funny. </b> </p>, <p class="outer-text"> <b> All I need is Love. </b> </p>]
# Print the text content of every <p> tag, each preceded by a separator line.
for each_tag in soup.find_all('p'):
    print('-------------')
    print(each_tag.get_text())
-------------
Happy PinkWink.
PinkWink
-------------
Happy Data Science.
Python
-------------
Data Science is funny.
-------------
All I need is Love.
# Collect every anchor tag in the document and display the result.
links = soup.find_all(name='a')
links
[<a href="http://www.pinkwink.kr" id="pw-link">PinkWink</a>, <a href="https://www.python.org" id="py-link">Python</a>]
# Print "<link text> -> <href>" for every anchor collected in `links`.
for each in links:
    href = each['href']
    # get_text() always returns a str; `.string` is None when the anchor has
    # more than one child node, which made the old concatenation raise TypeError.
    text = each.get_text()
    print(f'{text} -> {href}')
PinkWink -> http://www.pinkwink.kr
Python -> https://www.python.org
# Fetch the Naver Finance market-index (exchange rate) page and parse it.
from urllib.request import urlopen
url = 'https://finance.naver.com/marketindex/'
# BeautifulSoup reads the whole response while constructing the tree, so the
# connection can be closed as soon as the constructor returns.
with urlopen(url) as page:
    soup = BeautifulSoup(page, 'html.parser')
print(soup.prettify())
<script language="javascript" src="/template/head_js.naver?referer=info.finance.naver.com&menu=marketindex&submenu=market"> </script> <script src="https://ssl.pstatic.net/imgstock/static.pc/20231218165353/js/info/jindo.min.ns.1.5.3.euckr.js" type="text/javascript"> </script> <script src="https://ssl.pstatic.net/imgstock/static.pc/20231218165353/js/jindo.1.5.3.element-text-patch.js" type="text/javascript"> </script> <div id="container" style="padding-bottom:0px;"> <div class="market_include"> <div class="market_data"> <div class="market1"> <div class="title"> <h2 class="h_market1"> <span> 환전 고시 환율 </span> </h2> </div> <!-- data --> <div class="data"> <ul class="data_lst" id="exchangeList"> <li class="on"> <a class="head usd" href="/marketindex/exchangeDetail.naver?marketindexCd=FX_USDKRW" onclick="clickcr(this, 'fr1.usdt', '', '', event);"> <h3 class="h_lst"> <span class="blind"> 미국 USD ... }); }).attach(document, "domready"); </script>
soup.find_all('span', class_='value')  # every <span class="value"> on the page
[<span class="value">1,312.80</span>, <span class="value">909.11</span>, <span class="value">1,437.78</span>, <span class="value">183.45</span>, <span class="value">144.6400</span>, <span class="value">1.0939</span>, <span class="value">1.2718</span>, <span class="value">102.1200</span>, <span class="value">73.81</span>, <span class="value">1572.04</span>, <span class="value">2049.8</span>, <span class="value">86322.9</span>]
import requests
# from urllib.request.Request
from bs4 import BeautifulSoup
# Fetch the same page with requests instead of urllib.
# A timeout keeps the call from blocking forever on a stalled connection.
response = requests.get(url, timeout=10)
# Fail loudly on 4xx/5xx instead of silently parsing an error page.
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')
print(soup.prettify())
<script language="javascript" src="/template/head_js.naver?referer=info.finance.naver.com&menu=marketindex&submenu=market"> </script> <script src="https://ssl.pstatic.net/imgstock/static.pc/20231218165353/js/info/jindo.min.ns.1.5.3.euckr.js" type="text/javascript"> </script> <script src="https://ssl.pstatic.net/imgstock/static.pc/20231218165353/js/jindo.1.5.3.element-text-patch.js" type="text/javascript"> </script> <div id="container" style="padding-bottom:0px;"> <div class="market_include"> <div class="market_data"> <div class="market1"> <div class="title"> <h2 class="h_market1"> <span> 환전 고시 환율 </span> </h2> </div> <!-- data --> <div class="data"> <ul class="data_lst" id="exchangeList"> <li class="on"> <a class="head usd" href="/marketindex/exchangeDetail.naver?marketindexCd=FX_USDKRW" onclick="clickcr(this, 'fr1.usdt', '', '', event);"> <h3 class="h_lst"> <span class="blind"> 미국 USD ... }); }).attach(document, "domready"); </script>
# Look up <li class="on"> entries (the result is not stored).
soup.find_all('li', class_='on')
# Direct <li> children of the element with id="exchangeList".
exchangeList = soup.select("#exchangeList > li")
len(exchangeList), exchangeList
(4, [<li class="on"> <a class="head usd" href="/marketindex/exchangeDetail.naver?marketindexCd=FX_USDKRW" onclick="clickcr(this, 'fr1.usdt', '', '', event);"> <h3 class="h_lst"><span class="blind">미국 USD</span></h3> <div class="head_info point_up"> <span class="value">1,316.70</span> <span class="txt_krw"><span class="blind">원</span></span> <span class="change">0.70</span> <span class="blind">상승</span> </div> </a> <a class="graph_img" href="/marketindex/exchangeDetail.naver?marketindexCd=FX_USDKRW" onclick="clickcr(this, 'fr1.usdc', '', '', event);"> <img alt="" height="153" src="https://ssl.pstatic.net/imgfinance/chart/marketindex/FX_USDKRW.png" width="295"/> </a> <div class="graph_info"> <span class="time">2024.01.08 15:01</span> <span class="source">하나은행 기준</span> <span class="count">고시회차<span class="num">308</span>회</span> </div> </li>, <li class=""> <a class="head jpy" href="/marketindex/exchangeDetail.naver?marketindexCd=FX_JPYKRW" onclick="clickcr(this, 'fr1.jpyt', '', '', event);"> <h3 class="h_lst"><span class="blind">일본 JPY(100엔)</span></h3> <div class="head_info point_up"> <span class="value">912.16</span> ... <span class="time">2024.01.08 15:01</span> <span class="source">하나은행 기준</span> <span class="count">고시회차<span class="num">308</span>회</span> </div> </li>])
# Pull the headline fields out of the first exchange-rate entry.
first_item = exchangeList[0]
title = first_item.select_one('.h_lst').text
exchange = first_item.select_one('.value').text
change = first_item.select_one('.change').text
# NOTE: unlike the fields above, updown keeps the matched Tag itself (no
# .text); select_one returns None when the entry is not marked point_up.
updown = first_item.select_one('div.head_info.point_up > .blind')
title, exchange, change, updown
('미국 USD', '1,316.70', '0.70', <span class="blind">상승</span>)
# Build one record per exchange-rate entry and load them into a DataFrame.
exchange_datas = []
baseUrl = 'https://finance.naver.com'
for item in exchangeList:
    # The direction label is the .blind span that is a direct child of the
    # .head_info block. Matching on .head_info alone covers both point_up and
    # point_dn, and avoids calling .text on None when neither class is set.
    direction_tag = item.select_one('.head_info > .blind')
    updo = direction_tag.text if direction_tag is not None else None
    data = {
        'title': item.select_one('.h_lst').text,
        'exchange': item.select_one('.value').text,
        'change': item.select_one('.change').text,
        'updown': updo,
        # href values on this page are site-relative, so prefix the host.
        'link': baseUrl + item.select_one('a').get('href'),
    }
    exchange_datas.append(data)
df = pd.DataFrame(exchange_datas)
df
# Load a Korean Wikipedia article by title.
# https://ko.wikipedia.org/wiki/%EC%97%AC%EB%AA%85%EC%9D%98_%EB%88%88%EB%8F%99%EC%9E%90
import urllib
import urllib.parse  # imported explicitly; urllib.parse.quote is used below
from urllib.request import Request
html = 'https://ko.wikipedia.org/wiki/{search_words}'
# The title contains non-ASCII characters, so percent-encode it for the URL.
req = Request(html.format(search_words=urllib.parse.quote('여명의_눈동자')))
# BeautifulSoup consumes the response while building the tree, so the
# connection can be released as soon as the constructor returns.
with urlopen(req) as response:
    soup = BeautifulSoup(response, 'html.parser')
soup
<!DOCTYPE html> <html class="client-nojs vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-sticky-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-content-enabled vector-feature-zebra-design-enabled vector-feature-custom-font-size-clientpref-0 vector-feature-client-preferences-disabled vector-feature-client-prefs-pinned-disabled vector-toc-available" dir="ltr" lang="ko"> <head> <meta charset="utf-8"/> <title>여명의 눈동자 - 위키백과, 우리 모두의 백과사전</title> <script>(function(){var className="client-js vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-sticky-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-content-enabled vector-feature-zebra-design-enabled vector-feature-custom-font-size-clientpref-0 vector-feature-client-preferences-disabled vector-feature-client-prefs-pinned-disabled vector-toc-available";var cookie=document.cookie.match(/(?:^|; )kowikimwclientpreferences=([^;]+)/);if(cookie){cookie[1].split('%2C').forEach(function(pref){className=className.replace(new RegExp('(^| )'+pref.replace(/-clientpref-\w+$|[^\w-]+/g,'')+'-clientpref-\\w+( |$)'),'$1'+pref+'$2');});}document.documentElement.className=className;}());RLCONF={"wgBreakFrames":false,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""], "wgDefaultDateFormat":"ko","wgMonthNames":["","1월","2월","3월","4월","5월","6월","7월","8월","9월","10월","11월","12월"],"wgRequestId":"bc04cd37-7c34-4378-8ab8-fae9895b2311","wgCanonicalNamespace":"","wgCanonicalSpecialPageName":false,"wgNamespaceNumber":0,"wgPageName":"여명의_눈동자","wgTitle":"여명의 
눈동자","wgCurRevisionId":35524239,"wgRevisionId":35524239,"wgArticleId":51472,"wgIsArticle":true,"wgIsRedirect":false,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["깨진 링크를 가지고 있는 문서","인용 오류 - 지원되지 않는 변수 무시됨","인용 오류 - URL 없이 확인날짜를 사용함","1991년 드라마","문화방송 수목 미니시리즈","문화방송의 역사 드라마","일제강점기 역사 드라마","한국 현대사 드라마","대한민국의 소설을 바탕으로 한 텔레비전 드라마","송지나 시나리오 작품","1991년에 시작한 대한민국 TV 프로그램", "1992년에 종료한 대한민국 TV 프로그램","한국의 반일 감정","1990년대 대한민국의 텔레비전 프로그램","백상예술대상 TV부문 작품상","백상예술대상 TV부문 대상 수상자(작)"],"wgPageViewLanguage":"ko","wgPageContentLanguage":"ko","wgPageContentModel":"wikitext","wgRelevantPageName":"여명의_눈동자","wgRelevantArticleId":51472,"wgIsProbablyEditable":true,"wgRelevantPageIsProbablyEditable":true,"wgRestrictionEdit":[],"wgRestrictionMove":[],"wgNoticeProject":"wikipedia","wgMediaViewerOnClick":true,"wgMediaViewerEnabledByDefault":true,"wgPopupsFlags":6,"wgVisualEditor":{"pageLanguageCode":"ko","pageLanguageDir":"ltr","pageVariantFallbacks":"ko"},"wgMFDisplayWikibaseDescriptions":{"search":true,"watchlist":true,"tagline":true,"nearby":true},"wgWMESchemaEditAttemptStepOversample":false,"wgWMEPageLength":30000,"wgULSCurrentAutonym":"한국어","wgCentralAuthMobileDomain":false,"wgEditSubmitButtonLabelPublish":true,"wgULSPosition": 
"interlanguage","wgULSisCompactLinksEnabled":true,"wgULSisLanguageSelectorEmpty":false,"wgWikibaseItemId":"Q624988","wgCheckUserClientHintsHeadersJsApi":["architecture","bitness","brands","fullVersionList","mobile","model","platform","platformVersion"],"GEHomepageSuggestedEditsEnableTopics":true,"wgGETopicsMatchModeEnabled":false,"wgGEStructuredTaskRejectionReasonTextInputEnabled":false,"wgGELevelingUpEnabledForUser":false};RLSTATE={"skins.vector.user.styles":"ready","ext.gadget.SectionFont":"ready","ext.globalCssJs.user.styles":"ready","site.styles":"ready","user.styles":"ready","skins.vector.user":"ready","ext.globalCssJs.user":"ready","user":"ready","user.options":"loading","ext.cite.styles":"ready","codex-search-styles":"ready","skins.vector.styles":"ready","skins.vector.icons":"ready","skins.vector.zebra.styles":"ready","jquery.makeCollapsible.styles":"ready","ext.visualEditor.desktopArticleTarget.noscript":"ready","ext.uls.interlanguage":"ready","wikibase.client.init":"ready", "ext.wikimediaBadges":"ready"};RLPAGEMODULES=["ext.cite.ux-enhancements","mediawiki.page.media","mediawiki.toggleAllCollapsibles","site","mediawiki.page.ready","jquery.makeCollapsible","mediawiki.toc","skins.vector.js","ext.centralNotice.geoIP","ext.centralNotice.startUp","ext.gadget.directcommons","ext.gadget.ReferenceTooltips","ext.gadget.edittools","ext.gadget.refToolbar","ext.gadget.siteNotice","ext.gadget.scrollUpButton","ext.gadget.strikethroughTOC","ext.gadget.switcher","ext.urlShortener.toolbar","ext.centralauth.centralautologin","mmv.head","mmv.bootstrap.autostart","ext.popups","ext.visualEditor.desktopArticleTarget.init","ext.visualEditor.targetLoader","ext.echo.centralauth","ext.eventLogging","ext.wikimediaEvents","ext.navigationTiming","ext.uls.compactlinks","ext.uls.interface","ext.cx.eventlogging.campaigns","ext.cx.uls.quick.actions","wikibase.client.vector-2022","ext.checkUser.clientHints","ext.growthExperiments.SuggestedEditSession"];</script> 
<script>(RLQ=window.RLQ||[]).push(function(){mw.loader.impl(function(){return["user.options@12s5i",function($,jQuery,require,module){mw.user.tokens.set({"patrolToken":"+\\","watchToken":"+\\","csrfToken":"+\\"}); }];});});</script> <link href="/w/load.php?lang=ko&modules=codex-search-styles%7Cext.cite.styles%7Cext.uls.interlanguage%7Cext.visualEditor.desktopArticleTarget.noscript%7Cext.wikimediaBadges%7Cjquery.makeCollapsible.styles%7Cskins.vector.icons%2Cstyles%7Cskins.vector.zebra.styles%7Cwikibase.client.init&only=styles&skin=vector-2022" rel="stylesheet"/> <script async="" src="/w/load.php?lang=ko&modules=startup&only=scripts&raw=1&skin=vector-2022"></script> <meta content="" name="ResourceLoaderDynamicStyles"/> <link href="/w/load.php?lang=ko&modules=ext.gadget.SectionFont&only=styles&skin=vector-2022" rel="stylesheet"/> <link href="/w/load.php?lang=ko&modules=site.styles&only=styles&skin=vector-2022" rel="stylesheet"/> <meta content="MediaWiki 1.42.0-wmf.12" name="generator"/> <meta content="origin" name="referrer"/> <meta content="origin-when-cross-origin" name="referrer"/> <meta content="max-image-preview:standard" name="robots"/> <meta content="telephone=no" name="format-detection"/> <meta content="width=1000" name="viewport"/> <meta content="여명의 눈동자 - 위키백과, 우리 모두의 백과사전" property="og:title"/> ... 
</div> <script>(RLQ=window.RLQ||[]).push(function(){mw.config.set({"wgHostname":"mw2369","wgBackendResponseTime":140,"wgPageParseReport":{"limitreport":{"cputime":"0.539","walltime":"0.716","ppvisitednodes":{"value":7727,"limit":1000000},"postexpandincludesize":{"value":261160,"limit":2097152},"templateargumentsize":{"value":7697,"limit":2097152},"expansiondepth":{"value":15,"limit":100},"expensivefunctioncount":{"value":0,"limit":500},"unstrip-depth":{"value":0,"limit":20},"unstrip-size":{"value":21749,"limit":5000000},"entityaccesscount":{"value":1,"limit":400},"timingprofile":["100.00% 454.683 1 -total"," 28.43% 129.265 1 틀:각주"," 25.72% 116.945 1 틀:위키데이터_속성_추적"," 20.26% 92.118 15 틀:둘러보기_상자"," 20.03% 91.060 1 틀:텔레비전_방송_프로그램_정보"," 19.95% 90.697 11 틀:뉴스_인용"," 11.59% 52.699 349 틀:정보상자_칸"," 9.01% 40.959 20 틀:텔레비전_프로그램_정보/시즌"," 5.58% 25.350 1 틀:백상예술대상_TV부문_작품상"," 4.82% 21.908 1 틀:원작"]},"scribunto":{"limitreport-timeusage":{"value":"0.121","limit":"10.000"},"limitreport-memusage":{"value":5348283,"limit":52428800}},"cachereport":{"origin":"mw-web.codfw.main-6c599b869c-lzqj2","timestamp":"20231227054948","ttl":2592000,"transientcontent":false}}});});</script> <script type="application/ld+json">{"@context":"https:\/\/schema.org","@type":"Article","name":"\uc5ec\uba85\uc758 \ub208\ub3d9\uc790","url":"https:\/\/ko.wikipedia.org\/wiki\/%EC%97%AC%EB%AA%85%EC%9D%98_%EB%88%88%EB%8F%99%EC%9E%90","sameAs":"http:\/\/www.wikidata.org\/entity\/Q624988","mainEntity":"http:\/\/www.wikidata.org\/entity\/Q624988","author":{"@type":"Organization","name":"\uc704\ud0a4\ubbf8\ub514\uc5b4 \ud504\ub85c\uc81d\ud2b8 \uae30\uc5ec\uc790"},"publisher":{"@type":"Organization","name":"Wikimedia Foundation, Inc.","logo":{"@type":"ImageObject","url":"https:\/\/www.wikimedia.org\/static\/images\/wmf-hor-googpub.png"}},"datePublished":"2006-02-04T13:29:19Z","dateModified":"2023-09-03T22:36:07Z","headline":"1991\ub144\uc791 \ubb38\ud654\ubc29\uc1a1\uc758 \ub4dc\ub77c\ub9c8"}</script> </body> </html>
# Print the text of every <ul> together with its index, so the list of
# interest can be located by position.
for n, each in enumerate(soup.find_all('ul')):
    print(f'=>{n}===============================')
    print(each.get_text())
=>0=============================== 대문최근 바뀜요즘 화제임의의 문서로기부 =>1=============================== 사랑방사용자 모임관리 요청 =>2=============================== 도움말정책과 지침질문방 =>3=============================== =>4=============================== =>5=============================== =>6=============================== 계정 만들기 ... 내용 폭 제한 전환
# Take the <ul> at index 35 and flatten its text onto one line by dropping
# non-breaking spaces and newlines.
# NOTE(review): index 35 is positional and depends on the page layout at
# fetch time -- confirm it still points at the intended list.
cast_list = soup.find_all('ul')[35]
cast_text = cast_list.text.strip()
cast_text.replace('\xa0', "").replace('\n', "")
'채시라: 윤여옥 역 (아역: 김민정)박상원: 장하림(하리모토 나츠오) 역 (아역: 김태진)최재성: 최대치(사카이) 역 (아역: 장덕수)'