# 위키백과 문서 정보 가져오기
import urllib
from urllib.request import urlopen, Request
# Commented-out example of the fully percent-encoded article URL:
# html = "https://ko.wikipedia.org/wiki/%EC%97%AC%EB%AA%85%EC%9D%98_%EB%88%88%EB%8F%99%EC%9E%90"
# That is https://ko.wikipedia.org/wiki/여명의_눈동자 with the Korean title
# percent-encoded as UTF-8 bytes; the text is not corrupted.

# Import the submodule explicitly: a bare `import urllib` does not by itself
# guarantee that `urllib.parse` is loaded.
import urllib.parse

# URL template; {search_words} receives the percent-encoded article title.
html = "https://ko.wikipedia.org/wiki/{search_words}"
# Percent-encode the title so its non-ASCII characters are valid in a URL.
req = Request(html.format(search_words=urllib.parse.quote("여명의_눈동자")))
# NOTE: the name `reponse` (sic) is kept because later cells refer to it.
reponse = urlopen(req)
reponse
>>
<http.client.HTTPResponse at 0x2d353740c40>
--------------------------
# HTTP status code of the response (200 in the recorded run below).
reponse.status
>> 200
--------------------------
# BeautifulSoup is used here but was never imported in this file.
from bs4 import BeautifulSoup

# Parse the HTTP response with the "html.parser" backend, then print the
# document re-indented for reading.
soup = BeautifulSoup(reponse, "html.parser")
print(soup.prettify())
>>
Output exceeds the size limit. Open the full output data in a text editor
<!DOCTYPE html>
<html class="client-nojs vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-language-alert-in-sidebar-enabled vector-feature-sticky-header-disabled vector-feature-page-tools-enabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-enabled vector-feature-main-menu-pinned-disabled vector-feature-limited-width-enabled vector-feature-limited-width-content-enabled" dir="ltr" lang="ko">
<head>
<meta charset="utf-8"/>
<title>
여명의 눈동자 - 위키백과, 우리 모두의 백과사전
</title>
<script>
document.documentElement.className="client-js vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-language-alert-in-sidebar-enabled vector-feature-sticky-header-disabled vector-feature-page-tools-enabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-enabled vector-feature-main-menu-pinned-disabled vector-feature-limited-width-enabled vector-feature-limited-width-content-enabled";(function(){var cookie=document.cookie.match(/(?:^|; )kowikimwclientprefs=([^;]+)/);if(cookie){var featureName=cookie[1];document.documentElement.className=document.documentElement.className.replace(featureName+'-enabled',featureName+'-disabled');}}());RLCONF={"wgBreakFrames":false,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"ko","wgMonthNames":["","1월","2월","3월","4월","5월","6월","7월","8월","9월","10월","11월","12월"],"wgRequestId":"fd19f329-c98c-4223-a939-83b6df3e5c4a",
"wgCSPNonce":false,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":false,"wgNamespaceNumber":0,"wgPageName":"여명의_눈동자","wgTitle":"여명의 눈동자","wgCurRevisionId":34684388,"wgRevisionId":34684388,"wgArticleId":51472,"wgIsArticle":true,"wgIsRedirect":false,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["깨진 링크를 가지고 있는 문서","인용 오류 - 지원되지 않는 변수 무시됨","인용 오류 - URL 없이 확인날짜를 사용함","백상예술대상 TV부문 대상 수상자(작)","백상예술대상 TV부문 작품상","1991년 드라마","문화방송 수목 미니시리즈","문화방송의 역사 드라마","일제강점기 역사 드라마","한국 현대사 드라마","대한민국의 소설을 바탕으로 한 텔레비전 드라마","송지나 시나리오 작품","1991년에 시작한 대한민국 TV 프로그램","1992년에 종료한 대한민국 TV 프로그램","한국의 반일 감정",
"1990년대 대한민국의 텔레비전 프로그램"],"wgPageContentLanguage":"ko","wgPageContentModel":"wikitext","wgRelevantPageName":"여명의_눈동자","wgRelevantArticleId":51472,"wgIsProbablyEditable":true,"wgRelevantPageIsProbablyEditable":true,"wgRestrictionEdit":[],"wgRestrictionMove":[],"wgVisualEditor":{"pageLanguageCode":"ko","pageLanguageDir":"ltr","pageVariantFallbacks":"ko"},"wgMFDisplayWikibaseDescriptions":{"search":true,"watchlist":true,"tagline":true,"nearby":true},"wgWMESchemaEditAttemptStepOversample":false,"wgWMEPageLength":30000,"wgNoticeProject":"wikipedia","wgVector2022PreviewPages":[],"wgMediaViewerOnClick":true,"wgMediaViewerEnabledByDefault":true,"wgPopupsFlags":10,"wgULSCurrentAutonym":"한국어","wgEditSubmitButtonLabelPublish":true,"wgCentralAuthMobileDomain":false,"wgULSPosition":"interlanguage","wgULSisCompactLinksEnabled":true,"wgULSisLanguageSelectorEmpty":false,"wgWikibaseItemId":"Q624988","GEHomepageSuggestedEditsEnableTopics":true,
"wgGETopicsMatchModeEnabled":false,"wgGEStructuredTaskRejectionReasonTextInputEnabled":false,"wgGELevelingUpEnabledForUser":false};RLSTATE={"skins.vector.user.styles":"ready","ext.gadget.SectionFont":"ready","ext.globalCssJs.user.styles":"ready","site.styles":"ready","user.styles":"ready","skins.vector.user":"ready","ext.globalCssJs.user":"ready","user":"ready","user.options":"loading","ext.cite.styles":"ready","mediawiki.ui.button":"ready","skins.vector.styles":"ready","skins.vector.icons":"ready","mediawiki.ui.icon":"ready","jquery.makeCollapsible.styles":"ready","ext.visualEditor.desktopArticleTarget.noscript":"ready","ext.wikimediaBadges":"ready","ext.uls.interlanguage":"ready","wikibase.client.init":"ready"};RLPAGEMODULES=["ext.cite.ux-enhancements","site","mediawiki.page.ready","jquery.makeCollapsible","mediawiki.toc","skins.vector.js","skins.vector.es6","mmv.head","mmv.bootstrap.autostart","ext.visualEditor.desktopArticleTarget.init","ext.visualEditor.targetLoader",
"ext.eventLogging","ext.wikimediaEvents","ext.navigationTiming","ext.cx.eventlogging.campaigns","ext.centralNotice.geoIP","ext.centralNotice.startUp","ext.gadget.directcommons","ext.gadget.ReferenceTooltips","ext.gadget.edittools","ext.gadget.refToolbar","ext.gadget.siteNotice","ext.gadget.scrollUpButton","ext.gadget.strikethroughTOC","ext.gadget.switcher","ext.centralauth.centralautologin","ext.popups","ext.echo.centralauth","ext.uls.compactlinks","ext.uls.interface","ext.cx.uls.quick.actions","wikibase.client.vector-2022","ext.growthExperiments.SuggestedEditSession"];
</script>
<script>
(RLQ=window.RLQ||[]).push(function(){mw.loader.implement("user.options@12s5i",function($,jQuery,require,module){mw.user.tokens.set({"patrolToken":"+\\","watchToken":"+\\","csrfToken":"+\\"});});});
</script>
<link href="/w/load.php?lang=ko&modules=ext.cite.styles%7Cext.uls.interlanguage%7Cext.visualEditor.desktopArticleTarget.noscript%7Cext.wikimediaBadges%7Cjquery.makeCollapsible.styles%7Cmediawiki.ui.button%2Cicon%7Cskins.vector.icons%2Cstyles%7Cwikibase.client.init&only=styles&skin=vector-2022" rel="stylesheet"/>
<script async="" src="/w/load.php?lang=ko&modules=startup&only=scripts&raw=1&skin=vector-2022">
</script>
<meta content="" name="ResourceLoaderDynamicStyles"/>
<link href="/w/load.php?lang=ko&modules=ext.gadget.SectionFont&only=styles&skin=vector-2022" rel="stylesheet"/>
<link href="/w/load.php?lang=ko&modules=site.styles&only=styles&skin=vector-2022" rel="stylesheet"/>
<meta content="MediaWiki 1.41.0-wmf.2" name="generator"/>
<meta content="origin" name="referrer"/>
...
{"@context":"https:\/\/schema.org","@type":"Article","name":"\uc5ec\uba85\uc758 \ub208\ub3d9\uc790","url":"https:\/\/ko.wikipedia.org\/wiki\/%EC%97%AC%EB%AA%85%EC%9D%98_%EB%88%88%EB%8F%99%EC%9E%90","sameAs":"http:\/\/www.wikidata.org\/entity\/Q624988","mainEntity":"http:\/\/www.wikidata.org\/entity\/Q624988","author":{"@type":"Organization","name":"\uc704\ud0a4\ubbf8\ub514\uc5b4 \ud504\ub85c\uc81d\ud2b8 \uae30\uc5ec\uc790"},"publisher":{"@type":"Organization","name":"Wikimedia Foundation, Inc.","logo":{"@type":"ImageObject","url":"https:\/\/www.wikimedia.org\/static\/images\/wmf-hor-googpub.png"}},"datePublished":"2006-02-04T13:29:19Z","dateModified":"2023-04-03T04:13:18Z","headline":"1991\ub144\uc791 \ubb38\ud654\ubc29\uc1a1\uc758 \ub4dc\ub77c\ub9c8"}
</script>
</body>
</html>
--------------------------
# Print the text of every <ul> element together with its index, to find
# which list holds the main characters' names.
for n, each in enumerate(soup.find_all("ul")):
    print(f"=>{n}===========")
    print(each.get_text())
>>
Output exceeds the size limit. Open the full output data in a text editor
=>0===========
대문최근 바뀜요즘 화제임의의 문서로기부
=>1===========
사랑방사용자 모임관리 요청
=>2===========
도움말정책과 지침질문방
=>3===========
계정 만들기로그인
=>4===========
계정 만들기 로그인
=>5===========
기여토론
=>6===========
처음 위치
1개요
...
--------------------------
# Select one <ul> by position. Index 32 is presumably the cast list spotted in
# the numbered dump above and depends on the page's current layout — verify.
soup.find_all("ul")[32]
>>
<ul><li><a href="/wiki/%EC%B1%84%EC%8B%9C%EB%9D%BC" title="채시라">채시라</a> : 윤여옥 역 (아역: <a href="/wiki/%EA%B9%80%EB%AF%BC%EC%A0%95_(1982%EB%85%84)" title="김민정 (1982년)">김민정</a>)</li>
<li><a href="/wiki/%EB%B0%95%EC%83%81%EC%9B%90" title="박상원">박상원</a> : 장하림(하리모토 나츠오) 역 (아역: <a href="/wiki/%EA%B9%80%ED%83%9C%EC%A7%84_(%EC%88%98%ED%95%84%EA%B0%80)" title="김태진 (수필가)">김태진</a>)</li>
<li><a href="/wiki/%EC%B5%9C%EC%9E%AC%EC%84%B1....
--------------------------
# .text yields the list's text without tags; the result still contains
# non-breaking spaces (\xa0) and newlines between the items.
soup.find_all("ul")[32].text
>>
'채시라\xa0: 윤여옥 역 (아역: 김민정)\n박상원\xa0: 장하림(하리모토 나츠오) 역 (아역: 김태진)\n최재성\xa0: 최대치(사카이) 역 (아역: 장덕수)'
--------------------------
# strip() only trims leading/trailing whitespace, so the \xa0 and \n
# characters inside the text remain.
soup.find_all("ul")[32].text.strip()
>>
'채시라\xa0: 윤여옥 역 (아역: 김민정)\n박상원\xa0: 장하림(하리모토 나츠오) 역 (아역: 김태진)\n최재성\xa0: 최대치(사카이) 역 (아역: 장덕수)'
--------------------------
# Delete every non-breaking space (U+00A0) and newline (U+000A) in one pass;
# the list items end up joined with no separator between them.
soup.find_all("ul")[32].text.strip().translate({0xA0: None, 0x0A: None})
>>
'채시라: 윤여옥 역 (아역: 김민정)박상원: 장하림(하리모토 나츠오) 역 (아역: 김태진)최재성: 최대치(사카이) 역 (아역: 장덕수)'
위 글은 제로베이스 데이터 취업 스쿨의 강의자료를 참고하여 작성되었습니다.