BeautifulSoup is a library used for parsing HTML.
Let's look at an example.
from bs4 import BeautifulSoup
html_doc = """<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
bs = BeautifulSoup(html_doc, "html.parser")
bs.find("a", id='link3')
BeautifulSoup("[html]").find("[tag]", id="[id]")
👉🏻 returns the single element in the HTML that matches the given tag and id
👉 findAll (find_all in bs4): returns all matching elements
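For example, running find_all on the bs object parsed above returns every matching tag (a quick sketch; the expected output is shown in the comments):
for a in bs.find_all("a", class_="sister"):
    print(a["id"], a["href"], a.text)
# link1 http://example.com/elsie Elsie
# link2 http://example.com/lacie Lacie
# link3 http://example.com/tillie Tillie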
import requests
def get_data_text(url):
    head = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36"}
    r = requests.get(url, headers=head)
    return r.text
👉 a function that fetches the news page and returns its HTML as text
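As a small defensive variant (my own addition, not in the original notes), you could fail fast on bad HTTP responses before returning the body; get_data_text_safe is a hypothetical name, and the rest of the notes keep using get_data_text as-is:
def get_data_text_safe(url):
    # hypothetical variant of get_data_text with basic error handling
    head = {"User-Agent": "Mozilla/5.0"}  # User-Agent shortened here for brevity
    r = requests.get(url, headers=head, timeout=10)
    r.raise_for_status()  # raises requests.HTTPError on 4xx/5xx responses
    return r.text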
news_url = "https://n.news.naver.com/mnews/article/009/0005088914?sid=105"
naver_news = get_data_text(news_url)
bs = BeautifulSoup(naver_news, "html.parser")
with open("./news.txt", "w", encoding="utf-8") as f:
    f.write(bs.find("div", id="dic_area").text.strip())
👉 uses the function above to save the news article body to a text file
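One caveat: find returns None when the element is missing, so writing the file blindly can raise an AttributeError. A slightly more careful version (a sketch, assuming the same dic_area id) would be:
article = bs.find("div", id="dic_area")  # None if the article body is not found
if article is not None:
    with open("./news.txt", "w", encoding="utf-8") as f:
        f.write(article.text.strip())
else:
    print("article body not found")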