- conda install -c anaconda beautifulsoup4
- pip install beautifulsoup4
# import
from bs4 import BeautifulSoup
with open("../data/03. zerobase.html", "r") as f:
    page = f.read()
soup = BeautifulSoup(page, "html.parser")
print(soup.prettify())
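If the course file "../data/03. zerobase.html" is not at hand, parsing an inline string works the same way; the markup and the demo_html/demo_soup names below are a stand-in sketch, not the actual course file:
demo_html = """<html><body>
<p class="inner-text first-item" id="first">Data Science is funny.</p>
<a href="http://www.pinkwink.kr">PinkWink</a>
</body></html>"""
demo_soup = BeautifulSoup(demo_html, "html.parser")  # a string parses just like a file
print(demo_soup.prettify())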
# check the head tag
soup.head
# check the body tag
soup.body
# check the p tag
# find()
soup.p
soup.find("p")
class_
: when filtering by class, append an underscore ("_") because the bare name "class" collides with a Python reserved word
# Python reserved words and built-in names
# class, id, def, list, str, int, tuple...
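The collision is easy to confirm with the standard keyword module (a side check, not from the original notes):
import keyword
print(keyword.iskeyword("class"))  # True, hence BeautifulSoup's class_ workaround
print(keyword.iskeyword("list"))   # False: list is a built-in name, not a keyword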
soup.find("p", class_="innter-text second-item")
soup.find("p", {"class":"outer-text first-item"}).text.strip()
'Data Science is funny.'
# multiple conditions
soup.find("p", {"class":"inner-text first-item", "id":"first"})
find_all()
: returns every matching tag (Tag) as a list
soup.find_all("p")
# check a specific tag (here, by id)
soup.find_all(id="pw-link")[0].text
soup.find_all("p", class_="innter-text second-item")
len(soup.find_all("p"))
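find_all() also accepts a list of tag names and a limit argument; a small sketch against the same soup:
soup.find_all(["p", "a"])    # every <p> and <a> in document order
soup.find_all("p", limit=2)  # stop after the first two matches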
# print only the text attribute of each p tag in the list
for each_tag in soup.find_all("p"):
    print("=" * 50)
    print(each_tag.text)
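A variation not in the original notes: get_text(strip=True) trims the surrounding whitespace of each text node before joining.
for each_tag in soup.find_all("p"):
    print(each_tag.get_text(strip=True))  # same text, whitespace trimmed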
# extract the value stored in the href attribute of the a tags
links = soup.find_all("a")
links[0].get("href"), links[1]["href"]
('http://www.pinkwink.kr', 'https://www.python.org')
for each in links:
    href = each.get("href")  # each["href"] also works, but raises KeyError if href is missing
    text = each.get_text()
    print(text + "=>" + href)
PinkWink=>http://www.pinkwink.kr
Python=>https://www.python.org
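Collecting the anchors into a {text: href} dict makes later lookups easier; link_map is a hypothetical name, and get() quietly returns None for an <a> without an href:
link_map = {a.get_text(strip=True): a.get("href") for a in soup.find_all("a")}
print(link_map)  # e.g. {'PinkWink': 'http://www.pinkwink.kr', 'Python': 'https://www.python.org'}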