- conda install -c anaconda beautifulsoup4
- pip install beautifulsoup4
# import
from bs4 import BeautifulSoup
with open("../data/03. zerobase.html", "r") as f:
    page = f.read()
soup = BeautifulSoup(page, "html.parser")
print(soup.prettify())
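If the course file "../data/03. zerobase.html" is not at hand, parsing an inline string works the same way; the markup and the demo_html/demo_soup names below are a stand-in sketch, not the actual course file:
demo_html = """<html><body>
<p class="inner-text first-item" id="first">Data Science is funny.</p>
<a href="http://www.pinkwink.kr">PinkWink</a>
</body></html>"""
demo_soup = BeautifulSoup(demo_html, "html.parser")  # a string parses just like a file
print(demo_soup.prettify())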
# check the head tag
soup.head
# check the body tag
soup.body
# check the p tag
# find()
soup.p
soup.find("p")
class_
: when filtering by class, append an underscore ("_") because the bare name "class" collides with a Python reserved word
# Python reserved words and built-in names
# class, id, def, list, str, int, tuple...
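The collision is easy to confirm with the standard keyword module (a side check, not from the original notes):
import keyword
print(keyword.iskeyword("class"))  # True, hence BeautifulSoup's class_ workaround
print(keyword.iskeyword("list"))   # False: list is a built-in name, not a keyword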
soup.find("p", class_="innter-text second-item")
soup.find("p", {"class":"outer-text first-item"}).text.strip()
'Data Science is funny.'
# multiple conditions
soup.find("p", {"class":"inner-text first-item", "id":"first"})
find_all()
: returns every matching tag (Tag) as a list
soup.find_all("p")
# check a specific tag (here, by id)
soup.find_all(id="pw-link")[0].text
soup.find_all("p", class_="innter-text second-item")
len(soup.find_all("p"))
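find_all() also accepts a list of tag names and a limit argument; a small sketch against the same soup:
soup.find_all(["p", "a"])    # every <p> and <a> in document order
soup.find_all("p", limit=2)  # stop after the first two matches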
# print only the text attribute of each p tag in the list
for each_tag in soup.find_all("p"):
    print("=" * 50)
    print(each_tag.text)
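A variation not in the original notes: get_text(strip=True) trims the surrounding whitespace of each text node before joining.
for each_tag in soup.find_all("p"):
    print(each_tag.get_text(strip=True))  # same text, whitespace trimmed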
# extract the value stored in the href attribute of the a tags
links = soup.find_all("a")
links[0].get("href"), links[1]["href"]
('http://www.pinkwink.kr', 'https://www.python.org')
for each in links:
    href = each.get("href")  # each["href"] also works, but raises KeyError if href is missing
    text = each.get_text()
    print(text + "=>" + href)
PinkWink=>http://www.pinkwink.kr
Python=>https://www.python.org
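Collecting the anchors into a {text: href} dict makes later lookups easier; link_map is a hypothetical name, and get() quietly returns None for an <a> without an href:
link_map = {a.get_text(strip=True): a.get("href") for a in soup.find_all("a")}
print(link_map)  # e.g. {'PinkWink': 'http://www.pinkwink.kr', 'Python': 'https://www.python.org'}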