[python] #5. BeautifulSoup - find_all (1)

exoluse·2021년 10월 11일

python - web crawling

목록 보기

6/20

자료를 찾고 공부를 하다보니

실질적인 크롤링 보다 BeautifulSoup 에 할애하는 시간이 훠~~얼씬 많다. 대충 보니 문서의 3분의 1정도 읽은것 같아 보이는데 최대한 빨리 빡집중 해서 정리해 보도록 하겠다. 얼마나 시간이 걸릴지는 나도 모른다...

강의로 잠깐 맛봤던 find_all

다시 한번 볼 기회가 생겨 영광(?)이다. 소스부터 둘러보자.

import requests
from bs4 import BeautifulSoup

# Sample page: one <b> nested inside <p class="title"> and one standalone
# <b class="vel">.
markup = """
<html><head><title>exoluse's velog</title></head>

<body>
<p class="title"><b>exoluse's velog</b></p>
<b class="vel">exoluse's velog</b>
</body>

</html>

"""

document = BeautifulSoup(markup, "html.parser")

# Search by tag name: collect every <b> element found in the document.
bold_tags = document.find_all("b")
print(bold_tags)

<!-- 결과 : b태그 찾아서 출력 (ResultSet 타입) -->
[<b>exoluse's velog</b>, <b class="vel">exoluse's velog</b>]

여기까지는 soso 하다.

여러 태그를 한번에 뽑기.

# Sample page: one <b> nested inside <p class="title"> and one standalone
# <b class="vel">.
markup = """
<html><head><title>exoluse's velog</title></head>

<body>
<p class="title"><b>exoluse's velog</b></p>
<b class="vel">exoluse's velog</b>
</body>

</html>

"""

document = BeautifulSoup(markup, "html.parser")

# A list of tag names matches an element whose name is any one of them.
wanted_names = ["b", "p"]
bold_or_paragraph = document.find_all(wanted_names)
print(bold_or_paragraph)

<!-- 결과 : 상위 엘리먼트부터 쭉 훑으면서 조건에 맞는 엘리먼트를 찾는다. -->
[<p class="title"><b>exoluse's velog</b></p>, <b>exoluse's velog</b>, <b class="vel">exoluse's velog</b>]

별 쓸모 없겠지만 - True

# Sample page: one <b> nested inside <p class="title"> and one standalone
# <b class="vel">.
markup = """
<html><head><title>exoluse's velog</title></head>

<body>
<p class="title"><b>exoluse's velog</b></p>
<b class="vel">exoluse's velog</b>
</body>

</html>

"""

document = BeautifulSoup(markup, "html.parser")

# True as the name filter places no restriction on the tag name, so every
# element in the tree is returned.
every_element = document.find_all(True)
print(every_element)

<!-- 결과 : 모든 엘리먼트 리턴 -->
[<html><head><title>exoluse's velog</title></head>
<body>
<p class="title"><b>exoluse's velog</b></p>
<b class="vel">exoluse's velog</b>
</body>
</html>, <head><title>exoluse's velog</title></head>, <title>exoluse's velog</title>, <body>
<p class="title"><b>exoluse's velog</b></p>
<b class="vel">exoluse's velog</b>
</body>, <p class="title"><b>exoluse's velog</b></p>, <b>exoluse's velog</b>, <b class="vel">exoluse's velog</b>]

속성값에도 True 가 사용될 수 있다.

# Sample page: the nested <b> has no class, the standalone <b> has class="vel".
markup = """
<html><head><title>exoluse's velog</title></head>

<body>
<p class="title"><b>exoluse's velog</b></p>
<b class="vel">exoluse's velog</b>
</body>

</html>

"""
document = BeautifulSoup(markup, "html.parser")
# class_=True keeps only <b> tags that carry a class attribute, whatever its
# value is.
bold_with_class = document.find_all("b", class_=True)
print(bold_with_class)

<!-- 결과 : class 속성이 어떤 값이든 모두 끌고온다 -->
[<b class="vel">exoluse's velog</b>]

특정 속성이 없는 엘리먼트를 가려낼 수 있다.

# Sample page: neither <b> tag carries a class attribute this time.
markup = """
<html><head><title>exoluse's velog</title></head>

<body>
<p class="title"><b>exoluse's velog</b></p>
<b>exoluse's velog</b>
</body>

</html>

"""

document = BeautifulSoup(markup, "html.parser")

# Same filter as before: with no class on any <b>, nothing qualifies and the
# result is empty.
bold_with_class = document.find_all("b", class_=True)
print(bold_with_class)

<!-- 결과 : 아무고또 리턴되지 않는다. b태그에 class 속성이 없어서 그렇다. -->
[]

태그와 속성값으로 탐색

# Sample page: one <b> nested inside <p class="title"> and one standalone
# <b class="vel">.
markup = """
<html><head><title>exoluse's velog</title></head>

<body>
<p class="title"><b>exoluse's velog</b></p>
<b class="vel">exoluse's velog</b>
</body>

</html>

"""

document = BeautifulSoup(markup, "html.parser")

# Two positional arguments: the tag name first, then the value to look for
# ("vel"); the only <b> that matches is the one with class="vel".
vel_bold_tags = document.find_all("b", "vel")
print(vel_bold_tags)

<!-- 결과 : 두 번째 위치 인자(attrs)에 문자열을 넘기면 class 속성값으로 취급된다. 즉 "어떤 속성이든"이 아니라 class가 vel인 b 태그를 찾아준다. 쓸모있네... -->
[<b class="vel">exoluse's velog</b>]

속성값만 가지고 탐색

# Sample page: one <b> nested inside <p class="title"> and one standalone
# <b class="vel">.
markup = """
<html><head><title>exoluse's velog</title></head>

<body>
<p class="title"><b>exoluse's velog</b></p>
<b class="vel">exoluse's velog</b>
</body>

</html>

"""

document = BeautifulSoup(markup, "html.parser")

# No tag name given: any element whose class is "vel" qualifies.
vel_elements = document.find_all(class_="vel")
print(vel_elements)

<!-- 결과 : 어떤 태그인지는 모르겠으나 class 속성이 vel인 것을 끌고온다. -->
[<b class="vel">exoluse's velog</b>]

속성 key:value 로 탐색

# Sample page: the standalone <b> carries two custom attributes, one written
# in snake_case (meta_tag) and one in camelCase (metaTag).
markup = """
<html><head><title>exoluse's velog</title></head>

<body>
<p class="title"><b>exoluse's velog</b></p>
<b class="vel" meta_tag="python" metaTag="python">exoluse's velog</b>
</body>

</html>

"""

document = BeautifulSoup(markup, "html.parser")

# Arbitrary attribute names can be passed as keyword filters.
by_snake_case = document.find_all(meta_tag="python")
# Same lookup spelled in camelCase; in the recorded output this one comes
# back empty.
by_camel_case = document.find_all(metaTag="python")

print(by_snake_case)
print(by_camel_case)

<!-- 결과 : 이건 쫌... 이상하다? -->
[<b class="vel" meta_tag="python" metatag="python">exoluse's velog</b>]
[]

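속성명 인자에 언더스코어("_")는 제대로 되는데 카멜케이스는 안되는 것이 확인되었다. 원인은 html.parser 가 태그 이름과 속성 이름을 모두 소문자로 바꿔서 저장하기 때문이다. 즉 metaTag 속성은 metatag 로 파싱되므로, find_all(metatag="python") 처럼 소문자로 검색해야 찾을 수 있다.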

여러 개의 속성으로 탐색

DB도 WHERE 절에 여러 개의 조건을 넣듯이 BeautifulSoup도 마찬가지다.

# Sample page: <p> has tag="Nana" class="title"; the standalone <b> has
# tag="Baba" class="vel".
markup = """
<html><head><title>exoluse's velog</title></head>

<body>
<p tag="Nana" class="title"><b>exoluse's velog</b></p>
<b tag="Baba" class="vel">exoluse's velog</b>
</body>

</html>

"""

document = BeautifulSoup(markup, "html.parser")

# Two attribute filters at once: an element has to satisfy both of them.
vel_and_baba = document.find_all(class_="vel", tag="Baba")
print(vel_and_baba)

<!-- 결과 : OR 조건이 아닌 AND 조건으로 적용된다. -->
[<b class="vel" tag="Baba">exoluse's velog</b>]

AND 조건이라는 것을 증명

# Sample page: class="vel" and tag="Nana" exist, but on different elements.
markup = """
<html><head><title>exoluse's velog</title></head>

<body>
<p tag="Nana" class="title"><b>exoluse's velog</b></p>
<b tag="Baba" class="vel">exoluse's velog</b>
</body>

</html>

"""

document = BeautifulSoup(markup, "html.parser")

# No single element has both class="vel" and tag="Nana", so an AND match
# finds nothing.
vel_and_nana = document.find_all(class_="vel", tag="Nana")

# The same query can be written with an attrs dict (the key is plain
# "class" there, not class_):
# vel_and_nana = document.find_all(attrs={"class" : "vel", "tag" : "Nana"})

print(vel_and_nana)

<!-- 결과 : class가 vel이고 tag가 Nana인 엘리먼트는 없다. And 조건 맞음 -->
[]