BeautifulSoup: 태그로 되어 있는 문서를 해석하는 파이썬 모듈
#1
conda install -c anaconda beautifulsoup4
#2
pip install beautifulsoup4
from bs4 import BeautifulSoup

# Read the local test HTML file and parse it with the built-in parser.
# `with` guarantees the file handle is closed (the original leaked it),
# and an explicit encoding avoids platform-dependent decoding.
# assumes the file is UTF-8 — TODO confirm
with open("../data/03. test_first.html", "r", encoding="utf-8") as f:
    page = f.read()
soup = BeautifulSoup(page, "html.parser")
print(soup.prettify())
# head tag
soup.head
# body tag — BUG FIX: was `soup.boody`, a nonexistent tag name that
# silently evaluates to None instead of returning the <body> element
soup.body
# p — attribute access returns only the FIRST <p> in the document
soup.p
# find a <p> by its full class string, then strip surrounding whitespace
soup.find("p",{"class":"inner-text first-item"}).text.strip()
# fixed typo in the class name: "fisrt-item" -> "first-item"
# (the misspelled class matches nothing, so find() returned None)
soup.find("p",{"class":"inner-text first-item","id":"first"})
find_all: 여러 개를 찾고 싶을 때 사용
# Collect every <p> tag, then dump each one's text
# preceded by a dashed separator line.
soup.find_all("p")
for paragraph in soup.find_all('p'):
    separator = "-" * 50
    print(separator)
    print(paragraph.text)
# imports
from urllib.request import urlopen
from bs4 import BeautifulSoup
# Fetch the Naver market-index page and parse it with the lxml parser.
url = "https://finance.naver.com/marketindex/"
# NOTE(review): `page` is an HTTP response object; BeautifulSoup reads
# (and exhausts) it below, and it is never explicitly closed.
page = urlopen(url)
soup = BeautifulSoup(page,"lxml")
print(soup.prettify())
# the page for this URL is now held in `soup`
# it can also be fetched via Request (see the next snippet)
# Fetch the same page again, this time keeping the response object
# so its HTTP status can be inspected before parsing.
url = "https://finance.naver.com/marketindex/"
response = urlopen(url)
response.status  # 200 on success
# BUG FIX: was BeautifulSoup(page, "lxml") — `page` is the PREVIOUS
# snippet's already-consumed response, so the soup came out empty.
# Parse the response fetched just above instead.
soup = BeautifulSoup(response, "lxml")
print(soup.prettify())
3.find
# Three equivalent ways to select <span class="value"> elements:
# 1 — positional: find_all's second positional argument is the CSS class
soup.find_all("span","value"),len(soup.find_all("span","value"))
# 2 — keyword: class_= (trailing underscore avoids the `class` keyword);
# fixed: the len() call now uses the same keyword form as the lookup,
# instead of silently falling back to the positional form
soup.find_all("span", class_="value"),len(soup.find_all("span", class_="value"))
# 3 — attrs dict
soup.find_all("span",{"class":"value"}),len(soup.find_all("span",{"class":"value"}))
>>> ... <span class="value">86197.25</span> , 12
# Three ways to read the text of the first matching <span class="value">:
# 1 — the .text property
soup.find_all("span",{"class":"value"})[0].text
# 2 — the .string attribute
soup.find_all("span",{"class":"value"})[0].string
# 3 — the .get_text() method (method form of .text)
soup.find_all("span",{"class":"value"})[0].get_text()
>>> ('1,336.00')
# Extract the href (link target) attribute from every <a> tag.
links = soup.find_all("a")
# .get("href") returns None when the attribute is missing;
# ["href"] raises KeyError instead.
links[0].get("href"),links[1]["href"]
links
for each in links:
    href = each.get("href")
    text = each.get_text()
    # BUG FIX: an <a> without an href returns None from .get(),
    # which made the original string concatenation raise TypeError.
    if href is not None:
        print(text + " => " + href)
[참고] urlencoding vs urldecoding
# 1 — percent-encode a Korean search term into a Wikipedia URL
# FIX: import the submodule explicitly; a bare `import urllib` does not
# guarantee `urllib.parse` is available as an attribute.
import urllib.parse
from urllib.request import urlopen,Request

html = "https://ko.wikipedia.org/wiki/{search_words}"
# quote() percent-encodes the Korean title so it forms a valid URL path
req = Request(html.format(search_words=urllib.parse.quote("여명의_눈동자")))
response = urlopen(req)
response.status  # 200 on success
soup = BeautifulSoup(response,"html.parser")
print(soup.prettify())
# 2: print every <ul> element's text, prefixed with its index.
# FIX (idiom): replaced the manual counter (n = 0 / n += 1)
# with enumerate; output is identical.
for n, each in enumerate(soup.find_all("ul")):
    print("=>" + str(n) + "=========================")
    print(each.get_text())
# 3: pull out just the value we want from the 33rd <ul>.
# strip() drops surrounding whitespace; the replace() calls remove the
# non-breaking spaces (\xa0) and newlines left over from the page layout.
raw_text = soup.find_all("ul")[32].text
raw_text.strip().replace("\xa0","").replace("\n","")
[참고]
정규표현식 (Regular Expressions)
: https://nachwon.github.io/regular-expressions/