
desktop B가 server A에 연결하는 경우
# Fetch a single page and dump its raw HTML bytes to stdout.
from urllib.request import urlopen

response = urlopen('http://pythonscraping.com/pages/page1.html')
print(response.read())
urlopen이 돌려주는 객체는 브라우저처럼 page로 렌더링되는 것이 아니라, 해당 URL 하나의 html 파일(raw HTML) 내용만을 담고 있다
urllib은 request, parse, error, robotparser의 서브모듈로 나뉨
https://docs.python.org/3/library/urllib.html 참고
pip install beautifulsoup4
위에서 urlopen을 이용해 얻은 객체를 BeautifulSoup객체로 변환
# Turn the raw response from urlopen into a BeautifulSoup object,
# then print the page's first <h1> tag.
from urllib.request import urlopen
from bs4 import BeautifulSoup

page = urlopen('http://www.pythonscraping.com/pages/page1.html')
markup = page.read()
bsObj = BeautifulSoup(markup, 'html.parser')
print(bsObj.h1)
scraping을 하다보면 예외 상황이 많이 발생할 수 있음
HTTP error를 반환받게 된다 -> urlopen함수는 HTTPError를 발생시킴
# Robust fetch: handle HTTP errors from urlopen, then missing tags from bs4.
from urllib.request import urlopen
from urllib.error import HTTPError  # canonical home of HTTPError (urllib.error)

from bs4 import BeautifulSoup

try:
    html = urlopen('http://www.pythonscraping.com/pages/page1.html')
except HTTPError as e:
    # The server answered with an error status (404, 500, ...).
    print(e)  # could instead return None, break out of a loop, etc.
else:
    # Request succeeded -- continue with the program.
    # Build the soup here so this snippet is self-contained
    # (the original relied on bsObj from the previous example).
    bsObj = BeautifulSoup(html.read(), 'html.parser')
    # Check whether nonExistingTag and anotherTag actually exist:
    # accessing a child of a missing tag raises AttributeError.
    try:
        badContent = bsObj.nonExistingTag.anotherTag
    except AttributeError:
        print('Tag was not found')
    else:
        if badContent is None:
            print('Tag was not found')
        else:
            print(badContent)
함수를 만들어 미리 예외처리를 해두면 재사용하기 좋은 web scraper를 만들 수 있다
# Reusable scraper helper: wrapping the error handling in a function gives
# callers a single None check instead of two try/except blocks each time.
from urllib.request import urlopen
from urllib.error import HTTPError  # fixed: original import line was not valid Python

from bs4 import BeautifulSoup


def getTitle(url):
    """Return the first <body><h1> tag of *url*, or None on failure.

    None is returned both when the server reports an HTTP error and when
    the expected tags are absent from the fetched document.
    """
    try:
        html = urlopen(url)
    except HTTPError:
        return None
    try:
        bsObj = BeautifulSoup(html.read(), 'html.parser')
        title = bsObj.body.h1
    except AttributeError:
        # bsObj.body was None, so .h1 on it raised -> tag is missing.
        return None
    return title


title = getTitle('http://www.pythonscraping.com/pages/page1.html')
if title is None:
    # fixed: the original single-quoted literal had an unescaped apostrophe
    print("Title couldn't be found")
else:
    print(title)