HTTP(HyperText Transfer Protocol)
HTML(Hyper Text Markup Language)
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<title>Test</title>
</head>
<body>
<p>Test</p>
</body>
</html>
태그(Tag)
<h3 class="tit_view">예제입니다.</h3>
requests 모듈
import requests
get 요청하기
# GET request: fetch a page and read the response body as text via `.text`.
import requests
url = 'https://www.naver.com/'
resp = requests.get(url)
resp.text
post 요청하기
# POST request: form fields go in `data` and are sent in the request body.
import requests
url = 'https://www.kangcom.com/member/member_check.asp'
# Sample login-form parameters for the demo site.
data = {
'id': 'testid',
'pwd': 'testpw'
}
resp = requests.post(url, data=data)
resp.text
HTTP header 데이터 이용하기
# Send a browser-like User-Agent header so the server serves the normal page
# instead of blocking/redirecting an obvious bot client.
url = 'https://news.v.daum.net/v/20190728165812603'
headers = {
'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36'
}
resp = requests.get(url, headers=headers)
resp.text
HTTP response 처리하기
# Branch on the HTTP status code of the response.
# Fixed: the if/else bodies were not indented, which is a SyntaxError in Python.
url = 'https://news.v.daum.net/v/20190728165812603'
resp = requests.get(url)
if resp.status_code == 200:
    # Success: inspect the response headers.
    resp.headers
else:
    print('error')
공공데이터 포털 OPEN API 사용하기
Endpoint 확인하기
endpoint = 'http://api.visitkorea.or.kr/openapi/service/rest/EngService/areaCode?serviceKey={}&numOfRows=10&pageSize=10&pageNo=1&MobileOS=ETC&MobileApp=AppTest&_type=json'.format(serviceKey)
key 값 확인하기
Parameter 확인하기
import requests
# NOTE(review): `serviceKey` must already hold the portal-issued API key -- confirm.
endpoint = 'http://api.visitkorea.or.kr/openapi/service/rest/EngService/areaCode?serviceKey={}&numOfRows=10&pageSize=10&pageNo={}&MobileOS=ETC&MobileApp=AppTest&_type=json'.format(serviceKey, 1)
resp = requests.get(endpoint)
print(resp.status_code) # 200
data = resp.json() # type: dict
# Drill into the JSON payload: first item of the area-code list.
print(data['response']['body']['items']['item'][0])
요청 및 Response 확인
beautifulsoup 모듈 사용하기
from bs4 import BeautifulSoup
html 문자열 파싱
# Sample HTML document used by the BeautifulSoup examples below.
html = '''
<html>
<head>
<title>BeautifulSoup test</title>
</head>
<body>
<div id='upper' class='test' custom='good'>
<h3 title='Good Content Title'>Contents Title</h3>
<p>Test contents</p>
</div>
<div id='lower' class='test' custom='nice'>
<p>Test Test Test 1</p>
<p>Test Test Test 2</p>
<p>Test Test Test 3</p>
</div>
</body>
</html>'''
find 함수
# Parse the sample HTML. Passing an explicit parser ('html.parser') avoids
# bs4's GuessedAtParserWarning and keeps results consistent across machines.
soup = BeautifulSoup(html, 'html.parser')
# find returns the FIRST matching tag (or None if nothing matches).
soup.find('h3')
# <h3 title="Good Content Title">Contents Title</h3>
soup.find('p')
# <p>Test contents</p>
# `class` is a Python keyword, so bs4 uses the `class_` parameter instead.
soup.find('div', class_='test')
"""
<div class="test" custom="good" id="upper">
<h3 title="Good Content Title">Contents Title</h3>
<p>Test contents</p>
</div>
"""
# Several attribute filters at once can be passed as a dict via `attrs`.
attrs = {'id': 'upper', 'class': 'test'}
soup.find('div', attrs=attrs)
"""
<div class="test" custom="good" id="upper">
<h3 title="Good Content Title">Contents Title</h3>
<p>Test contents</p>
</div>
"""
find_all 함수
# find_all returns a list of EVERY tag matching the filters (here both divs).
soup.find_all('div', class_='test')
"""
[<div class="test" custom="good" id="upper">
<h3 title="Good Content Title">Contents Title</h3>
<p>Test contents</p>
</div>,
<div class="test" custom="nice" id="lower">
<p>Test Test Test 1</p>
<p>Test Test Test 2</p>
<p>Test Test Test 3</p>
</div>]
"""
get_text 함수
# get_text() returns all text contained in a tag (including its children).
tag1 = soup.find('h3')
# Fixed: the original printed/used undefined `tag` instead of `tag1`.
print(tag1)
# <h3 title="Good Content Title">Contents Title</h3>
tag1.get_text()
# 'Contents Title'
tag2 = soup.find('p')
print(tag2)
# <p>Test contents</p>
tag2.get_text()
# 'Test contents'
tag3 = soup.find('div', id='upper')
print(tag3)
"""
<div class="test" custom="good" id="upper">
<h3 title="Good Content Title">Contents Title</h3>
<p>Test contents</p>
</div>
"""
# strip() removes leading/trailing whitespace; the inner newline remains.
tag3.get_text().strip()
# 'Contents Title\nTest contents'
attribute 값 추출하기
# Tag attributes are read with dict-style indexing on the tag object.
tag = soup.find('h3')
print(tag)
# <h3 title="Good Content Title">Contents Title</h3>
tag['title']
# 'Good Content Title'
다음 뉴스 데이터 추출
# Extract article data (title, reporter, date) from a Daum news page.
import requests
from bs4 import BeautifulSoup
url = 'https://news.v.daum.net/v/20190728165812603'
resp = requests.get(url)
# Explicit parser avoids bs4's GuessedAtParserWarning.
soup = BeautifulSoup(resp.text, 'html.parser')
# Fixed: the class-name string was missing its closing quote.
title = soup.find('h3', class_='tit_view')
# Fixed: title.get.test() -> title.get_text().
title.get_text()
# '일론머스크 "테슬라에서 넷플릭스·유튜브 즐길 날 온다"'
soup.find_all('span', class_='txt_info')
"""
[<span class="txt_info">이민우</span>,
<span class="txt_info">입력 <span class="num_date">2019. 07. 28. 16:58</span></span>]
"""
# Narrow the search: find() can be called on a previously-found tag.
info = soup.find('span', class_='info_view')
info.find('span', class_='txt_info')
# <span class="txt_info">이민우</span>
# CSS-selector based extraction with soup.select().
import requests
from bs4 import BeautifulSoup
url = 'https://news.v.daum.net/v/20190728165812603'
resp = requests.get(url)
# Explicit parser avoids bs4's GuessedAtParserWarning.
soup = BeautifulSoup(resp.text, 'html.parser')
soup.select('h3')
"""
[<h3 class="tit_view" data-translation="true">일론머스크 "테슬라에서 넷플릭스·유튜브 즐길 날 온다"</h3>,
<h3 class="tit_cp">아시아경제 주요 뉴스</h3>,
<h3 class="txt_newsview">많이본 뉴스</h3>,
<h3 class="txt_newsview">포토&TV</h3>,
<h3 class="txt_newsview">이 시각 추천뉴스</h3>]
"""
# id selector combined with the child combinator '>'
soup.select('#harmonyContainer > p')
# [<p data-translation="true"><ⓒ경제를 보는 눈, 세계를 보는 창 아시아경제 무단전재 배포금지></p>]
soup.select('h3.tit_view') # soup.select('h3[class="tit_view"]')
# [<h3 class="tit_view" data-translation="true">일론머스크 "테슬라에서 넷플릭스·유튜브 즐길 날 온다"</h3>]
# attribute value starts-with match
soup.select('h3[class^="tx"]')
"""
[<h3 class="txt_newsview">많이본 뉴스</h3>,
<h3 class="txt_newsview">포토&TV</h3>,
<h3 class="txt_newsview">이 시각 추천뉴스</h3>]
"""
# attribute value ends-with match
soup.select('h3[class$="_view"]')
# [<h3 class="tit_view" data-translation="true">일론머스크 "테슬라에서 넷플릭스·유튜브 즐길 날 온다"</h3>]
# substring (attribute contains) match
soup.select('h3[class*="view"]')
"""
[<h3 class="tit_view" data-translation="true">일론머스크 "테슬라에서 넷플릭스·유튜브 즐길 날 온다"</h3>,
<h3 class="txt_newsview">많이본 뉴스</h3>,
<h3 class="txt_newsview">포토&TV</h3>,
<h3 class="txt_newsview">이 시각 추천뉴스</h3>]
"""
soup.select('span.txt_info:nth-child(1)')
# [<span class="txt_info">이민우</span>]
import re
# find h1, h2, h3, ... by regex.
# Fixed: raw string r'h\d' -- '\d' is an invalid escape sequence in a plain
# string literal (SyntaxWarning on Python 3.12+).
soup.find_all(re.compile(r'h\d'))
머신러닝과 데이터 분석 A-Z 올인원 패키지 Online. 👉 https://bit.ly/3cB3C8y