import webbrowser

# Open the Naver home page in the default browser.
# Scheme added: a bare host such as "www.naver.com" is not a valid URL,
# and on some platforms webbrowser refuses it or treats it as a path.
url = "https://www.naver.com"
webbrowser.open(url)
True
import webbrowser
from urllib.parse import quote

# Search Naver for a keyword in a new browser window.
naver_search_url = "https://search.naver.com/search.naver?where=nexearch&sm=top_hty&fbm=1&ie=utf8&query="
search_word = '파이썬'
# Percent-encode the non-ASCII keyword so the query string is a valid URL
# instead of relying on the browser to repair it.
url = naver_search_url + quote(search_word)
webbrowser.open_new(url)
True
import webbrowser
from urllib.parse import quote

# Search Google for a keyword in a new browser window.
# Local renamed from "naver_search_url" — it is Google's search endpoint.
google_search_url = "https://www.google.com/search?q="
search_word = '파이썬'
url = google_search_url + quote(search_word)  # percent-encode non-ASCII keyword
webbrowser.open_new(url)
True
import webbrowser

# Open several portal sites, each in a new browser window.
# Schemes added: bare hosts like "www.naver.com" are not valid URLs.
urls = ['https://www.naver.com', 'https://www.daum.net', 'https://www.google.com']
for url in urls:
    webbrowser.open_new(url)
import webbrowser
from urllib.parse import quote_plus

# Run several Google searches, one browser window each.
google_url = "https://www.google.com/search?q="
search_words = ['python web scraping', 'python webbrowser']
for search_word in search_words:
    # quote_plus encodes spaces as '+', producing a well-formed query URL.
    webbrowser.open_new(google_url + quote_plus(search_word))
%%writefile HTML_example.html
<!doctype html>
<html>
<head>
<meta charset="utf-8">
<title>이것은 HTML 예제</title>
</head>
<body>
<h1>출간된 책 정보</h1>
<p id="book_title">이해가 쏙쏙 되는 파이썬</p>
<p id="author">홍길동</p>
<p id="publisher">위키북스 출판사</p>
<p id="year">2018</p>
</body>
</html>
Writing HTML_example.html
import requests
# Fetch Google's Korean home page; `r` is a requests.Response object
# (echoed below as <Response [200]> when the request succeeds).
r = requests.get("https://www.google.co.kr")
r
<Response [200]>
# Peek at the first 100 characters of the response body.
r.text[:100]
'<!doctype html><html itemscope="" itemtype="http://schema.org/WebPage" lang="ko"><head><meta content'
import requests

# Download the page source in one expression and show its first 100 chars.
html = requests.get("https://www.google.co.kr").text
html[:100]
'<!doctype html><html itemscope="" itemtype="http://schema.org/WebPage" lang="ko"><head><meta content'
%%writefile HTML_examp2.html
<html><body><div><span>
<a href=http://www.naver.com>naver</a>
<a href=https://www.google.com>google</a>
<a href=http://www.daum.net/>daum</a>
</span></div></body></html>
Overwriting HTML_examp2.html
from bs4 import BeautifulSoup
# Sample HTML with three links. The href attributes are deliberately
# unquoted and two lines end with '\' (line continuations inside the
# literal); the parser normalizes both, as the echo below shows.
html = """<html><body><div><span>
<a href=http://www.naver.com>naver</a>
<a href=https://www.google.com>google</a>\
<a href=http://www.daum.net/>daum</a>\
</span></div></body></html>"""
# Parse with the lxml backend into a navigable tree.
soup = BeautifulSoup(html, 'lxml')
soup
<html><body><div><span>
<a href="http://www.naver.com">naver</a>
<a href="https://www.google.com">google</a><a href="http://www.daum.net/">daum</a></span></div></body></html>
print(soup.prettify())
<html>
<body>
<div>
<span>
<a href="http://www.naver.com">
naver
</a>
<a href="https://www.google.com">
google
</a>
<a href="http://www.daum.net/">
daum
</a>
</span>
</div>
</body>
</html>
soup.find("a")
<a href="http://www.naver.com">naver</a>
type(soup.find("a"))
bs4.element.Tag
soup.find('a').get_text()
'naver'
type(soup.find('a').get_text())
str
soup.find_all('a')
[<a href="http://www.naver.com">naver</a>,
<a href="https://www.google.com">google</a>,
<a href="http://www.daum.net/">daum</a>]
type( soup.find_all('a'))
bs4.element.ResultSet
# Collect every <a> tag, then print the visible text of each link.
site_names = soup.find_all('a')
for anchor in site_names:
    print(anchor.get_text())
naver
google
daum
from bs4 import BeautifulSoup
# Book-list markup: alternating <p id="book_title"> / <p id="author"> pairs.
html2 = """
<html>
<head>
<title>작품과 작가 모음</title>
</head>
<body>
<h1>책 정보</h1>
<p id="book_title">토지</p>
<p id="author">박경리</p>
<p id="book_title">태백산맥</p>
<p id="author">조정래</p>
<p id="book_title">감옥으로부터의 사색</p>
<p id="author">신영복</p>
</body>
</html>
"""
soup2 = BeautifulSoup(html2, "lxml")
# Attribute-style access (soup2.title) returns the FIRST matching tag.
soup2.title
<title>작품과 작가 모음</title>
soup2.body
<body>
<h1>책 정보</h1>
<p id="book_title">토지</p>
<p id="author">박경리</p>
<p id="book_title">태백산맥</p>
<p id="author">조정래</p>
<p id="book_title">감옥으로부터의 사색</p>
<p id="author">신영복</p>
</body>
soup2.h1
<h1>책 정보</h1>
soup2.find('p')
<p id="book_title">토지</p>
soup2.find_all('p')
[<p id="book_title">토지</p>,
<p id="author">박경리</p>,
<p id="book_title">태백산맥</p>,
<p id="author">조정래</p>,
<p id="book_title">감옥으로부터의 사색</p>,
<p id="author">신영복</p>]
type(soup2.find_all('p'))
bs4.element.ResultSet
soup2.find('p', {'id' : 'book_title'})
<p id="book_title">토지</p>
soup2.find('p', {'id' : 'author'})
<p id="author">박경리</p>
soup2.find_all('p', {'id' : 'book_title'})
[<p id="book_title">토지</p>,
<p id="book_title">태백산맥</p>,
<p id="book_title">감옥으로부터의 사색</p>]
soup2.find_all('p', {'id' : 'author'})
[<p id="author">박경리</p>, <p id="author">조정래</p>, <p id="author">신영복</p>]
from bs4 import BeautifulSoup

# Re-parse the book list and print each pairing as "title/author".
soup2 = BeautifulSoup(html2, 'lxml')
book_titles = soup2.find_all('p', {'id' : 'book_title'})
authors = soup2.find_all('p', {'id' : 'author'})
for title_tag, author_tag in zip(book_titles, authors):
    print(title_tag.get_text() + '/' + author_tag.get_text())
토지/박경리
태백산맥/조정래
감옥으로부터의 사색/신영복
<html>
<head>
<title>작품과 작가 모음</title>
</head>
<body>
<h1>책 정보</h1>
<p id="book_title">토지</p>
<p id="author">박경리</p>
<p id="book_title">태백산맥</p>
<p id="author">조정래</p>
<p id="book_title">감옥으로부터의 사색</p>
<p id="author">신영복</p>
</body>
</html>
soup2.select('body h1')
[<h1>책 정보</h1>]
soup2.select('body p')
[<p id="book_title">토지</p>,
<p id="author">박경리</p>,
<p id="book_title">태백산맥</p>,
<p id="author">조정래</p>,
<p id="book_title">감옥으로부터의 사색</p>,
<p id="author">신영복</p>]
soup2.select('p')
[<p id="book_title">토지</p>,
<p id="author">박경리</p>,
<p id="book_title">태백산맥</p>,
<p id="author">조정래</p>,
<p id="book_title">감옥으로부터의 사색</p>,
<p id="author">신영복</p>]
soup2.select('p#book_title')
[<p id="book_title">토지</p>,
<p id="book_title">태백산맥</p>,
<p id="book_title">감옥으로부터의 사색</p>]
soup2.select('p#author')
[<p id="author">박경리</p>, <p id="author">조정래</p>, <p id="author">신영복</p>]
%%writefile HTML_example_my_site.html
<!doctype html>
<html>
<head>
<meta charset="utf-8">
<title>사이트 모음</title>
</head>
<body>
<p id="title"><b>자주 가는 사이트 모음</b></p>
<p id="contents">이곳은 자주 가는 사이트를 모아둔 곳입니다.</p>
<a href="http://www.naver.com" class="portal" id="naver">네이버</a> <br>
<a href="https://www.google.com" class="search" id="google">구글</a> <br>
<a href="http://www.daum.net" class="portal" id="danum">다음</a> <br>
<a href="http://www.nl.go.kr" class="government" id="nl">국립중앙도서관</a>
</body>
</html>
Writing HTML_example_my_site.html
# Read the saved example page and parse it.
# A context manager guarantees the file handle is closed even on error
# (the original open()/close() pair leaked the handle on exceptions).
with open('HTML_example_my_site.html', encoding='utf-8') as f:
    html3 = f.read()
soup3 = BeautifulSoup(html3, 'lxml')
soup3.select('a')
[<a class="portal" href="http://www.naver.com" id="naver">네이버</a>,
<a class="search" href="https://www.google.com" id="google">구글</a>,
<a class="portal" href="http://www.daum.net" id="danum">다음</a>,
<a class="government" href="http://www.nl.go.kr" id="nl">국립중앙도서관</a>]
soup3.select('a.portal')
[<a class="portal" href="http://www.naver.com" id="naver">네이버</a>,
<a class="portal" href="http://www.daum.net" id="danum">다음</a>]
soup3.select('html a')
[<a class="portal" href="http://www.naver.com" id="naver">네이버</a>,
<a class="search" href="https://www.google.com" id="google">구글</a>,
<a class="portal" href="http://www.daum.net" id="danum">다음</a>,
<a class="government" href="http://www.nl.go.kr" id="nl">국립중앙도서관</a>]
soup3.select('body a')
[<a class="portal" href="http://www.naver.com" id="naver">네이버</a>,
<a class="search" href="https://www.google.com" id="google">구글</a>,
<a class="portal" href="http://www.daum.net" id="danum">다음</a>,
<a class="government" href="http://www.nl.go.kr" id="nl">국립중앙도서관</a>]
soup3.select('html body a')
[<a class="portal" href="http://www.naver.com" id="naver">네이버</a>,
<a class="search" href="https://www.google.com" id="google">구글</a>,
<a class="portal" href="http://www.daum.net" id="danum">다음</a>,
<a class="government" href="http://www.nl.go.kr" id="nl">국립중앙도서관</a>]
soup3.select('a')
[<a class="portal" href="http://www.naver.com" id="naver">네이버</a>,
<a class="search" href="https://www.google.com" id="google">구글</a>,
<a class="portal" href="http://www.daum.net" id="danum">다음</a>,
<a class="government" href="http://www.nl.go.kr" id="nl">국립중앙도서관</a>]
soup3.select('a#naver')
[<a class="portal" href="http://www.naver.com" id="naver">네이버</a>]
soup3.select('a[href]')
[<a class="portal" href="http://www.naver.com" id="naver">네이버</a>,
<a class="search" href="https://www.google.com" id="google">구글</a>,
<a class="portal" href="http://www.daum.net" id="danum">다음</a>,
<a class="government" href="http://www.nl.go.kr" id="nl">국립중앙도서관</a>]
%%writefile br_example_constitution.html
<!doctype html>
<html>
<head>
<meta charset="utf-8">
<title>줄 바꿈 테스트 예제</title>
</head>
<body>
<p id="title"><b>대한민국헌법</b></p>
<p id="content">제1조 <br/>①대한민국은 민주공화국이다.<br/>②대한민국의 주권은 국민에게 있고,
모든 권력은 국민으로부터 나온다.</p>
<p id="content">제2조 <br/>①대한민국의 국민이 되는 요건은 법률로 정한다.<br/>②국가는 법률이
정하는 바에 의하여 재외국민을 보호할 의무를 진다.</p>
</body>
</html>
Writing br_example_constitution.html
# A single <p> whose text contains <br/> line breaks.
html1 = """<p id="content">제1조 <br/>①대한민국은 민주공화국이다.<br/>②대한민국의 주권은 국민에게 있고,
모든 권력은 국민으로부터 나온다.</p>"""
soup1= BeautifulSoup(html1, 'lxml')
content1 = soup1.find('p', {'id':'content'})
# find() returns only the FIRST <br/>; replace_with swaps just that tag
# for a newline (the <br/> echoed below is the tag that was removed —
# the second <br/> is still present in content1).
br_content = content1.find('br')
br_content.replace_with("\n")
<br/>
content1
<p id="content">제1조
①대한민국은 민주공화국이다.<br/>②대한민국의 주권은 국민에게 있고,
모든 권력은 국민으로부터 나온다.</p>
# Re-parse the paragraph, then swap EVERY <br/> for a newline character.
soup2 = BeautifulSoup(html1, "lxml")
content2 = soup2.find('p', {'id' : 'content'})
br_contents = content2.find_all('br')
for br_tag in br_contents:
    br_tag.replace_with("\n")
<p id="content">제1조 <br/>①대한민국은 민주공화국이다.<br/>②대한민국의 주권은 국민에게 있고,
모든 권력은 국민으로부터 나온다.</p>
def replace_newline(soup_html):
    """Replace every <br> tag inside *soup_html* with a newline, in place.

    Returns the same (mutated) object so calls can be chained.
    """
    for br_tag in soup_html.find_all('br'):
        br_tag.replace_with("\n")
    return soup_html
soup2 = BeautifulSoup(html1, "lxml")
content2 = soup2.find('p', {'id' : 'content'})
# Convert every <br/> to '\n', then pull out clean multi-line text.
content3 = replace_newline(content2)
print(content3.get_text())
제1조
①대한민국은 민주공화국이다.
②대한민국의 주권은 국민에게 있고,
모든 권력은 국민으로부터 나온다.
import requests
from bs4 import BeautifulSoup

# Scrape the top-ranked Korean websites from Alexa's country page.
# NOTE(review): alexa.com was retired in 2022, so this request will no
# longer return the ranking page — kept for reference; verify before reuse.
url = "https://www.alexa.com/topsites/countries/KR"
html_website_ranking = requests.get(url).text
soup_website_ranking = BeautifulSoup(html_website_ranking, "lxml")
# Each ranked site appears as a link directly inside a <p> element.
website_ranking = soup_website_ranking.select("p > a")
website_ranking[:6]
[<a href="/siteinfo/google.com">Google.com</a>,
<a href="/siteinfo/naver.com">Naver.com</a>,
<a href="/siteinfo/youtube.com">Youtube.com</a>,
<a href="/siteinfo/daum.net">Daum.net</a>,
<a href="/siteinfo/tistory.com">Tistory.com</a>,
<a href="/siteinfo/kakao.com">Kakao.com</a>]
website_ranking[0].get_text()
'Google.com'
# Extract the visible site names from the ranking links.
website_ranking_address = [tag.get_text() for tag in website_ranking]
# Print the top six as "rank:site". enumerate over a slice replaces the
# original hard-coded range(6), which raised IndexError when fewer than
# six results came back.
for rank, site in enumerate(website_ranking_address[:6], start=1):
    print("{0}:{1}".format(rank, site))
1:Google.com
2:Naver.com
3:Youtube.com
4:Daum.net
5:Tistory.com
6:Kakao.com
type(website_ranking_address)
list
import pandas as pd

# Wrap the scraped site names in a one-column DataFrame.
website_ranking_dict = {"Website" : website_ranking_address}
df = pd.DataFrame(website_ranking_dict)
# Display the first six rows.
df.head(6)
       Website
0   Google.com
1    Naver.com
2  Youtube.com
3     Daum.net
4  Tistory.com
5    Kakao.com
HTML : 뼈대 CSS : 디자인 Javascript : 동적
%%writefile playdata_home.html
<html>
<head>
<meta charset = 'utf-8'>
<title>플레이데이터 11기 홈페이지</title>
</head>
<body>
<input type="text" value="아이디를 입력하세요">
<input type="password">
<input type="button" value="로그인">
<a href="http://google.com"> 구글로 이동하기</a>
</body>
</html>
Writing playdata_home.html
//*[@id="account"]/a
/html/body/div[2]/div[3]/div[3]/div/div[2]/a
import requests

# Fetch Google and report whether the HTTP status indicates success.
res = requests.get("http://google.com")
is_ok = res.status_code == requests.codes.ok
if not is_ok:
    print("문제가 생겼습니다.", res.status_code)
else:
    print("정상입니다.")
정상입니다.
import requests
res = requests.get("http://google.com")
# Raises requests.HTTPError for 4xx/5xx responses — a one-line
# alternative to checking status_code by hand.
res.raise_for_status()
print("응답코드 :", res.status_code)
응답코드 : 200
len(res.text)
14026
# Persist the downloaded page source so it can be inspected offline.
with open("google.html", "w", encoding="utf8") as f:
    f.write(res.text)
주민등록번호
111111-2222222
이메일 주소
Ghyunghee@gmail.com (O)
Ghyunghee@gmail.com@gmail (X)
차량번호
11가 1234
123가 1234
IP 주소
192.168.0.1 (O)
1000.2000.3000.4000 (X)
import re
# Pattern "ca.e": 'ca', any one character, then 'e' (care, cafe, case, ...).
p = re.compile("ca.e")
p.match("careless")    # match: pattern must occur at the start of the string
p.search("good care")  # search: pattern may occur anywhere in the string
p.findall("good care cafe")  # every non-overlapping occurrence, as strings
['care', 'cafe']
def print_match(m):
    """Print the key attributes of a re.Match object, one per line."""
    print("m.group() : ", m.group())
    print("m.string : ", m.string)
    print("m.start() : ", m.start())
    print("m.end() : ", m.end())
    print("m.span() : ", m.span())

# Demonstrate on a successful match of p ("ca.e") against "careless".
m = p.match("careless")
print_match(m)
m.group() : care
m.string : careless
m.start() : 0
m.end() : 4
m.span() : (0, 4)
import re

# Validate e-mail-like strings against a whole-string (^...$) pattern.
# Raw string avoids the invalid-escape DeprecationWarning for '\.'.
# NOTE(review): inside the class, '+-_' is a character RANGE
# (U+002B..U+005F) that also admits characters such as '=' — that is what
# lets 'python=dojang@example.co.kr' pass below. Confirm this is intended;
# the literal set would be written [a-zA-Z0-9+_.-].
p = re.compile(r'^[a-zA-Z0-9+-_.]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$')
emails = ['python@mail.example.com', 'python+kr@example.com',
          'python=dojang@example.co.kr', 'python_10@example.info',
          'python.dojang@e-xample.com',
          '@example.com', 'python@example', 'python@example-com']
for email in emails:
    # identity test against None is the idiomatic form (was `!= None`)
    print(p.match(email) is not None, end=' ')
True True True True True False False False
import re

# Looser e-mail pattern, matched only from the start of the string (no $
# anchor, so trailing text after the TLD would still match).
# Raw string avoids invalid-escape warnings for '\w' and '\.'.
# NOTE(review): '[+-.\w+]' contains the range '+'..'.' (which also matches
# ',') plus a duplicate '+'; the intended literal set was probably
# [+\-.\w] — confirm before tightening.
p = re.compile(r'[+-.\w+]+@[a-z-]+\.[a-z]+')
emails = ['python@mail.example.com', 'python_10@example.com',
          'python-dojang@example.co.kr', 'python_10@example.info',
          'python.dojang@e-xample.com',
          '@example.com', 'python@example', 'python@example-com']
for email in emails:
    # identity test against None is the idiomatic form (was `!= None`)
    print(p.match(email) is not None, end=' ')
True True True True True False False False