- 내부링크 : 웹사이트의 다른 페이지로 연결시켜주는 하이퍼링크. 해당 웹사이트에 계속 머무르게 함.
- 외부링크 : 다른 웹사이트의 페이지로 연결되는 하이퍼링크
- 모든 "a href" (링크) 검색 및 출력
from urllib.request import urlopen
from bs4 import BeautifulSoup
# Download the Kevin Bacon article and parse it with the built-in HTML parser.
html = urlopen('http://en.wikipedia.org/wiki/Kevin_Bacon')
bs = BeautifulSoup(html, 'html.parser')

# Print the target of every anchor that has an href, numbered from 1.
# NOTE(review): the snippet's indentation was lost; the counter is assumed to
# advance only for anchors that were actually printed.
i = 1
for link in bs.find_all('a'):
    if 'href' not in link.attrs:
        continue
    print(i, link.attrs['href'])
    i += 1
- 본문(div#bodyContent) 안에서 /wiki/로 시작하고 콜론(:)이 없는 링크(항목 페이지 링크)만 출력
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
# Download and parse the Kevin Bacon article.
html = urlopen('http://en.wikipedia.org/wiki/Kevin_Bacon')
bs = BeautifulSoup(html, 'html.parser')

# Only anchors inside the article body whose href starts with /wiki/ and
# contains no colon are listed.
bodyContent = bs.find('div', {'id':'bodyContent'})
articleLink = re.compile('^/wiki/[^:]*$')

i = 1
for link in bodyContent.find_all('a', href=articleLink):
    if 'href' not in link.attrs:
        continue
    print(i, link.attrs['href'])
    i += 1
- 페이지에서 무작위로 링크 선택하고 그 링크에서 또 선택
- 링크에서 링크로 움직이며 한 웹사이트 내부를 무작위로 이동
from urllib.request import urlopen
from bs4 import BeautifulSoup
import datetime
import random
import re
import pdb
# Seed the generator from the current time.  random.seed() accepts only
# None, int, float, str, bytes or bytearray (any other type raises TypeError
# since Python 3.11), so pass the POSIX timestamp instead of the datetime
# object itself.
random.seed(datetime.datetime.now().timestamp())
def getLinks(articleUrl):
    """Return the article links found in the body of an English Wikipedia page.

    Args:
        articleUrl: Path appended to ``http://en.wikipedia.org``,
            e.g. ``'/wiki/Kevin_Bacon'``.

    Returns:
        The ``<a>`` tags inside ``div#bodyContent`` whose href starts with
        ``/wiki/`` and contains no colon.
    """
    page = urlopen('http://en.wikipedia.org'+articleUrl)
    soup = BeautifulSoup(page, 'html.parser')
    body = soup.find('div', {'id':'bodyContent'})
    articleLink = re.compile('^/wiki/[^:]*$')
    return body.find_all('a', href=articleLink)
# Random walk: start at Kevin Bacon and hop to a randomly chosen article link
# at most five times, stopping early if a page yields no article links.
depth = 0
links = getLinks('/wiki/Kevin_Bacon')
while links and depth < 5:
    pick = random.randint(0, len(links)-1)
    newArticle = links[pick].attrs['href']
    links = getLinks(newArticle)
    depth += 1
    # hop number, number of candidate links on the new page, page visited
    print(depth, len(links), newArticle)
- 전체 사이트 크롤링 (웹사이트의 내부 링크 모두 검색)
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
pages = set()  # every /wiki/ href that has already been reported
depth = 0      # recursion level of the getLinks() call currently running


def getLinks(pageUrl, maxDepth=2):
    """Print the internal links of a Korean Wikipedia page and follow them.

    Every previously unseen ``/wiki/`` href is printed as
    ``<depth> <number of links on this page> <tab> <href>``, recorded in the
    module-level ``pages`` set and then crawled recursively.

    The earlier version incremented ``depth`` only after the recursive call
    had returned, so the limit never bounded how deep the crawl descended.
    Here ``depth`` is the real recursion level: it is raised just before
    descending and restored afterwards.

    Args:
        pageUrl: Path appended to ``http://ko.wikipedia.org``; ``''`` fetches
            the site root.
        maxDepth: Deepest recursion level whose pages are still fetched.
            Links found on pages at this level are listed but not followed.
    """
    global pages
    global depth
    html = urlopen('http://ko.wikipedia.org{}'.format(pageUrl))
    bs = BeautifulSoup(html, 'html.parser')
    links = bs.find_all('a', href=re.compile('^(/wiki/)'))
    for link in links:
        # The href filter above guarantees every match carries an href.
        newPage = link.attrs['href']
        if newPage in pages:
            continue
        print(depth, len(links), "\t", newPage)
        pages.add(newPage)
        if depth >= maxDepth:
            continue
        depth += 1
        try:
            getLinks(newPage, maxDepth)
        except Exception:
            # Best effort: a page that cannot be fetched or parsed is
            # reported and skipped.  Unlike a bare ``except:`` this lets
            # KeyboardInterrupt / SystemExit stop the crawl.
            print('no page')
        finally:
            depth -= 1


getLinks('')
- 전체 사이트에서 Data 수집 (페이지(내부 링크) 옮겨다니면서 데이터 수집)
- 웹 크롤러가 페이지와 페이지 사이를 옮겨 다니며 페이지에 머무르는 동안 뭔가 다른 일을 해야함.
- 무슨 페이지든 상관없이 제목은 항상 h1 태그 안에 있으며 h1태그는 페이지당 하나만 존재.
- 텍스트 body는 div#bodyContent tag에 존재.
- 첫 번째 문단의 텍스트만 선택하려면 div#mw-content-text → p
- 편집 링크는 항목 페이지에만 존재. 존재한다면 li#ca-edit -> span -> a로 찾음.
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
pages = set()  # article hrefs that have already been printed
depth = 0      # recursion level of the getLinks() call currently running


def getLinks(pageUrl, maxDepth=1):
    """Print a page's title and first paragraph, then follow its article links.

    The page is fetched from ``http://en.wikipedia.org`` + ``pageUrl``.  Its
    ``<h1>`` text and the first ``<p>`` under ``#mw-content-text`` are printed;
    if either is missing a notice is printed instead.  Each unseen article
    link (href starting with ``/wiki/`` and containing no colon) is then
    printed, recorded in ``pages`` and crawled recursively.

    The earlier version incremented ``depth`` only after the recursive call
    had returned, so the limit never bounded the recursion.  Here ``depth``
    is the real recursion level, and pages at ``maxDepth`` are scraped but
    their links are not followed.

    Args:
        pageUrl: Path appended to ``http://en.wikipedia.org``; ``''`` fetches
            the site root.
        maxDepth: Deepest recursion level that is still scraped.
    """
    global pages
    global depth
    html = urlopen('http://en.wikipedia.org{}'.format(pageUrl))
    bs = BeautifulSoup(html, 'html.parser')
    try:
        print(bs.h1.get_text())
        print(bs.find(id ='mw-content-text').find_all('p')[0])
    except (AttributeError, IndexError):
        # AttributeError: no <h1> or no #mw-content-text element.
        # IndexError: the content element contains no <p>.
        print('This page is missing something! Continuing.')

    if depth >= maxDepth:
        return

    for link in bs.find_all('a', href=re.compile('^/wiki/[^:]*$')):
        newPage = link.attrs['href']
        if newPage in pages:
            continue
        print('-'*20)
        print(newPage)
        pages.add(newPage)
        depth += 1
        try:
            getLinks(newPage, maxDepth)
        finally:
            # Restore the level even if the fetch below raised.
            depth -= 1


getLinks('')
- 인터넷 Crawling (외부 링크에서 외부 링크로 무작위 이동)
- 외부 링크를 무시하지 않고 따라가기
- http://oreilly.com 에서 시작해 외부 링크에서 외부 링크로 무작위로 이동
- 외부 링크를 찾을 때까지 웹사이트를 재귀적으로(내부 링크로) 파고듦.
from urllib.request import urlopen
from urllib.parse import urlparse
from bs4 import BeautifulSoup
import re
import datetime
import random
import pdb
pages = set()
# random.seed() accepts only None, int, float, str, bytes or bytearray (other
# types raise TypeError since Python 3.11), so seed with the POSIX timestamp
# rather than the datetime object.
random.seed(datetime.datetime.now().timestamp())
depth = 0  # number of external hops taken so far by followExternalOnly()
def getInternalLinks(bs, includeUrl):
    """Return the unique internal links found on a parsed page.

    A link is internal when its href starts with ``/`` or contains the site's
    ``scheme://host``.  Hrefs starting with ``/`` are made absolute by
    prefixing ``scheme://host``.

    Args:
        bs: Parsed page; must provide ``find_all(name, href=pattern)`` whose
            results expose ``attrs['href']``.
        includeUrl: Any URL on the site; only its scheme and host are used.

    Returns:
        List of absolute URLs in first-seen order, without duplicates.
    """
    parsed = urlparse(includeUrl)
    includeUrl = '{}://{}'.format(parsed.scheme, parsed.netloc)
    # Escape the site URL so '.' in the host name matches a literal dot only.
    pattern = re.compile('^(/|.*' + re.escape(includeUrl) + ')')
    internalLinks = []
    for link in bs.find_all('a', href=pattern):
        href = link.attrs['href']
        if href.startswith('/'):
            href = includeUrl + href
        # Compare the absolute form: the list stores absolute URLs, so
        # checking the raw relative href would never detect a duplicate.
        if href not in internalLinks:
            internalLinks.append(href)
    return internalLinks
def getExternalLinks(bs, excludeUrl):
    """Return the unique external links found on a parsed page.

    A link is external when its href starts with ``http`` or ``www`` and
    ``excludeUrl`` does not occur anywhere after that prefix.

    Args:
        bs: Parsed page; must provide ``find_all(name, href=pattern)`` whose
            results expose ``attrs['href']``.
        excludeUrl: Text identifying the current site.  Pass the bare host
            name: the exclusion is only tested after the leading
            ``http``/``www``, so a value that itself starts with the scheme
            cannot match same-site URLs.

    Returns:
        List of hrefs in first-seen order, without duplicates.
    """
    # Escape the site text so '.' in the host name matches a literal dot only.
    pattern = re.compile('^(http|www)((?!' + re.escape(excludeUrl) + ').)*$')
    # dict.fromkeys removes duplicates while keeping first-seen order.
    return list(dict.fromkeys(
        link.attrs['href'] for link in bs.find_all('a', href=pattern)))
def getRandomExternalLink(startingPage):
    """Return a randomly chosen external link reachable from ``startingPage``.

    If the page has no external links, a random internal link is followed and
    the search repeats there.

    Args:
        startingPage: Absolute URL of the page to inspect.

    Returns:
        The href of one external link.

    Raises:
        ValueError: The page has neither external nor internal links, so
            there is nowhere left to look.
    """
    html = urlopen(startingPage)
    bs = BeautifulSoup(html, 'html.parser')
    parsed = urlparse(startingPage)
    externalLinks = getExternalLinks(bs, parsed.netloc)
    if externalLinks:
        return random.choice(externalLinks)

    print('No external links, looking around the site for one')
    domain = '{}://{}'.format(parsed.scheme, parsed.netloc)
    try:
        internalLinks = getInternalLinks(bs, domain)
    except Exception:
        # The handler previously called the undefined name ``printf`` and
        # left ``internalLinks`` unbound, so any failure ended in NameError.
        print("예외")
        internalLinks = []
    if not internalLinks:
        raise ValueError(
            'no external or internal links found on {}'.format(startingPage))
    return getRandomExternalLink(random.choice(internalLinks))
def followExternalOnly(startingSite):
    """Hop from site to site along randomly chosen external links.

    Each hop prints the current value of the module-level ``depth`` counter
    and the link chosen.  The counter is advanced and checked *before*
    recursing; previously the recursive call came first, so the increment
    and the limit check were never reached and the walk never stopped.

    Args:
        startingSite: Absolute URL of the page to start from.
    """
    global depth
    externalLink = getRandomExternalLink(startingSite)
    print(depth, 'Random external link is: {}'.format(externalLink))
    depth += 1
    if depth > 5:
        return
    followExternalOnly(externalLink)


followExternalOnly('http://oreilly.com')
- 사이트에서 찾은 외부 URL을 모두 리스트로 수집
allExtLinks = set()  # every external URL printed so far
allIntLinks = set()  # every internal URL already crawled or queued


def getAllExternalLinks(siteUrl):
    """Crawl a site recursively and print every external URL it links to.

    New external links on the page are printed and recorded in
    ``allExtLinks``; each internal link not yet in ``allIntLinks`` is
    recorded and crawled in turn.

    Args:
        siteUrl: Absolute URL of the page to crawl.
    """
    html = urlopen(siteUrl)
    parsed = urlparse(siteUrl)
    domain = '{}://{}'.format(parsed.scheme, parsed.netloc)
    bs = BeautifulSoup(html, 'html.parser')
    internalLinks = getInternalLinks(bs, domain)
    # Exclude by host name only.  getExternalLinks() tests the exclusion after
    # the leading "http", so passing scheme://host could never match and
    # same-site absolute links were reported as external.
    externalLinks = getExternalLinks(bs, parsed.netloc)

    for link in externalLinks:
        if link not in allExtLinks:
            allExtLinks.add(link)
            print(link)

    for link in internalLinks:
        if link not in allIntLinks:
            allIntLinks.add(link)
            getAllExternalLinks(link)


# Mark the start page as seen so it is not crawled a second time.
allIntLinks.add('http://oreilly.com')
getAllExternalLinks('http://oreilly.com')
💡 Scrapy
- 웹사이트 Layout 다루기 (사이트별로 web crawler 지정)
- CSS 선택자만 다르게
- title: 제목 텍스트 추출
- body: 기사의 주요 콘텐츠 선택/추출
- Content(URL, title, body) class 반환
import requests
class Content:
    """Scraped article: its URL, title text and body text."""

    def __init__(self, url, title, body):
        self.url, self.title, self.body = url, title, body
def getPage(url):
    """Fetch ``url`` with requests and return the parsed BeautifulSoup tree."""
    return BeautifulSoup(requests.get(url).text, 'html.parser')
def scrapeNYTimes(url):
    """Scrape a New York Times article into a Content object.

    The title is the text of the first ``<h1>``; the body is the text of every
    ``<p class="story-content">`` joined with newlines.
    """
    page = getPage(url)
    headline = page.find("h1").text
    paragraphs = page.find_all("p", {"class":"story-content"})
    return Content(url, headline, '\n'.join(p.text for p in paragraphs))
def scrapeBrookings(url):
    """Scrape a Brookings article into a Content object.

    The title is the text of the first ``<h1>``; the body is the text of the
    first ``<div class="post-body">``.
    """
    bs = getPage(url)
    title = bs.find("h1").text
    # The attribute filter must be a dict.  It was written as the set literal
    # {"class", "post-body"}, which is not a name -> value mapping.
    body = bs.find("div", {"class": "post-body"}).text
    return Content(url, title, body)
# Scrape one Brookings article and one NYTimes article, printing each result.
brookingsUrl = "https://www.brookings.edu/blog/education-plus-development/2021/10/05/invest-in-programs-that-boost-childrens-learning-and-development/"
nytimesUrl = 'https://www.nytimes.com/2021/10/05/opinion/facebook-blackout-2021.html'

for url, scrape in ((brookingsUrl, scrapeBrookings), (nytimesUrl, scrapeNYTimes)):
    content = scrape(url)
    print('Title: {}'.format(content.title))
    print('URL: {}\n'.format(content.url))
    print(content.body)
- Web 사이트 구조에 대한 정보 class
class Content:
    """Common base class for all articles/pages."""

    def __init__(self, url, title, body):
        self.url, self.title, self.body = url, title, body

    def print(self):
        """Write the URL, title and body to stdout, followed by blank lines.

        The output format can be changed freely.
        """
        print(
            "URL: {}".format(self.url),
            "TITLE: {}".format(self.title),
            "BODY:\n{}".format(self.body),
            "\n",
            sep="\n",
        )
class Website:
    """Instructions describing how to collect data from one site.

    Attributes are the site's display name, its home URL, and the CSS
    selectors used to find the title and the body.
    """

    def __init__(self, name, url, titleTag, bodyTag):
        self.name, self.url = name, url
        self.titleTag, self.bodyTag = titleTag, bodyTag
import requests
from bs4 import BeautifulSoup
class Crawler:
    """Fetches a page and extracts its title and body via a Website's selectors."""

    def getPage(self, url):
        """Download ``url``; return the parsed page, or None if the request fails."""
        try:
            response = requests.get(url)
        except requests.exceptions.RequestException:
            return None
        else:
            return BeautifulSoup(response.text, 'html.parser')

    def safeGet(self, pageObj, selector):
        """Return the text of every element matching ``selector``.

        Takes a BeautifulSoup object and a CSS selector.  The texts of all
        matches are joined with newlines; an empty string is returned when
        nothing matches.
        """
        matches = pageObj.select(selector)
        if matches is None or len(matches) == 0:
            return ''
        return '\n'.join(elem.get_text() for elem in matches)

    def parse(self, site, url):
        """Fetch ``url`` and print its content when both title and body are found."""
        bs = self.getPage(url)
        if bs is None:
            return
        title = self.safeGet(bs, site.titleTag)
        body = self.safeGet(bs, site.bodyTag)
        if title == '' or body == '':
            return
        content = Content(url, title, body)
        content.print()
crawler = Crawler()

# One row per site: name, home URL, title selector, body selector.
siteData = [
    ['O\'Reilly Media', 'http://oreilly.com', 'h1', 'section#product-description'],
    ['Reuters', 'http://reuters.com', 'h1', 'div.StandardArticleBody_body_1gnLA'],
    ['Brookings', 'http://www.brookings.edu', 'h1', 'div.post-body'],
    ['New York Times', 'http://nytimes.com', 'h1', 'div.StoryBodyCompanionColumn div p']
]
websites = [Website(*row) for row in siteData]

# One sample page per site, in the same order as siteData.
urls = [
    'http://shop.oreilly.com/product/0636920028154.do',
    'http://www.reuters.com/article/us-usa-epa-pruitt-idUSKBN19W2D0',
    'https://www.brookings.edu/blog/education-plus-development/2021/10/05/invest-in-programs-that-boost-childrens-learning-and-development/',
    'https://www.nytimes.com/2018/01/28/business/energy-environment/oil-boom.html'
]

for site, pageUrl in zip(websites, urls):
    crawler.parse(site, pageUrl)
- 검색 portal을 통한 site crawling
class Content:
    """Common base class for all articles/pages found by a topic search."""

    def __init__(self, topic, url, title, body):
        self.topic, self.url = topic, url
        self.title, self.body = title, body

    def print(self):
        """Write the topic, URL, title and body to stdout.

        The output format can be changed freely.
        """
        print(
            'New article found for topic: {}'.format(self.topic),
            'URL: {}'.format(self.url),
            'TITLE: {}'.format(self.title),
            'BODY:\n{}'.format(self.body),
            sep='\n',
        )
class Website:
    """Contains information about website structure.

    Besides the name, home URL and title/body selectors, this stores the
    search URL prefix, the selector for one search result, the selector for
    the link inside a result, and whether result links are absolute.
    """

    def __init__(self, name, url, searchUrl, resultListing, resultUrl, absoluteUrl, titleTag, bodyTag):
        self.name, self.url = name, url
        self.searchUrl = searchUrl
        self.resultListing, self.resultUrl = resultListing, resultUrl
        self.absoluteUrl = absoluteUrl
        self.titleTag, self.bodyTag = titleTag, bodyTag
import requests
from bs4 import BeautifulSoup
class Crawler:
    """Searches a site for a topic and prints every article it finds."""

    def getPage(self, url):
        """Download ``url``; return the parsed page, or None if the request fails."""
        try:
            req = requests.get(url)
        except requests.exceptions.RequestException:
            return None
        return BeautifulSoup(req.text, 'html.parser')

    def safeGet(self, pageObj, selector):
        """Return the text of the first element matching ``selector``, or ''."""
        childObj = pageObj.select(selector)
        if childObj:
            return childObj[0].get_text()
        return ''

    def search(self, topic, site):
        """Search ``site`` for ``topic`` and print every result page found.

        Args:
            topic: Search term appended to ``site.searchUrl``.
            site: Website describing the search URL and the selectors for
                result entries, result links, titles and bodies.
        """
        searchPage = self.getPage(site.searchUrl + topic)
        if searchPage is None:
            # The search request itself failed; there are no results to walk.
            print('Something was wrong with that page or URL. Skipping!')
            return
        for result in searchPage.select(site.resultListing):
            url = result.select(site.resultUrl)[0].attrs['href']
            # Relative result links need the site's base URL to be fetched.
            if site.absoluteUrl:
                bs = self.getPage(url)
            else:
                bs = self.getPage(site.url + url)
            if bs is None:
                # Skip only this result; ``return`` here used to abandon
                # every remaining result after a single bad page.
                print('Something was wrong with that page or URL. Skipping!')
                continue
            title = self.safeGet(bs, site.titleTag)
            body = self.safeGet(bs, site.bodyTag)
            if title != '' and body != '':
                # Argument order follows Content(topic, url, title, body);
                # the previous call passed (topic, title, body, url), which
                # stored each value under the wrong attribute.
                content = Content(topic, url, title, body)
                content.print()
crawler = Crawler()

# One row per site: name, home URL, search URL prefix, result selector,
# result-link selector, whether result links are absolute, title selector,
# body selector.
siteData = [
    ['Reuters', 'http://reuters.com', 'http://www.reuters.com/search/news?blob=', 'div.search-result-content',
     'h3.search-result-title a', False, 'h1', 'div.StandardArticleBody_body_1gnLA'],
    ['Brookings', 'http://www.brookings.edu', 'https://www.brookings.edu/search/?s=',
     'div.list-content article', 'h4.title a', True, 'h1', 'div.post-body']
]
sites = [Website(*row) for row in siteData]

# Run every topic against every configured site.
topics = ['facebook', 'North Korea']
for topic in topics:
    print('GETTING INFO ABOUT: ' + topic)
    for targetSite in sites:
        crawler.search(topic, targetSite)
- link를 통한 site crawling
- 특정 URL 패턴과 일치하는 link를 모두 따라가는 crawler
- site 전체에서 데이터를 수집해야 하는 경우
- 어떤 종류의 page를 선택할지 지정하는 규칙이 필요
class Website:
    """Describes a site to crawl by following links that match a pattern.

    Stores the site name, home URL, the regex that target hrefs must match,
    whether those hrefs are absolute, and the title/body CSS selectors.
    """

    def __init__(self, name, url, targetPattern, absoluteUrl, titleTag, bodyTag):
        self.name, self.url = name, url
        self.targetPattern, self.absoluteUrl = targetPattern, absoluteUrl
        self.titleTag, self.bodyTag = titleTag, bodyTag
class Content:
    """Scraped page: its URL, title text and body text."""

    def __init__(self, url, title, body):
        self.url, self.title, self.body = url, title, body

    def print(self):
        """Write the URL, title and body to stdout."""
        print(
            'URL: {}'.format(self.url),
            'TITLE: {}'.format(self.title),
            'BODY:\n{}'.format(self.body),
            sep='\n',
        )
import re
class Crawler:
    """Crawls one site by following home-page links that match its pattern."""

    def __init__(self, site):
        self.site = site
        self.visited = []  # hrefs already handled, in the order first seen

    def getPage(self, url):
        """Download ``url``; return the parsed page, or None if the request fails."""
        try:
            req = requests.get(url)
        except requests.exceptions.RequestException:
            return None
        return BeautifulSoup(req.text, 'html.parser')

    def safeGet(self, pageObj, selector):
        """Return the newline-joined text of all matches for ``selector``, or ''."""
        selectedElems = pageObj.select(selector)
        if selectedElems:
            return '\n'.join(elem.get_text() for elem in selectedElems)
        return ''

    def parse(self, url):
        """Fetch ``url`` and print its content when both title and body are found."""
        bs = self.getPage(url)
        if bs is not None:
            title = self.safeGet(bs, self.site.titleTag)
            body = self.safeGet(bs, self.site.bodyTag)
            if title != '' and body != '':
                content = Content(url, title, body)
                content.print()

    def crawl(self):
        """Get pages from website home page.

        Every link on the home page whose href matches
        ``site.targetPattern`` is recorded in ``self.visited`` and parsed
        once.  Nothing happens if the home page cannot be fetched.
        """
        bs = self.getPage(self.site.url)
        if bs is None:
            # getPage() returns None on a failed request; without this guard
            # the lookup below raised AttributeError.
            return
        # find_all is the current spelling of the legacy alias findAll and
        # matches the rest of this file.
        targetPages = bs.find_all('a', href=re.compile(self.site.targetPattern))
        for targetPage in targetPages:
            targetPage = targetPage.attrs['href']
            if targetPage not in self.visited:
                self.visited.append(targetPage)
                if not self.site.absoluteUrl:
                    targetPage = '{}{}'.format(self.site.url, targetPage)
                self.parse(targetPage)
# Crawl the Reuters home page, following relative links under /business/.
reuters = Website(
    name='Reuters',
    url='https://www.reuters.com',
    targetPattern='^(/business/)',
    absoluteUrl=False,
    titleTag='h1',
    bodyTag='div.StandardArticleBody_body_1gnLA',
)
crawler = Crawler(reuters)
crawler.crawl()
# Bare expression: shows the visited hrefs when run in an interactive session.
crawler.visited