https://scrapeops.io/python-scrapy-playbook/scrapy-beginners-guide/
sudo apt-get update
sudo apt install tree
sudo pip install scrapy
scrapy # check that the install worked
scrapy startproject <project_name>
scrapy startproject chocolatescraper # creates a chocolatescraper folder with the basic project config files
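Running tree chocolatescraper shows roughly this layout (exact files can vary slightly by Scrapy version):

chocolatescraper/
├── scrapy.cfg
└── chocolatescraper/
    ├── __init__.py
    ├── items.py
    ├── middlewares.py
    ├── pipelines.py
    ├── settings.py
    └── spiders/
        └── __init__.py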
scrapy genspider <name_of_spider> <domain_name>
scrapy genspider chocolatespider chocolate.co.uk # generates chocolatespider.py under the project's spiders/ folder with this skeleton:
import scrapy


class ChocolatespiderSpider(scrapy.Spider):
    name = 'chocolatespider'
    allowed_domains = ['chocolate.co.uk']
    start_urls = ['http://chocolate.co.uk/']

    def parse(self, response):
        pass
Point start_urls at the product list page instead of the bare domain:

import scrapy


class ChocolatespiderSpider(scrapy.Spider):
    name = 'chocolatespider'
    allowed_domains = ['chocolate.co.uk']
    start_urls = ['https://www.chocolate.co.uk/collections/all']

    def parse(self, response):
        pass
scrapy shell # start the Scrapy shell from the terminal
fetch('https://www.chocolate.co.uk/collections/all')
# fetch the page from the target website
response # fetch() stores the result in the response variable automatically; typing response inspects it
response.css('product-item') # this site wraps each product in a custom <product-item> component, so we select by that tag name
# the matching elements come back as a list, in page order
response.css('product-item').get() # get the first product (get() returns the first match)
products = response.css("product-item") # assign the results to a variable
len(products) # print the total number of products found
product = products[0] # take the first product from the list
product.css('a.product-item-meta__title::text').get()
product.css('span.price').getall()
['<span class="price">\n <span class="visually-hidden">Sale price</span>£9.95</span>']
# strip the leading markup and the trailing </span>
product.css('span.price').get().replace('<span class="price">\n <span class="visually-hidden">Sale price</span>','').replace('</span>','')
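Those chained replace() calls break as soon as the markup shifts. A less brittle sketch, assuming the price is the last non-empty text node inside span.price (as in the output above):

price_texts = product.css('span.price ::text').getall()
# e.g. ['\n ', 'Sale price', '£9.95'] -- keep the last non-empty piece
price = [t.strip() for t in price_texts if t.strip()][-1]  # '£9.95'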
product.css('div.product-item-meta a').attrib['href']
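The href comes back site-relative; in the shell, response.urljoin() resolves it against the fetched page (a sketch):

relative_url = product.css('div.product-item-meta a').attrib['href']
absolute_url = response.urljoin(relative_url)  # e.g. 'https://www.chocolate.co.uk/products/...'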
import scrapy


class ChocolatespiderSpider(scrapy.Spider):
    # the name of the spider
    name = 'chocolatespider'

    # the url of the first page that we will start scraping
    start_urls = ['https://www.chocolate.co.uk/collections/all']

    def parse(self, response):
        # here we are looping through the products and extracting the name, price & url
        products = response.css('product-item')
        for product in products:
            # here we put the data returned into the format we want to output for our csv or json file
            yield {
                'name': product.css('a.product-item-meta__title::text').get(),
                'price': product.css('span.price').get().replace('<span class="price">\n <span class="visually-hidden">Sale price</span>','').replace('</span>',''),
                'url': product.css('div.product-item-meta a').attrib['href'],
            }
scrapy crawl chocolatespider
scrapy crawl chocolatespider -O myscrapeddata.csv # save the output to a CSV file (-O overwrites an existing file, -o appends)
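The same flag works for other formats; Scrapy infers the serializer from the file extension:

scrapy crawl chocolatespider -O myscrapeddata.json # save as JSON instead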
scrapy shell
fetch('https://www.chocolate.co.uk/collections/all')
response.css('[rel="next"] ::attr(href)').get()
# grab the href attribute from the element whose rel attribute is "next"; returns None on the last page
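Still in the shell, the relative path can be sanity-checked as a full URL before wiring it into the spider (a sketch; the exact value depends on the current page):

next_page = response.css('[rel="next"] ::attr(href)').get()
response.urljoin(next_page)  # e.g. 'https://www.chocolate.co.uk/collections/all?page=2'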
import scrapy


class ChocolateSpider(scrapy.Spider):
    # the name of the spider
    name = 'chocolatespider'

    # these are the urls that we will start scraping
    start_urls = ['https://www.chocolate.co.uk/collections/all']

    def parse(self, response):
        products = response.css('product-item')
        for product in products:
            # here we put the data returned into the format we want to output for our csv or json file
            yield {
                'name': product.css('a.product-item-meta__title::text').get(),
                'price': product.css('span.price').get().replace('<span class="price">\n <span class="visually-hidden">Sale price</span>','').replace('</span>',''),
                'url': product.css('div.product-item-meta a').attrib['href'],
            }

        # follow the pagination link until there is no next page
        next_page = response.css('[rel="next"] ::attr(href)').get()
        if next_page is not None:
            next_page_url = 'https://www.chocolate.co.uk' + next_page
            yield response.follow(next_page_url, callback=self.parse)
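Since response.follow() resolves relative URLs against the current page by itself, the manual domain concatenation is optional; a sketch of the tighter version:

next_page = response.css('[rel="next"] ::attr(href)').get()
if next_page is not None:
    yield response.follow(next_page, callback=self.parse)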