02.Python 활용 - 크롤링 기술의 이해-2

ID짱재·2021년 2월 9일

목록 보기

2/5

🌈 크롤링의 이해

🔥 requests 라이브러리와 urllib 라이브러리 비교

🔥 HTTP response code

🔥 여러 페이지 한번에 크롤링하는 기법

🔥 크롤링한 결과물을 엑셀 파일로 저장하기

🔥 네이버 베스트 100 간단 크롤링 예시

1. requests 라이브러리와 urllib 라이브러리 비교

현재는 requests + bs4로 크롤링을 하지만, 과거에는 urllib + bs4로 크롤링을 했었음
현재도 기존 코드 중 일부가 urllib을 사용하는 경우가 있으므로, 간단하게 정리해보도록 함
경우에 따라 인코딩 처리가 달라, urllib을 사용해야지만 크롤링이 되는 경우가 드물게 존재
이에 requests 라이브러리로 크롤링을 하였으나, 불가능할 때 urllib으로 시도해볼 필요 있음

✍🏻 python

# Target page: https://davelee-fun.github.io/
# Scrape the product names using the requests library.
import requests
from bs4 import BeautifulSoup

response = requests.get('https://davelee-fun.github.io/')
# Parse the raw response bytes with the built-in HTML parser.
soup = BeautifulSoup(response.content, 'html.parser')
# Each element matching h4.card-text holds one product name.
for title_tag in soup.select('h4.card-text'):
    product_name = title_tag.get_text().strip()
    print(product_name)

# Target page: https://davelee-fun.github.io/
# Scrape the same product names using urllib instead of requests.
from urllib.request import urlopen
from bs4 import BeautifulSoup

# urlopen() takes the address; its response object is handed to
# BeautifulSoup directly, without going through a .content attribute.
page = urlopen('https://davelee-fun.github.io/')
soup = BeautifulSoup(page, 'html.parser')
for title_tag in soup.select('h4.card-text'):
    product_name = title_tag.get_text().strip()
    print(product_name)

2. HTTP response code

HTTP 프로토콜 규격에 따라서, 응답 데이터에 응답 코드(response code)를 넣어서 보낼 수 있음
requests 라이브러리의 경우, requests.get()의 리턴 값(예: res.status_code)을 활용해 응답 코드를 확인 가능
응답 코드가 200인 경우에는 정상이지만, 그렇지 않으면 문제가 발생했다는 뜻
⭐️ 응답코드 목록 참고 : https://ko.wikipedia.org/wiki/HTTP_%EC%83%81%ED%83%9C_%EC%BD%94%EB%93%9C

✍🏻 python

# Check the HTTP response code with the requests library.
import requests
from bs4 import BeautifulSoup

# Placeholder address: replace it with the real page to crawl.
# The scheme separator must be "://"; without the colon requests rejects
# the URL before any request is sent.
res = requests.get('https://...')
if res.status_code != 200:
    # Any code other than 200 is treated here as a failed request.
    print("페이지 없음")
else:
    soup = BeautifulSoup(res.content, 'html.parser')
    # 'target' is presumably a placeholder for the CSS selector of the
    # elements to extract; substitute the real selector.
    data = soup.select('target')
    for item in data:
        print(item.get_text())

3. 여러 페이지 한번에 크롤링하는 기법

여러 페이지를 크롤링할 때, 주소를 계속 바꾸면서 크롤링하기에는 현실적으로 어려움
이에 여러 페이지를 한번에 크롤링하는 기법을 활용하면 쉽게 크롤링 가능
다만, 이러한 기술은 서버에서 해당 ip를 차단할 가능성이 있기 때문에 무분별한 크롤링은 위험
또한 사이트마다 페이지의 패턴을 잘 파악해서 requests 주소로 넣어주어야 함

✍🏻 python

# Crawl several pages in a single run.
import requests
from bs4 import BeautifulSoup

for page_index in range(10):  # ten requests in total
    # The first page does not follow the numbered pattern, so it is fetched
    # from the bare address; later requests append their page number
    # (2 through 10) to the same address.
    page_suffix = '' if page_index == 0 else str(page_index + 1)
    res = requests.get('https://크롤링 주소' + page_suffix)
    soup = BeautifulSoup(res.content, 'html.parser')
    for title_tag in soup.select('h4.card-text'):
        print(title_tag.get_text().strip())

4. 크롤링한 결과물을 엑셀 파일로 저장하기

openpyxl은 엑셀파일로 저장하는데 유용한 라이브러리이며, 설치 후 사용
openpyxl 설치하기 : pip3 install openpyxl
openpyxl 기능을 함수로 만들어 활용하면 간편하게 이용할 수 있음

1) openpyxl 라이브러리 이해

엑셀로 저장 : import → 파일 생성 → 시트 활성화 → 시트 이름 설정 → 데이터 쓰기 → 저장 → 닫기

엑셀 불러오기 : import → 파일 지정 → 시트 지정 → 데이터 불러오기 → 닫기

✍🏻 python

# Save data to an .xlsx file.
import openpyxl

workbook = openpyxl.Workbook()  # a new workbook comes with a default sheet
sheet = workbook.active  # the active (default) sheet is the one written to
sheet.title = "report1"
first_row = ['data1', 'data2', 'data3', 'data4']
sheet.append(first_row)  # write the list to the sheet
workbook.save('temp.xlsx')  # relative path: saved in the current working directory
workbook.close()

# Load data from an .xlsx file.
import openpyxl

# The relative path works when product.xlsx sits in the directory the
# script is run from.
workbook = openpyxl.load_workbook('product.xlsx')
workbook.sheetnames  # shows the sheet names when the name to load is unknown
sheet = workbook["상품 정보"]  # pick the sheet by its name
# When the file holds only one sheet, `workbook.active` can be used instead.
for row in sheet.rows:
    first_cell, second_cell = row[0], row[1]
    print(first_cell.value, second_cell.value)
workbook.close()

2) openpyxl 함수로 만들어 활용

함수 인자에는 file이름, sheet이름, 데이터를 넣음

데이터가 세로로 작성될 수 있도록 크롤링한 결과물을 중첩 리스트로 저장 후 인자로 전달

중첩 리스트 = [[상품1], [상품2], [상품3], .....]

column_dimensions[ ] : 열(column) 너비를 코드로 지정하여 세팅할 수 있음

✍🏻 python

# 크롤링 주소 : https://davelee-fun.github.io/
# 여러 페이지 한번에 크롤링하기
# openpyxl 함수로 만들기
import openpyxl  # openpyxl import
def write_excel_template(filename, sheetname, listdata):
    """Write rows to a new .xlsx workbook and save it.

    Args:
        filename: Path of the .xlsx file to create.
        sheetname: Title for the sheet. When it is empty or None, the
            default title of the freshly created sheet is kept.
        listdata: Iterable of rows; each row is a list of cell values,
            e.g. [[name1, date1], [name2, date2], ...]. Each row is
            appended to the sheet in order.
    """
    excel_file = openpyxl.Workbook()  # a new workbook comes with a default sheet
    try:
        excel_sheet = excel_file.active  # write to the default sheet
        excel_sheet.column_dimensions['A'].width = 100  # column A width
        excel_sheet.column_dimensions['B'].width = 20  # column B width
        # Only rename the sheet when a usable name was passed in; a None or
        # empty name must not reach the title setter.
        if sheetname:
            excel_sheet.title = sheetname
        for item in listdata:
            excel_sheet.append(item)
        excel_file.save(filename)
    finally:
        # Close the workbook even when appending or saving fails.
        excel_file.close()

# Crawl every page and pass the collected rows to write_excel_template()
# so they are saved as an Excel file.
import requests
from bs4 import BeautifulSoup

product_lists = []  # nested list: one [name, date] row per product
for page_num in range(10):  # ten requests: the bare address, then pages 2-10
    if page_num == 0:
        # The first page does not follow the numbered pattern, so it is
        # requested from the bare (placeholder) address.
        res = requests.get('https://크롤링 주소')
    else:
        # From the second page on, the page number is appended to the address.
        res = requests.get('https://크롤링 주소' + str(page_num + 1))
    soup = BeautifulSoup(res.content, 'html.parser')
    for item in soup.select('div.card'):  # each div.card is one product block
        product_name = item.select_one('div.card-body h4.card-text')
        product_date = item.select_one('div.wrapfooter span.post-date')
        if product_name is None or product_date is None:
            # select_one() yields None when nothing matches; skip cards that
            # lack a name or a date instead of failing on .get_text().
            continue
        product_info = [product_name.get_text().strip(), product_date.get_text().strip()]
        product_lists.append(product_info)
write_excel_template('product.xlsx', '상품 정보', product_lists)

5. 크롤링 예시

Gmarket 베스트 -> 컴퓨터/전자 크롤링하기

✍🏻 python

import requests
from bs4 import BeautifulSoup

response = requests.get('http://corners.gmarket.co.kr/Bestsellers?viewType=G&groupCode=G06')
soup = BeautifulSoup(response.content, 'html.parser')

# Of all div.best-list elements on the page, the second one is used.
ranking_section = soup.select('div.best-list')[1]

# Print rank (starting at 1), product title and price for every list item.
for rank, product in enumerate(ranking_section.select('ul > li'), start=1):
    title = product.select_one('a.itemname')
    price = product.select_one('div.s-price > strong')
    print(rank, title.get_text(), price.get_text())

네이버 베스트 -> 디지털 가전 크롤링 후 엑셀 저장(엑셀 스타일 넣기)

✍🏻 python

import openpyxl
import requests
from bs4 import BeautifulSoup

# Prepare the workbook: a header row plus wider columns B, C and D.
workbook = openpyxl.Workbook()
sheet = workbook.active
sheet.title = 'report'
sheet.append(['순위','상품명','판매가격','링크'])
for column_letter, width in (('B', 80), ('C', 30), ('D', 100)):
    sheet.column_dimensions[column_letter].width = width

response = requests.get('https://search.shopping.naver.com/best100v2/detail.nhn?catId=50000003')
if response.status_code == 200:
    soup = BeautifulSoup(response.content, 'html.parser')
    products = soup.select('#productListArea > ul > li')
    for rank, item in enumerate(products, start=1):
        title = item.select_one('p > a')
        price = item.select_one('div.price > strong > span.num')
        row_values = [rank, title.get_text(), price.get_text(), title['href']]
        print(*row_values)
        sheet.append(row_values)
        # Row 1 is the header, so the product ranked n sits in row n + 1;
        # column 4 is the link column and gets the hyperlink.
        sheet.cell(row=rank + 1, column=4).hyperlink = title['href']
else:
    print('페이지 없음')

# Center-align cell A1 (the header of the rank column).
sheet['A1'].alignment = openpyxl.styles.Alignment(horizontal='center')

# The file is saved in either case; on a failed request it holds only the header.
workbook.save('BESTPRODUCT_NAVER.xlsx')
workbook.close()