02.Python 활용 - 크롤링 기술의 이해-1

ID짱재·2021년 2월 9일

Crawling

목록 보기

1/5

🌈 크롤링의 이해

🔥 크롤링 핵심 코드 패턴

🔥 find_all() 함수

🔥 CSS 셀렉터

🔥 데이터 전처리

🔥 네이버 쇼핑 실시간 검색어 크롤링 해보기

1. 크롤링 핵심 코드 패턴

파싱을 할 수 있는 parser는 여러가지가 있으나, 가장 대표적으로 쓰는 parser는 "html.parser" 임
soup.find() 함수로 원하는 타겟을 지정할 수 있음(단, 1개만 선택됨)
변수.get_text() 함수로 추출한 부분의 text를 가져올 수 있음
다양한 속성을 통해서 원하는 타겟을 더 세밀하게 추출할 수 있음
- data = soup.find("p", class_ = "클래스명")
- data = soup.find("p", "클래스명")
- data = soup.find("p", attrs = {"속성명":"속성값"})
- data = soup.find(id="id명")

✍🏻 python

# Step 1 -> import the libraries (take the "BeautifulSoup" class from the "bs4" package).
import requests
from bs4 import BeautifulSoup

# Step 2 -> download the page to crawl.
res = requests.get("http://주소 입력")

# Step 3 -> parse the page (parsing = analysing what the text means; soup ends up
# holding the analysed HTML document).
soup = BeautifulSoup(res.content, "html.parser")

# Step 4 -> extract the data you need.
mydata = soup.find("html태그")  # "html태그" stands for a tag name: title, h1, div, ...

# Step 5 -> use the extracted data.
text = mydata.get_text()  # only the text of the matched tag
print(text)
# mydata.string works as well; get_text() tends to retrieve the text more reliably,
# so fall back to .string only when get_text() does not give what you need.

2. find_all() 함수

위 4단계에서 find() 함수를 사용하면 해당하는 첫번째 값이 추출됨
일반적으로 크롤링은 필요한 데이터 다수를 한번에 추출하기 때문에 find_all()을 사용함
find_all()을 사용하면 타겟에 일치하는 모든 것을 추출한 뒤 리스트 형태로 반환함

✍🏻 python

# Extract every matching target (all <p> tags).
data = soup.find_all("p")
for p_tag in data:  # the find_all() result can be walked with a for loop
    print(p_tag.string)
# Extract <span> tags by their class attribute.
data = soup.find_all("span", class_="클래스명")
for span_tag in data:
    print(span_tag.get_text())
# Locate the parent tag of the targets with find(), then pull out its child tags
# with find_all().
import requests
from bs4 import BeautifulSoup

res = requests.get("https://주소")
soup = BeautifulSoup(res.content, "html.parser")
section = soup.find("ul", id="id명")
titles = section.find_all("li", class_="li명")
for li_tag in titles:
    text = li_tag.get_text()
    print(text)

3. CSS 셀렉터

select() 안에 태그 또는 class 이름 등을 넣음
select()는 find_all처럼 해당하는 모든 것을 크롤링하고, 결과값을 리스트 형태로 반환
세밀하게 타겟을 지정해줄 때 하위 태그 선택은 "스페이스" 간격을 두고 작성
부모태그의 자식 태그를 지정해줄 때는 ">" 로 작성
클래스명으로 select 가능하며, CSS처럼 class는 점(.)을 붙여줘야 하고, id는 #을 붙임

✍🏻 python

# Using select().
import requests
from bs4 import BeautifulSoup

res = requests.get("주소")
soup = BeautifulSoup(res.content, "html.parser")
# Running print(soup) shows the entire parsed structure.
items = soup.select("h3")
for heading in items:
    # Print the text of each h3 tag held in soup.
    heading_text = heading.get_text()
    print(heading_text)

# Descendant selection 1: every <a> tag located anywhere beneath a <ul> tag.
# The space belongs INSIDE one selector string. Writing "ul" "a" makes Python
# join the adjacent literals into "ula", which is a different selector.
items = soup.select("ul a")

# Descendant selection 2: only <li> tags that are direct children of a <ul> tag.
# The ">" also belongs inside the string. Writing "ul" > "li" is a Python string
# comparison, so select() would receive a bool instead of a selector.
items = soup.select("ul > li")

# Select by class name (.class-name).
items = soup.select(".클래스명")

# Select by id (#id-name).
items = soup.select("#아이디명")

4. 데이터 전처리

추출해온 데이터를 원하는 스타일로 가공함
split() 함수 : ()안에 지정한 옵션을 기준으로 분리한 뒤, 각각을 리스트에 넣어 반환
strip() 함수 : 문자열 앞뒤로 공백 등을 없애줌

✍🏻 python

# split(): cuts a string at the given separator and returns the pieces as a list
# -> index the list to keep only the part you want.
# split() calls can be chained one after another.
import requests
from bs4 import BeautifulSoup
res = requests.get("https://크롤링 주소")
soup = BeautifulSoup(res.content, 'html.parser')
section = soup.find('ul', id="dev_course_list")
titles = section.find_all("li", "course")
for title in titles:
    # Keep the text before the first "[", then take the second "-"-separated piece.
    # (The original line was missing print's closing parenthesis -> SyntaxError.)
    print(title.get_text().split("[")[0].split("-")[1])

# strip(): removes spaces and other unwanted characters from both ends of a string.
import requests
from bs4 import BeautifulSoup

res = requests.get("https://클로링 주소")
# Running print(soup) shows the entire parsed structure.
soup = BeautifulSoup(res.content, "html.parser")
section = soup.find("ul", id="dev_course_list")
titles = section.find_all("li", "course")
for title in titles:
    before_bracket = title.get_text().split("[")[0]
    name_part = before_bracket.split("-")[1]
    print(name_part.strip())

5. 네이버 쇼핑 실시간 검색어 크롤링 해보기

✍🏻 python

# Naver Shopping -> best100 -> crawl the real-time popular search keywords.
# Get the select() target from the browser developer tools: on the element to
# crawl, use Copy -> Copy selector.
# If the crawl does not return the right data, inspect the HTML structure and
# adjust the selector so that it targets the elements properly.
import requests
from bs4 import BeautifulSoup

res = requests.get("https://search.shopping.naver.com/best100v2/main.nhn")
soup = BeautifulSoup(res.content, "html.parser")
data = soup.select("#popular_srch_lst > li > span.txt")
for keyword in data:
    keyword_text = keyword.get_text()
    print(keyword_text)

# Naver Shopping -> best100 -> crawl the items on the Digital/Appliances and Food pages.
# The pages differ, but when their structure is similar the URLs can be put in a
# list and crawled with the same code.
import requests
from bs4 import BeautifulSoup

site_list = [
    "https://search.shopping.naver.com/best100v2/detail.nhn?catId=50000003",
    "https://search.shopping.naver.com/best100v2/detail.nhn?catId=50000006",
]
for site in site_list:
    res = requests.get(site)
    soup = BeautifulSoup(res.content, "html.parser")
    data = soup.select("#productListArea > ul > li > p > a")
    print(site)  # header line: which page the following items came from
    for product in data:
        product_text = product.get_text()
        print(product_text)