python(10) 자동화 (네이버 쇼핑)

hyukstory 혁스토리·2020년 8월 26일

python

목록 보기

16/35

네이버 쇼핑 탭에서 검색어를 입력하면 검색 결과를 자동으로 크롤링하여

JSON 파일로 저장하는 코드 짜기

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
import pyperclip

url = 'https://www.naver.com/'


def _paste_into(field, text):
    """Focus *field*, paste *text* into it through the clipboard, then wait 1 s.

    The text is copied with pyperclip and inserted with Ctrl+V instead of
    being sent as individual keystrokes.
    """
    field.click()
    pyperclip.copy(text)
    field.send_keys(Keys.CONTROL, 'v')
    time.sleep(1)


# Start Chrome through the local chromedriver binary (opens an empty browser
# window), pause briefly, then load the Naver front page.
driver = webdriver.Chrome('C:/TEMP/chromedriver.exe')
time.sleep(3)
driver.get(url)

# Find the login link on the front page and click it.
login_btn = driver.find_element_by_class_name('link_login')
login_btn.click()
time.sleep(1)

# Locate the id and password inputs of the login form; empty the id field.
tag_id = driver.find_element_by_name('id')
tag_pw = driver.find_element_by_name('pw')
tag_id.clear()

# Enter the id, then the password (placeholders — replace with real values).
_paste_into(tag_id, '아이디')
_paste_into(tag_pw, '비밀번호')

# Submit the login form.
login_btn = driver.find_element_by_id('log.login')
login_btn.click()
time.sleep(1)

# Click the "don't register" button ('new.dontsave') shown after logging in.
driver.find_element_by_id('new.dontsave').click()


# Open the shopping tab.
# Hand control back to the main (first) browser window before navigating.
driver.switch_to.window(driver.window_handles[0])

# Full XPath of the shopping button.
# NOTE: locating it with find_element_by_class_name('nav') clicks the mail
# link instead, which is why the full path is used here.
xpath = "/html/body/div[2]/div[2]/div[2]/div[1]/div[1]/ul[1]/li[5]/a"
driver.find_element_by_xpath(xpath).click()


# Enter the search term.
# A leading '//' matches the element anywhere in the document, so the path
# leading up to it can be omitted.
xpath2 = "//input[@class='co_srh_input _input N=a:SNB.search']"

# Look the search box up once and reuse the element for both operations.
search_box = driver.find_element_by_xpath(xpath2)
search_box.clear()
# The trailing newline submits the search; input() already returns a str.
search_box.send_keys(input("검색어를 입력하세요 : ") + "\n")


# 스크롤 끝까지 내리기
# Keep scrolling to the bottom until the page stops growing.
SCROLL_PAUSE_TIME = 2  # seconds to wait after each scroll

# Height before the first scroll; each pass is compared against the height
# recorded at the end of the previous pass.
last_height = driver.execute_script("return document.body.scrollHeight")

while True:
    # Scroll to the very bottom of the page.
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

    # Wait for the page to load.
    time.sleep(SCROLL_PAUSE_TIME)
    # Scroll back up by 50px and wait again.
    # NOTE(review): presumably this nudges lazy loading to fire — confirm.
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight-50);")
    time.sleep(SCROLL_PAUSE_TIME)

    # Measure the height after scrolling.
    new_height = driver.execute_script("return document.body.scrollHeight")

    # Stop once the height no longer changes: nothing more was loaded.
    if new_height == last_height:
        break

    # The page grew, so remember the new height for the next comparison.
    last_height = new_height





# 검색 결과 10페이지 크롤링 후 파일 저장
from bs4 import BeautifulSoup
import json

# Crawl MAX_PAGES result pages, collecting title / price / url per product,
# then save everything as one JSON array.
MAX_PAGES = 10
NEXT_PAGE_WAIT = 2  # seconds to let the next result page render before parsing

naver_macro = []  # one {'title', 'price', 'url'} dict per product
page = 0          # number of result pages crawled so far

while page < MAX_PAGES:
    # Parse the page that is currently displayed. This must happen on every
    # iteration: a soup built once before the loop would yield the first
    # page's products again for each page.
    soup = BeautifulSoup(driver.page_source, 'lxml')
    title_divs = soup.find_all('div', class_='basicList_title__3P9Q7')
    price_divs = soup.find_all('div', class_='basicList_price_area__1UXXR')
    cnt = len(title_divs)

    # Pair each title block with the price block at the same position; zip
    # stops at the shorter list instead of raising IndexError on a mismatch.
    for metadata, metadata2 in zip(title_divs, price_divs):
        # Prefer the link's 'title' attribute, fall back to its visible text.
        title = metadata.a.get('title')
        if title is None:
            title = metadata.a.get_text()

        # Prefer the dedicated price span, fall back to the <strong> text.
        price_tag = metadata2.find('span', class_='price_num__2WUXn')
        if price_tag is not None:
            price = price_tag.text
        else:
            price = metadata2.strong.get_text()

        url = metadata.a.get('href')

        naver_macro.append({'title': title, 'price': price, 'url': url})

    page += 1
    print("page = ", page, "크롤링 완료")  # progress output

    # Move on to the next result page unless this was the last one wanted.
    if page < MAX_PAGES:
        next_btn = driver.find_element_by_class_name('pagination_next__1ITTf')
        next_btn.click()
        time.sleep(NEXT_PAGE_WAIT)

# Write the file only after crawling succeeded, and let `with` close it.
# ensure_ascii=False stores the Korean text as readable UTF-8 rather than
# \uXXXX escapes; the parsed JSON data is the same either way.
with open("C:/Users/student/Desktop/python/P_3week/네이버쇼핑매크로.json", "w", encoding="UTF-8") as file:
    json.dump(naver_macro, file, ensure_ascii=False)

pandas로 불러오기, Excel로 저장

- pip install openpyxl

import pandas as pd 
#import openpyxl

# Load the crawled records from the JSON file into a DataFrame.
df = pd.read_json("C:/Users/student/Desktop/python/P_3week/네이버쇼핑매크로.json")

# Iterating over a DataFrame yields its column labels.
for j in df:
    print(j)
# Number of non-null values in each column.
print(df.count())

# Export to Excel (requires openpyxl). The context manager saves and closes
# the workbook on exit; ExcelWriter.save() no longer exists in pandas 2.x,
# and sheet_name is passed by keyword because the positional form is deprecated.
with pd.ExcelWriter("C:/Users/student/Desktop/python/P_3week/네이버쇼핑매크로.xlsx") as writer:
    df.to_excel(writer, sheet_name="sheet1")

** 다음 페이지 버튼

# Locate the "next page" control of the result list by its class name and
# press it. Repeating this inside a while loop makes it possible to crawl
# all of the data.
next_btn = driver.find_element_by_class_name(
    'pagination_next__1ITTf'
)
next_btn.click()

hyukstory 혁스토리

문돌이의 고군분투 개발 공부

이전 포스트

python(9) selenium 연습 (네이버, 페이스북, ktx)

다음 포스트

python(11) open API 활용하기 (네이버 블로그)

1개의 댓글

김인호

2022년 3월 14일

안녕하세요, 네이버 쇼핑 크롤링 블로그 보고 댓글 남깁니다!

metadata2 = soup.find_all('div', class_='basicList_price_area__1UXXR')[i]
price = metadata2.find('span', class_='price_num__2WUXn')
if price != None :
    price = price.text
    #print("<가격> : ", price) # 가격
elif price == None :
    price = price = metadata2.strong.get_text()
    #print("<가격> : ", price)

상기 부분 find_all 로 div 태그 찾고, 아래 가격 정보를 price 변수에 담아 find로 찾았는데요
위와 같이 등록일 / 평점 / 리뷰수 크롤링 하려고 하는데 동일하게 적용하면

AttributeError: ResultSet object has no attribute 'find'. You're probably treating a list of elements like a single element. Did you call find_all() when you meant to call find()?

위와 같은 에러 문구가 계속 반환 됩니다. 설명 좀 가능하실까요?

답글 달기