Crawler

Seok-Hyun Lee · July 11, 2021

Overview

A crawler (running on Google Colab) for a fashion styling platform with active seller participation.


!pip install Selenium
!apt-get update
!apt install chromium-chromedriver
!cp /usr/lib/chromium-browser/chromedriver /usr/bin/


import sys
sys.path.insert(0,'/usr/lib/chromium-browser/chromedriver')

Selenium setup

from selenium import webdriver
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
chrome_options.add_argument('user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.87 Safari/537.36')
# user-agent string
wd = webdriver.Chrome('chromedriver', options=chrome_options)
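
A quick sanity check (optional, not part of the original flow) to confirm the headless driver came up before crawling:

wd.get('https://www.ssfshop.com')
print(wd.title)   # prints the SSF Shop page title if the driver is working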

Library imports

import os       # saving images to files
import time     # sleep() to wait on dynamic page updates
import socket   # guarding against socket errors

from urllib.request import urlretrieve   # image download
from urllib.error import HTTPError, URLError
from selenium.common.exceptions import ElementClickInterceptedException, NoSuchElementException, ElementNotInteractableException
from PIL import Image # Python image handling
import sqlite3
import requests
from bs4 import BeautifulSoup
import re
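
The socket import above is meant to guard downloads. One way to use it (a sketch, not something the original script does) is a global default timeout, so network calls fail instead of hanging:

socket.setdefaulttimeout(30)   # abort any blocking network call after 30 seconds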

1. Per-item clothing data layer

1.a Basic clothing data

- Product number (PK)
- Product link
- Sex
- Top-level category
- Mid-level category
- Category
- Brand name
- In-brand product number
- Product name
- Color
- Material
- Pattern
- Style

2. Relationship data

2.a Similar-item recommendations

- Base product number (PK)
- Similar product number (PK)

2.b Styling recommendations

- Base product number (PK)
- Styling number
- Styled product number (PK)

annotation_origin = {
    "product_id" : None,                # primary key
    "product_link" : None,
    "sex" : None,
    "superCategory" : None,
    "midCategory" : None,
    "category" : None,
    "brand" : None,
    "brand_product_id" : None,
    "product_name" : None,
    "color" : None,
    "material" : None,           
    "pattern" : None,           
    "style" : None,             
    "coordination" : None,               # True, False, Null(Unknown)
    "price" : None,
    "img_src" : None
}

recommendation = {
    "product_id" : None,        # primary key
    "recommend_id"  : None,      # primary key 
    "category" : None,
    "coordination_number" : None
    #"similar" : None,          
}
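
Both dicts are templates: every product must get its own copy, because assigning the dict directly would alias one shared object across all products. This is exactly why the crawler calls .copy() throughout:

annotation = annotation_origin.copy()   # independent record per product
# annotation = annotation_origin        # wrong: every product would overwrite the same dict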

Category menu identifiers

wd.get('https://www.ssfshop.com/WOMEN/list?dspCtgryNo=SFMA41&brandShopNo=&brndShopId=&etcCtgryNo=&ctgrySectCd=&keyword=&leftBrandNM=')
cats = wd.find_element_by_css_selector('body > div.wrap.ssf > nav > ul:nth-child(2) > li:nth-child(1) > ul')
cat = cats.find_elements_by_tag_name('a')
print("카테고리 메뉴 이름 및 식별 번호]")
for c in cat:
    print(c.text + " : " + c.get_attribute("ctgryno"))
[Category menu names and identifiers]
메인 : THMA38A01
전체 상품 : SFMA41
아우터 : SFMA41A07
재킷/베스트 : SFMA41A21
티셔츠 : SFMA41A01
셔츠/블라우스 : SFMA41A02
니트 : SFMA41A03
원피스 : SFMA41A06
팬츠 : SFMA41A04
스커트 : SFMA41A05
가방/지갑 : SFMA41A10
패션잡화 : SFMA41A12
신발 : SFMA41A11
비치웨어 : SFMA41A09
언더웨어 : SFMA41A08

Smart filter values (pattern, style): options that carry identifier codes

Example

smtFlterVal=35%2C36 (solid & horizontal stripes); see the builder sketch after the list below.

  • Pattern
    • Solid : 35
    • Horizontal stripe : 36
    • Vertical stripe : 37
    • Dot : 38
    • Tropical : 39
    • Floral : 40
    • Camouflage : 41
    • Animal : 43
    • Gradation : 46
    • Typography : 44
    • Graphic : 47
    • Argyle : 49
    • Gingham : 48
    • Tartan : 50
    • Other : 53
  • Style
    • Modern/Minimal : 54
    • Natural/Easy : 55
    • Feminine : 56
    • Classic/Preppy : 57
    • Casual : 58
    • Active/Sporty : 59
    • Ethnic : 60
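
For illustration, a small helper (hypothetical; the original crawler hard-codes these values) that builds a smtFlterVal fragment from the codes above. %2C is simply the URL-encoded comma:

from urllib.parse import quote

def build_smt_filter(codes):
    # join the numeric codes with a comma, then URL-encode the comma to %2C
    return quote(','.join(str(c) for c in codes), safe='')

build_smt_filter([35, 36])   # -> '35%2C36' (solid & horizontal stripes)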

Applying the clothing category filter (insert %2C, the URL-encoded comma, between category codes)

e.g. https://...&cateNo=SFMA41A07%2CSFMA41A21...

which_categories(): select clothing categories

# one-hot encoding over the 13 categories
category_dict = {'0':'SFMA41A07', '1': 'SFMA41A21', '2': 'SFMA41A01', '3' : 'SFMA41A02', '4': 'SFMA41A03', '5':'SFMA41A06', 
                 '6':'SFMA41A04', '7':'SFMA41A05', '8':'SFMA41A10', '9':'SFMA41A12', '10':'SFMA41A11', '11':'SFMA41A09', '12':'SFMA41A08'}

#category_list = [0,0,0,0,0,0,0,0,0,0,0,0,0] # Outer (index 0), Jacket/Vest (index 1), ..., Underwear (index 12); 13 entries total
def which_categories(category_list):
    temp = []
    for i in range(len(category_list)):
        if category_list[i] == 1:
            temp.append(category_dict[f'{i}'])
    if len(temp) > 1 :
        category_filter = '%2C'.join(temp)
    else:
        category_filter = temp[0]
    return category_filter
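
Usage: with outer (index 0) and jacket/vest (index 1) selected, the helper returns the two codes joined by the encoded comma:

which_categories([1,1,0,0,0,0,0,0,0,0,0,0,0])   # -> 'SFMA41A07%2CSFMA41A21'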

total_page(): get the total page count

def total_page(url):
    last_page_num = 0
    wd.get(url)
    try :
        last = wd.find_element_by_xpath('//*[@id="page_last"]')
        last_page_num = int(last.get_attribute('pageno'))
    except NoSuchElementException as e:
        print("No Last Button --> This is Last Page")
        last_page_num = 1

    return last_page_num

filtered_url(): URL after the category filter is applied

def filtered_url(category_list):

    filter_info = which_categories(category_list)
    url = f'https://www.ssfshop.com/WOMEN/list?dspCtgryNo=SFMA41&brandShopNo=&brndShopId=&currentPage=1&sortColumn=SALE_QTY_SEQ&etcCtgryNo=&leftBrandNM=&serviceType=DSP&smtFlterVal=&price=&benefit=&delivery=&lineId=&ctgrySectCd=GNRL_CTGRY&brndId=&sizeNM=&colorCd=&materNM=&cateViewOn=&cateNo={filter_info}&fitPsbYn=N'
    return url

total_clothes(): collect product IDs from the filtered listing pages

# Outer, Jacket/Vest, T-shirts, Shirts/Blouses, Knitwear, Dresses, Pants, Skirts
def total_clothes(category_list):
    cloth_list = []
    count = 0
    url_search = filtered_url(category_list)
    page_num = total_page(url_search)
    for i in range(25,30): # pages 26-30 only, for testing; use range(page_num) for the full crawl
        new_url_search = url_search.replace("currentPage=1","currentPage="+str(i+1))
        req = requests.get(new_url_search)
        html = req.text
        soup = BeautifulSoup(html,'lxml')

        catalog = soup.select('#dspGood > li')
        
        for c in catalog[:]:
            product_id  = c['data-prdno'].strip()
            cloth_list.append(product_id)
            
    return cloth_list
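
An item ranked on several pages can show up more than once in cloth_list. from_filter() below deduplicates anyway, but an up-front dedup (an optional sketch) keeps the list clean:

cloth_list = list(dict.fromkeys(cloth_list))   # drop duplicates while preserving page order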

from_filter(): information available from the smart filter

def from_filter(product_id, annotation):
    global informations_id
    if product_id in informations_id:
        print(product_id + " duplicate")
        return "product_id exists"
    else:
        print(product_id + " extracting info from the smart filter")
        informations_id.append(product_id)
        wd.get(f'https://www.ssfshop.com/public/search/search/view?serviceType=SRC&keyword={product_id}&cateNo=&brndId=&colorCd=&sizeNM=&materNM=&fitPsbYn=&smtFlterVal=&price=&benefit=&delivery=&lineId=&dspCtgryNo=SFMA41&brndShopId=&orderView=&pageNo=1&_csrf=3c7b631b-c124-4ff6-bebe-352d9d600493&brandShopNo=&styleNM=&cateViewOn=&reSearchCk=&reNoSearchCk=&brandNM=&allBrandNM=&strtgyCtgryNo=&leftBrandNM=&tryBannerYN=&recomSmtFlterVal=#tab_a0')
        wd.find_element_by_xpath('//*[@id="smartFilterAnchor"]').click()   # open the smart filter panel
        time.sleep(1)
        tab = wd.find_element_by_xpath('//*[@id="smartFilter"]')   # presence check: raises if the panel did not render

        # information taken from the smart filter tabs
        html = wd.page_source

        soup = BeautifulSoup(html, 'lxml')

        annotation['product_id'] = product_id
        
        try:
            for i in range(11):
                temp = soup.select_one(f'#tab_a{i}')
                try:
                    if temp.text.split()[0].strip() == '브랜드':        # brand name
                        brand_name = temp.select('label')[0].text.strip()
                        product_link = f"https://www.ssfshop.com/{brand_name}/{product_id}/good?dspCtgryNo=SFMA41A01"   # product page link
                        annotation['product_link'] = product_link
                        annotation['brand'] = brand_name
                    elif temp.text.split()[0].strip() == '소재':          # material
                        try:
                            material = temp.select('label')[0].text.strip()
                            annotation['material'] = material
                        except IndexError as e:
                            print(product_id + " material info not provided")
                    elif temp.text.split()[0].strip() == '색상/패턴':     # color / pattern
                        try:
                            color = temp.select('label')[0].text.strip()
                            annotation['color'] = color
                        except IndexError as e:
                            print(product_id + " color info not provided")
                        try:
                            pattern = temp.select('label')[1].text.strip()
                            annotation['pattern'] = pattern
                        except IndexError as e:
                            print(product_id + " pattern info not provided")
                    elif temp.text.split()[0].strip() == '종류':        # sub-category (appears when only midCategory exists => used as the lowest-level category)
                        try:
                            sub_cat = temp.select('label')[0].text.strip()
                            if annotation['category'] is None:
                                annotation['category'] = sub_cat
                        except IndexError as e:
                            print(product_id + " type info not provided")
                    elif temp.text.split()[0] == 'Style':       # per-item style info
                        try:
                            style_name = temp.select('label')[0].text.strip()
                            annotation['style'] = style_name
                        except IndexError as e:
                            print(product_id + " style info not provided")
                except AttributeError as e:
                    print(e)
        except IndexError as e:
            print(e)

        return product_link   # raises UnboundLocalError if the brand tab was never found; callers catch this

cloth_scrap(): scrape product detail pages

def cloth_scrap(product_url,product_id, annotation, isRecommend):
    
    print(product_id + " extracting info from the detail page")
    req = requests.get(product_url)
    html = req.text
    soup = BeautifulSoup(html,'lxml')

    annotation['product_link'] = product_url

    # sex, top-level category, mid-level category, category (from the breadcrumb)
    category_info = soup.select('#location > span')
    crumbs = category_info[1:]
    if len(category_info) == 5:
        sex = crumbs[0].text.strip()
        if sex != "OUTLET":
            superCat = crumbs[1].text.strip()
            midCat = crumbs[2].text.strip()
            cat = crumbs[3].text.strip()
        else:
            sex = crumbs[1].text.strip()
            superCat = crumbs[2].text.strip()
            midCat = crumbs[3].text.strip()
            cat = None
        annotation['sex'] = sex
        annotation['superCategory'] = superCat
        annotation['midCategory'] = midCat
        if annotation['category'] is None and cat is not None:
            annotation['category'] = cat
    elif len(category_info) == 6:
        sex = crumbs[1].text.strip()
        superCat = crumbs[2].text.strip()
        midCat = crumbs[3].text.strip()
        cat = crumbs[4].text.strip()
        annotation['sex'] = sex
        annotation['superCategory'] = superCat
        annotation['midCategory'] = midCat
        annotation['category'] = cat
    else:
        try:
            sex = crumbs[0].text.strip()
            superCat = crumbs[1].text.strip()
            cat = crumbs[2].text.strip()
            annotation['sex'] = sex
            annotation['superCategory'] = superCat
            annotation['midCategory'] = cat
        except IndexError as e:
            print(e)

   
    # in-brand product number
    try:
        brand_product_id = soup.select_one('#content > section.detail > div.summary > div.tag > h3 > small').text
        annotation['brand_product_id'] = brand_product_id
    except AttributeError as e:
        print(e)
        

    # product name
    product_name = soup.select_one('#goodDtlTitle').text.split('\t')[-1].strip()
    annotation['product_name'] = product_name

    # price
    price = soup.select_one('#content > section.detail > div.summary > div.tag > div.price').text.split()[0].strip()
    annotation['price'] = price

    try:
        save_image(dir_name,product_id,product_url,annotation)
    except UnboundLocalError as e:
        print(e)

    if isRecommend:     # product reached via a styling recommendation; don't recurse into its own recommendations
        informations.append(annotation)
    else:
        reco_scrap(product_url,product_id,annotation)

reco_scrap(): scrape styling (coordination) recommendations

def reco_scrap(product_url, product_id, annotation):
    global relations_id
    print("scrapping recommendation")
    temp_list = []
    # similar-item recommendations (currently disabled)
    '''
    try:
        similar_items = soup.select_one('#content > section.detail > div.tastes.similar_item')
        for i in similar_items.select('li'):
            similar_product_id = i['view-godno']
            similar_product_url = 'https://www.ssfshop.com' + i.select_one('a')['href'].text
            similar_product = {"product id": similar_product_id, "product_link" : similar_product_url}
            annotation['similar'] = []
            annotation['similar'].append(similar_product.copy())
    except AttributeError as e:
        annotation['similar'] = None
        print(product_id + " doesn't have similar recommends")
    '''

    # product numbers and links for every item in the styling list
    try: 
        wd.get(product_url)
        button = wd.find_element_by_css_selector('#content > section.detail > div.styling > div > a')
        #button = wd.find_element_by_xpath('//*[@id="content"]/section[2]/div[4]/div/ul[1]/li/a')
        #content > section.detail > div.styling > div > a
        button.click()
        time.sleep(0.5)
        info_box = wd.find_element_by_xpath('//*[@id="popup"]')
        info_list = info_box.find_elements_by_css_selector('#ttt > div > div.lSSlideWrapper.usingCss > ul > li.lslide')
        coordination_number = 0
        for info_index in info_list: 
            try:
                info_index.click()
                coordination_number += 1
            except ElementNotInteractableException as e:
                print(e)
            
            # exclude the product itself
            info = wd.find_element_by_xpath('//*[@id="popup"]')
            products = info.find_elements_by_css_selector('#coordi > div.slider.set_goods.hide_last > div > div.lSSlideWrapper.usingCss > ul > li.active > ul > li')
            wd.implicitly_wait(1)
            check = products[0].find_element_by_css_selector('a').get_attribute('prop')   # probe the first item to confirm the list rendered
            for product in products:
                reco_id = product.find_element_by_css_selector('a').get_attribute('prop')
                print(reco_id)
                if reco_id == product_id:
                    pass
                else:
                    if [product_id,reco_id] in relations_id:                        
                        break
                    else:
                        relations_id.append([product_id,reco_id])
                        more_product = info.find_element_by_xpath(f'//*[@id="{reco_id}"]')
                        
                        link_list = more_product.find_elements_by_css_selector('div > div > div.lSSlideWrapper.usingCss > ul > li')
                        
                        for link in link_list[:1]:      # first link only (reduced scope)
                            product_link = link.find_element_by_css_selector('a').get_attribute('href') 
                            recommends = recommendation.copy()
                            recommends["product_id"] = product_id
                            recommends['recommend_id'] = reco_id
                            recommends['coordination_number'] = coordination_number
                            recommends['category'] = more_product.text.split()[0]
                            relations.append(recommends) #relations.append(recommends.copy())
                            temp_list.append([product_link, reco_id])
                            if not annotation['coordination']:
                                annotation['coordination'] = True
    except NoSuchElementException as e:
        print(product_id + " doesn't have coordination")

    informations.append(annotation)
    for temp in temp_list:
        annotation_new = annotation_origin.copy()
        if from_filter(temp[1],annotation_new) == "product_id exists":
            pass 
        else:
            cloth_scrap(temp[0], temp[1], annotation_new, True)
    temp_list.clear()
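
Each appended relations entry fills the recommendation template; a hypothetical row (illustrative values only, not real scraped data) looks like:

{
    "product_id": "GM0021071234567",      # hypothetical base product number
    "recommend_id": "GM0021079876543",    # hypothetical product styled with it
    "category": "팬츠",                    # first token of the recommended item's text
    "coordination_number": 1              # which styling look on the page it came from
}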

save_image(): download product images

def save_image(dir_name,product_id,product_link,annotation):
    
    print(product_id + " saving image")
    global count
    # new_dir = dir_name + "/" + product_id  # per-product image folder
    # os.makedirs(new_dir)
    wd.get(product_link)
    image_box = wd.find_element_by_css_selector('#content > section.detail > div.summary > div.gallery > div.lSSlideOuter > div.lSPager.lSGallery > ul')
    image_list = image_box.find_elements_by_css_selector('li')
    try:
#        for image in image_list[1:2]:
        image = image_list[1]
        image.click()
        time.sleep(0.5)
        src = wd.find_element_by_css_selector('#content > section.detail > div.summary > div.gallery > div.lSSlideOuter > div.lSSlideWrapper.usingCss > ul > li.zoom-in.lslide.active > a > img').get_attribute('src')
        annotation['img_src'] = src
        count += 1
        try :
            if src.split('.')[-1] == "png": # png extension
                urlretrieve(src, dir_name + '/' + product_id + ".png")
                time.sleep(0.2)
                #urlretrieve(src, new_dir + '/' + str(count) + ".png") # save the image
            else:
                urlretrieve(src, dir_name + '/' + product_id + ".jpg")
                time.sleep(0.2)
                #urlretrieve(src, new_dir + '/' + str(count) + ".jpg") # save the image
        except Exception as e:
            print(e)
    except IndexError as e:
        print(e)
        image = image_list[0]
        image.click()
        time.sleep(0.5)
        src = wd.find_element_by_css_selector('#content > section.detail > div.summary > div.gallery > div.lSSlideOuter > div.lSSlideWrapper.usingCss > ul > li.zoom-in.lslide.active > a > img').get_attribute('src')
        count += 1
        try :
            if src.split('.')[-1] == "png": # png extension
                urlretrieve(src, dir_name + '/' + product_id + ".png")
                time.sleep(0.2)
                #urlretrieve(src, new_dir + '/' + str(count) + ".png") # save the image
            else:
                urlretrieve(src, dir_name + '/' + product_id + ".jpg")
                time.sleep(0.2)
                #urlretrieve(src, new_dir + '/' + str(count) + ".jpg") # save the image
        except Exception as e:
            print(f"디렉토리 내 {product_id} 이미지 존재")
    
    print(str(count) + " images saved")
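
urlretrieve can fail transiently on HTTP or network errors; the HTTPError/URLError imports at the top suggest guarding for that. A retry wrapper (a sketch, not part of the original script) might look like:

def retrieve_with_retry(src, path, tries=3, delay=1.0):
    # retry transient download failures a few times before giving up
    for attempt in range(tries):
        try:
            urlretrieve(src, path)
            return True
        except (HTTPError, URLError, socket.timeout) as e:
            print(f"retry {attempt + 1}/{tries} for {path}: {e}")
            time.sleep(delay)
    return False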

Running the scraper

import json

dir_name = './' + "SSF"

os.mkdir(dir_name)

informations = []
relations = []
informations_id = []
relations_id = []

count = 0 # image counter

category_list = [0,0,1,0,0,0,0,0,0,0,0,0,0] # T-shirts only; [1,1,1,1,1,1,1,1,0,0,0,0,0] covers Outer through Skirts

# product IDs within the page range
cloth_list = total_clothes(category_list)


for pid in cloth_list:      # pid rather than id, to avoid shadowing the builtin
    try:
        annotation = annotation_origin.copy()
        try:
            product_link = from_filter(pid, annotation)
            if product_link == "product_id exists":
                pass
            else:
                try:
                    cloth_scrap(product_link, pid, annotation, False)
                except Exception as e:
                    print(e)
        except NoSuchElementException as e:
            print(pid + " failed to fetch info")
    except AttributeError as e:
        print(e)
    except UnboundLocalError as e:
        print(e)
    except ReferenceError as e:
        print(e)

# output file names
with open('informations.json', 'w', encoding='utf-8') as f:
    json.dump(informations, f, ensure_ascii=False, indent='\t')
with open('relations.json', 'w', encoding='utf-8') as f:
    json.dump(relations, f, ensure_ascii=False, indent='\t')
con = sqlite3.connect('cloth.db')

cur = con.cursor() # cursor for executing SQL

query = """create table Annotation(
    product_id TEXT primary key NOT NULL,
    product_link TEXT,
    sex TEXT,
    superCategory TEXT,
    midCategory TEXT,
    category TEXT,
    brand TEXT,
    brand_product_id TEXT,
    product_name TEXT,
    color TEXT,
    material TEXT,
    pattern TEXT,
    style TEXT,
    coordination INTEGER,
    price INTEGER,
    img_src TEXT
)"""
cur.execute(query)
query = "insert into Annotation values (:product_id, :product_link, :sex, :superCategory, :midCategory, :category, :brand, :brand_product_id, :product_name ,:color, :material, :pattern, :style, :coordination, :price, :img_src)"
cur.executemany(query, informations)
con.commit()


con.close() # close the annotation database
con = sqlite3.connect('relation.db')

cur = con.cursor() # cursor for executing SQL

query = """create table RelationShip(
    product_id TEXT NOT NULL,
    recommend_id TEXT NOT NULL,
    category TEXT,
    coordination_number TEXT,   
    primary key(product_id, recommend_id)
)"""
cur.execute(query)
query = "insert into RelationShip values (:product_id, :recommend_id,  :category, :coordination_number)"
cur.executemany(query, relations)
con.commit()


con.close() # close the relation database
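
A quick read-back (optional) to confirm the rows landed before downloading the files:

con = sqlite3.connect('cloth.db')
cur = con.cursor()
cur.execute("select count(*) from Annotation")
print(cur.fetchone()[0], "annotation rows written")
con.close()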
from google.colab import files

!zip -r SSF.zip SSF
files.download('SSF.zip')
files.download('./informations.json')
files.download('./relations.json')
files.download('./cloth.db')
files.download('./relation.db')