Crawler for a fashion styling platform with active seller participation (built on Colab)
!pip install selenium
!apt-get update
!apt install chromium-chromedriver
!cp /usr/lib/chromium-browser/chromedriver /usr/bin/
import sys
sys.path.insert(0,'/usr/lib/chromium-browser/chromedriver')
from selenium import webdriver
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
chrome_options.add_argument('user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.87 Safari/537.36')
# user-agent string so requests look like a regular desktop browser
wd = webdriver.Chrome('chromedriver', options=chrome_options)
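With the driver configured, a quick smoke test (a minimal check, not part of the crawl itself) confirms that headless Chrome can actually reach the target site from Colab:
wd.get('https://www.ssfshop.com')
print(wd.title)  # prints the SSF Shop page title if the driver is working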
import os  # for saving downloaded images to disk
import time  # sleep() while waiting for dynamic page content
import socket  # guard against socket errors
from urllib.request import urlretrieve  # download images by URL
from urllib.error import HTTPError, URLError
from selenium.common.exceptions import ElementClickInterceptedException, NoSuchElementException, ElementNotInteractableException
from PIL import Image  # image handling in Python
import sqlite3
import requests
from bs4 import BeautifulSoup
import re
- Product number (PK)
- Product link
- Gender
- Top-level category
- Mid-level category
- Category
- Brand name
- Product name
- Product number within the brand
- Color
- Material
- Pattern
- Style
- Base clothing product number (PK)
- Similar clothing product number (PK)
- Base clothing product number (PK)
- Styling number
- Styling product number (PK)
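These field groups map onto the two record templates defined just below: the first thirteen bullets fill the per-product annotation, the base/similar pair belongs to the similar-item relation (currently skipped in the code), and the last three describe one styling relation. For illustration, a single styling-relation record would look like this (the IDs are hypothetical):
{"product_id": "GM0012345", "recommend_id": "GM0067890", "category": "팬츠", "coordination_number": 2}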
annotation_origin = {
"product_id" : None, # primary key
"product_link" : None,
"sex" : None,
"superCategory" : None,
"midCategory" : None,
"category" : None,
"brand" : None,
"brand_product_id" : None,
"product_name" : None,
"color" : None,
"material" : None,
"pattern" : None,
"style" : None,
"coordination" : None, # True, False, Null(Unknown)
"price" : None,
"img_src" : None
}
recommendation = {
"product_id" : None, # primary key
"recommend_id" : None, # primary key
"category" : None,
"coordination_number" : None
#"similar" : None,
}
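Both templates are reused by calling dict.copy() once per product; since every value is a scalar, a shallow copy is enough to leave the template untouched. A quick illustration (the ID is made up):
annotation = annotation_origin.copy()   # a fresh record for one product
annotation['product_id'] = 'SAMPLE_ID'  # hypothetical ID, for illustration only
print(annotation_origin['product_id'])  # still None: the shared template is unchanged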
wd.get('https://www.ssfshop.com/WOMEN/list?dspCtgryNo=SFMA41&brandShopNo=&brndShopId=&etcCtgryNo=&ctgrySectCd=&keyword=&leftBrandNM=')
cats = wd.find_element_by_css_selector('body > div.wrap.ssf > nav > ul:nth-child(2) > li:nth-child(1) > ul')
cat = cats.find_elements_by_tag_name('a')
print("카테고리 메뉴 이름 및 식별 번호]")
for c in cat:
print(c.text + " : " + c.get_attribute("ctgryno"))
[Category menu names and IDs]
메인 : THMA38A01
전체 상품 : SFMA41
아우터 : SFMA41A07
재킷/베스트 : SFMA41A21
티셔츠 : SFMA41A01
셔츠/블라우스 : SFMA41A02
니트 : SFMA41A03
원피스 : SFMA41A06
팬츠 : SFMA41A04
스커트 : SFMA41A05
가방/지갑 : SFMA41A10
패션잡화 : SFMA41A12
신발 : SFMA41A11
비치웨어 : SFMA41A09
언더웨어 : SFMA41A08
smtFlterVal=35%2C36 (solid & horizontal stripes)
e.g.) https://…&cateNo=SFMA41A07%2CSFMA41A21…
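The %2C in these URLs is simply a URL-encoded comma, which is how the site packs multiple filter values into one query parameter; the standard library confirms this:
from urllib.parse import quote, unquote
print(quote('SFMA41A07,SFMA41A21', safe=''))  # SFMA41A07%2CSFMA41A21
print(unquote('35%2C36'))                     # 35,36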
# uses one-hot encoding for category selection
category_dict = {'0':'SFMA41A07', '1': 'SFMA41A21', '2': 'SFMA41A01', '3' : 'SFMA41A02', '4': 'SFMA41A03', '5':'SFMA41A06',
'6':'SFMA41A04', '7':'SFMA41A05', '8':'SFMA41A10', '9':'SFMA41A12', '10':'SFMA41A11', '11':'SFMA41A09', '12':'SFMA41A08'}
#category_list = [0,0,0,0,0,0,0,0,0,0,0,0,0]  # outerwear (index 0), jacket/vest (index 1), ..., underwear (index 12): 13 slots in total
def which_categories(category_list):
    temp = []
    for i in range(len(category_list)):
        if category_list[i] == 1:
            temp.append(category_dict[f'{i}'])
    # join with the URL-encoded comma; works for one selection or many
    category_filter = '%2C'.join(temp)
    return category_filter
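A quick usage check of the helper above, with outerwear (index 0) and knitwear (index 4) selected:
print(which_categories([1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0]))  # -> SFMA41A07%2CSFMA41A03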
def total_page(url):
    last_page_num = 0
    wd.get(url)
    try:
        last = wd.find_element_by_xpath('//*[@id="page_last"]')
        last_page_num = int(last.get_attribute('pageno'))
    except NoSuchElementException:
        print("No last-page button --> this is the only page")
        last_page_num = 1
    return last_page_num
def filtered_url(category_list):
    filter_info = which_categories(category_list)
    url = f'https://www.ssfshop.com/WOMEN/list?dspCtgryNo=SFMA41&brandShopNo=&brndShopId=&currentPage=1&sortColumn=SALE_QTY_SEQ&etcCtgryNo=&leftBrandNM=&serviceType=DSP&smtFlterVal=&price=&benefit=&delivery=&lineId=&ctgrySectCd=GNRL_CTGRY&brndId=&sizeNM=&colorCd=&materNM=&cateViewOn=&cateNo={filter_info}&fitPsbYn=N'
    return url
# total_clothes(): after setting the clothing categories, collect the product IDs of every listed item
# outerwear, jacket/vest, T-shirts, shirts/blouses, knitwear, dresses, pants, skirts
def total_clothes(category_list):
    cloth_list = []
    url_search = filtered_url(category_list)
    page_num = total_page(url_search)
    for i in range(25, 30):  # narrowed to pages 26-30; use range(page_num) for the full crawl
        new_url_search = url_search.replace("currentPage=1", "currentPage=" + str(i + 1))
        req = requests.get(new_url_search)
        soup = BeautifulSoup(req.text, 'lxml')
        catalog = soup.select('#dspGood > li')
        for c in catalog:
            product_id = c['data-prdno'].strip()
            cloth_list.append(product_id)
    return cloth_list
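The listing pages are fetched with plain requests while the interactive parts go through Selenium. A sketch of a slightly more robust listing fetch, assuming the same markup (fetch_listing, the header value, and the delay are assumptions, not part of the original crawler):
session = requests.Session()
session.headers['User-Agent'] = 'Mozilla/5.0'  # assumed browser-like header

def fetch_listing(url, delay=0.5):  # hypothetical helper with a polite pause
    resp = session.get(url, timeout=10)
    resp.raise_for_status()  # surface HTTP errors instead of parsing an error page
    time.sleep(delay)        # assumed delay between listing requests
    return BeautifulSoup(resp.text, 'lxml')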
def from_filter(product_id, annotation):
    global informations_id
    if product_id in informations_id:
        print(product_id + " is a duplicate")
        return "product_id exists"
    else:
        print(product_id + " : extracting info from the smart filter")
        informations_id.append(product_id)
        # note: this captured search URL embeds a _csrf token from the recording session
        wd.get(f'https://www.ssfshop.com/public/search/search/view?serviceType=SRC&keyword={product_id}&cateNo=&brndId=&colorCd=&sizeNM=&materNM=&fitPsbYn=&smtFlterVal=&price=&benefit=&delivery=&lineId=&dspCtgryNo=SFMA41&brndShopId=&orderView=&pageNo=1&_csrf=3c7b631b-c124-4ff6-bebe-352d9d600493&brandShopNo=&styleNM=&cateViewOn=&reSearchCk=&reNoSearchCk=&brandNM=&allBrandNM=&strtgyCtgryNo=&leftBrandNM=&tryBannerYN=&recomSmtFlterVal=#tab_a0')
        wd.find_element_by_xpath('//*[@id="smartFilterAnchor"]').click()
        time.sleep(1)
        wd.find_element_by_xpath('//*[@id="smartFilter"]')  # make sure the filter panel opened
        # info pulled from the smart filter panel
        html = wd.page_source
        soup = BeautifulSoup(html, 'lxml')
        annotation['product_id'] = product_id
        try:
            for i in range(11):
                temp = soup.select_one(f'#tab_a{i}')
                try:
                    # the tab labels are the site's own Korean strings, so they are matched verbatim
                    if temp.text.split()[0].strip() == '브랜드':  # brand
                        brand_name = temp.select('label')[0].text.strip()
                        product_link = f"https://www.ssfshop.com/{brand_name}/{product_id}/good?dspCtgryNo=SFMA41A01"  # product detail page
                        annotation['product_link'] = product_link
                        annotation['brand'] = brand_name
                    elif temp.text.split()[0].strip() == '소재':  # material
                        try:
                            material = temp.select('label')[0].text.strip()
                            annotation['material'] = material
                        except IndexError:
                            print(product_id + " has no material info")
                    elif temp.text.split()[0].strip() == '색상/패턴':  # color / pattern
                        try:
                            color = temp.select('label')[0].text.strip()
                            annotation['color'] = color
                        except IndexError:
                            print(product_id + " has no color info")
                        try:
                            pattern = temp.select('label')[1].text.strip()
                            annotation['pattern'] = pattern
                        except IndexError:
                            print(product_id + " has no pattern info")
                    elif temp.text.split()[0].strip() == '종류':  # item type: present when only midCategory exists, so it serves as the leaf category
                        try:
                            sub_cat = temp.select('label')[0].text.strip()
                            if annotation['category'] is None:
                                annotation['category'] = sub_cat
                        except IndexError:
                            print(product_id + " has no item-type info")
                    elif temp.text.split()[0] == 'Style':  # per-item style
                        try:
                            style_name = temp.select('label')[0].text.strip()
                            annotation['style'] = style_name
                        except IndexError:
                            print(product_id + " has no style info")
                except AttributeError as e:
                    print(e)
        except IndexError as e:
            print(e)
        return product_link  # raises UnboundLocalError when no brand tab was found; the callers catch it
def cloth_scrap(product_url, product_id, annotation, isRecommend):
    print(product_id + " : extracting info from the detail page")
    req = requests.get(product_url)
    soup = BeautifulSoup(req.text, 'lxml')
    annotation['product_link'] = product_url
    # gender, top-level, mid-level, and leaf category from the breadcrumb trail
    crumbs = soup.select('#location > span')[1:]  # skip the first breadcrumb entry
    if len(crumbs) == 4:
        sex = crumbs[0].text.strip()
        if sex != "OUTLET":
            superCat = crumbs[1].text.strip()
            midCat = crumbs[2].text.strip()
            cat = crumbs[3].text.strip()
        else:  # outlet pages shift the real values one slot to the right
            sex = crumbs[1].text.strip()
            superCat = crumbs[2].text.strip()
            midCat = crumbs[3].text.strip()
            cat = None
        annotation['sex'] = sex
        annotation['superCategory'] = superCat
        annotation['midCategory'] = midCat
        if annotation['category'] is None and cat is not None:
            annotation['category'] = cat
    elif len(crumbs) == 5:
        annotation['sex'] = crumbs[1].text.strip()
        annotation['superCategory'] = crumbs[2].text.strip()
        annotation['midCategory'] = crumbs[3].text.strip()
        annotation['category'] = crumbs[4].text.strip()
    else:
        try:
            sex = crumbs[0].text.strip()
            superCat = crumbs[1].text.strip()
            cat = crumbs[2].text.strip()
            annotation['sex'] = sex
            annotation['superCategory'] = superCat
            annotation['midCategory'] = cat
        except IndexError as e:
            print(e)
    # product number within the brand
    try:
        brand_product_id = soup.select_one('#content > section.detail > div.summary > div.tag > h3 > small').text
        annotation['brand_product_id'] = brand_product_id
    except AttributeError as e:
        print(e)
    # product name
    product_name = soup.select_one('#goodDtlTitle').text.split('\t')[-1].strip()
    annotation['product_name'] = product_name
    # price
    price = soup.select_one('#content > section.detail > div.summary > div.tag > div.price').text.split()[0].strip()
    annotation['price'] = price
    try:
        save_image(dir_name, product_id, product_url, annotation)
    except UnboundLocalError as e:
        print(e)
    if isRecommend:  # this product was reached through a recommendation, so don't recurse further
        informations.append(annotation)
    else:
        reco_scrap(product_url, product_id, annotation)
def reco_scrap(product_url, product_id, annotation):
    global relations_id
    print("scraping recommendations")
    temp_list = []
    # similar-item recommendations (currently skipped)
    '''
    try:
        similar_items = soup.select_one('#content > section.detail > div.tastes.similar_item')
        for i in similar_items.select('li'):
            similar_product_id = i['view-godno']
            similar_product_url = 'https://www.ssfshop.com' + i.select_one('a')['href'].text
            similar_product = {"product id": similar_product_id, "product_link" : similar_product_url}
            annotation['similar'] = []
            annotation['similar'].append(similar_product.copy())
    except AttributeError as e:
        annotation['similar'] = None
        print(product_id + " doesn't have similar recommends")
    '''
    # product numbers and links for every item in the styling list
    try:
        wd.get(product_url)
        button = wd.find_element_by_css_selector('#content > section.detail > div.styling > div > a')
        button.click()
        time.sleep(0.5)
        info_box = wd.find_element_by_xpath('//*[@id="popup"]')
        info_list = info_box.find_elements_by_css_selector('#ttt > div > div.lSSlideWrapper.usingCss > ul > li.lslide')
        coordination_number = 0
        for info_index in info_list:
            try:
                info_index.click()
                coordination_number += 1
            except ElementNotInteractableException as e:
                print(e)
        # exclude the product itself
        info = wd.find_element_by_xpath('//*[@id="popup"]')
        products = info.find_elements_by_css_selector('#coordi > div.slider.set_goods.hide_last > div > div.lSSlideWrapper.usingCss > ul > li.active > ul > li')
        wd.implicitly_wait(1)
        for product in products:
            reco_id = product.find_element_by_css_selector('a').get_attribute('prop')
            print(reco_id)
            if reco_id == product_id:
                pass
            else:
                if [product_id, reco_id] in relations_id:
                    break  # this pairing was already recorded; stop scanning this styling set
                else:
                    relations_id.append([product_id, reco_id])
                    more_product = info.find_element_by_xpath(f'//*[@id="{reco_id}"]')
                    link_list = more_product.find_elements_by_css_selector('div > div > div.lSSlideWrapper.usingCss > ul > li')
                    for link in link_list[:1]:  # narrowed range: first variant only
                        product_link = link.find_element_by_css_selector('a').get_attribute('href')
                        recommends = recommendation.copy()
                        recommends["product_id"] = product_id
                        recommends['recommend_id'] = reco_id
                        recommends['coordination_number'] = coordination_number
                        recommends['category'] = more_product.text.split()[0]
                        relations.append(recommends)
                        temp_list.append([product_link, reco_id])
        if not annotation['coordination']:
            annotation['coordination'] = True
    except NoSuchElementException:
        print(product_id + " doesn't have coordination")
    informations.append(annotation)
    for temp in temp_list:
        annotation_new = annotation_origin.copy()
        if from_filter(temp[1], annotation_new) == "product_id exists":
            pass
        else:
            cloth_scrap(temp[0], temp[1], annotation_new, True)
    temp_list.clear()
def save_image(dir_name, product_id, product_link, annotation):
    print(product_id + " : saving image")
    global count
    wd.get(product_link)
    image_box = wd.find_element_by_css_selector('#content > section.detail > div.summary > div.gallery > div.lSSlideOuter > div.lSPager.lSGallery > ul')
    image_list = image_box.find_elements_by_css_selector('li')
    try:
        image = image_list[1]  # prefer the second thumbnail when available
    except IndexError:
        image = image_list[0]  # fall back to the only thumbnail
    image.click()
    time.sleep(0.5)
    src = wd.find_element_by_css_selector('#content > section.detail > div.summary > div.gallery > div.lSSlideOuter > div.lSSlideWrapper.usingCss > ul > li.zoom-in.lslide.active > a > img').get_attribute('src')
    annotation['img_src'] = src
    count += 1
    try:
        ext = ".png" if src.split('.')[-1] == "png" else ".jpg"  # pick the extension from the URL
        urlretrieve(src, dir_name + '/' + product_id + ext)  # save the image to disk
        time.sleep(0.2)
    except Exception as e:
        print(e)
    print(str(count) + " images saved so far")
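One caveat in save_image: src.split('.')[-1] misreads the extension whenever the image URL carries a query string. A safer variant, shown as a sketch (image_ext is a hypothetical helper and the .jpg fallback is an assumption):
from urllib.parse import urlparse
import os.path

def image_ext(src):  # hypothetical helper, not wired into save_image above
    ext = os.path.splitext(urlparse(src).path)[1]  # ignores any ?query suffix
    return ext if ext else '.jpg'                  # assumed default when no extension is found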
# run the scrape
import json
dir_name = './SSF'
os.makedirs(dir_name, exist_ok=True)  # don't fail if the folder already exists
informations = []
relations = []
informations_id = []
relations_id = []
count = 0  # image counter
category_list = [0,0,1,0,0,0,0,0,0,0,0,0,0]  # T-shirts only; [1,1,1,1,1,1,1,1,0,0,0,0,0] selects outerwear through skirts
# product IDs for the clothes within the page range
cloth_list = total_clothes(category_list)
for id in cloth_list:
    try:
        annotation = annotation_origin.copy()
        try:
            product_link = from_filter(id, annotation)
            if product_link == "product_id exists":
                pass
            else:
                try:
                    cloth_scrap(product_link, id, annotation, False)
                except Exception as e:
                    print(e)
        except NoSuchElementException:
            print(id + " : failed to fetch info")
        except AttributeError as e:
            print(e)
        except UnboundLocalError as e:
            print(e)
    except ReferenceError as e:
        print(e)
# set the output file names
with open('informations.json', 'w', encoding='utf-8') as f:
json.dump(informations, f, ensure_ascii=False, indent='\t')
with open('relations.json', 'w', encoding='utf-8') as f:
json.dump(relations, f, ensure_ascii=False, indent='\t')
con = sqlite3.connect('cloth.db')
cur = con.cursor()  # cursor for running SQL against the database
query = """create table Annotation(
product_id TEXT primary key NOT NULL,
product_link TEXT,
sex TEXT,
superCategory TEXT,
midCategory TEXT,
category TEXT,
brand TEXT,
brand_product_id TEXT,
product_name TEXT,
color TEXT,
material TEXT,
pattern TEXT,
style TEXT,
coordination INTEGER,
price INTEGER,
img_src TEXT
)"""
cur.execute(query)
query = "insert into Annotation values (:product_id, :product_link, :sex, :superCategory, :midCategory, :category, :brand, :brand_product_id, :product_name ,:color, :material, :pattern, :style, :coordination, :price, :img_src)"
cur.executemany(query, informations)
con.commit()
con.close()  # close the database connection
con = sqlite3.connect('relation.db')
cur = con.cursor()  # cursor for running SQL against the database
query = """create table RelationShip(
product_id TEXT NOT NULL,
recommend_id TEXT NOT NULL,
category TEXT,
coordination_number INTEGER,
primary key(product_id, recommend_id)
)"""
cur.execute(query)
query = "insert into RelationShip values (:product_id, :recommend_id, :category, :coordination_number)"
cur.executemany(query, relations)
con.commit()
con.close()  # close the database connection
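A quick read-back (columns and limit chosen arbitrarily) confirms that rows actually landed before the files are downloaded:
con = sqlite3.connect('cloth.db')
for row in con.execute('select product_id, category, price from Annotation limit 5'):
    print(row)
con.close()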
from google.colab import files
!zip -r SSF.zip SSF
files.download('SSF.zip')
files.download('./informations.json')
files.download('./relations.json')
files.download('./cloth.db')
files.download('./relation.db')