02.Python ํ™œ์šฉ - Crawling & MySql

ID์งฑ์žฌยท2021๋…„ 2์›” 22์ผ
0

Crawling

목록 보기
4/5
post-thumbnail
post-custom-banner

🌈 Crawling 연습

🔥 mysql 스키마 만들기

🔥 gmarket 크롤링

🔥 크롤링 데이터 mysql db 삽입

🔥 최종 코드


1. mysql ์Šคํ‚ค๋งˆ ๋งŒ๋“ค๊ธฐ

  • crawling ํ•œ ๊ฒฐ๊ณผ๋ฅผ, mysql์— ์ €์žฅํ•  ์ˆ˜ ์žˆ๋„๋ก ๋จผ์ € mysql์˜ ์Šคํ‚ค๋งˆ ์ƒ์„ฑ
  • items ํ…Œ์ด๋ธ”์„ ๋จผ์ € ์ƒ์„ฑํ•œ ๋’ค, ranking ํ…Œ์ด๋ธ”์„ ์ƒ์„ฑํ•  ๋•Œ, foreign key ์ง€์ •
    • ๐Ÿ” FOREIGN KEY (item_code) REFERENCES items(item_code)
  • items ํ…Œ์ด๋ธ” ๊ตฌ์กฐ
    • item_code : ์ƒํ’ˆ ๊ณ ์œ  ์ฝ”๋“œ(varchar / primary key)
    • title : ์ƒํ’ˆ๋ช…(varchar)
    • ori_priece : ์›๋ž˜ ๊ฐ€๊ฒฉ(int)
    • dis_pricce : ํ• ์ธ ๊ฐ€๊ฒฉ(int)
    • dis_percent : ํ• ์ธ์œจ(int)
    • provider : ํŒ๋งค์ž(varchar)
  • ranking ํ…Œ์ด๋ธ” ๊ตฌ์กฐ
    • num : ๋ฒˆํ˜ธ(int / primary key)
    • main_category : ๋Œ€๋ถ„๋ฅ˜(varchar)
    • sub_category : ์†Œ๋ถ„๋ฅ˜(varchar)
    • item_ranking : ์ˆœ์œ„(tinyint / unsigned)
    • item_code : ์ƒํ’ˆ ๊ณ ์œ  ์ฝ”๋“œ(varchar / foreign key)

โœ๐Ÿป python

## mysql & crawling
import pymysql
# Schema creation script: builds the two tables the crawler writes to.

# Schema definition: items table (one row per product, keyed by item_code)
items_sql = '''
CREATE TABLE items(
    item_code VARCHAR(20) NOT NULL PRIMARY KEY,
    title VARCHAR(200) NOT NULL,
    ori_price INT NOT NULL,
    dis_price INT NOT NULL,
    dis_percent INT NOT NULL,
    provider VARCHAR(100)
);
'''

# Schema definition: ranking table (references items through item_code)
ranking_sql = '''
CREATE TABLE ranking(
    num INT AUTO_INCREMENT NOT NULL PRIMARY KEY,
    main_category VARCHAR(50) NOT NULL,
    sub_category VARCHAR(50) NOT NULL,
    item_ranking TINYINT UNSIGNED NOT NULL,
    item_code VARCHAR(20) NOT NULL,
    FOREIGN KEY (item_code) REFERENCES items(item_code)
);
'''

# NOTE(review): this snippet connects to 'bestproducts' while the insert
# snippets further down connect to 'gmarketbest' - confirm which database
# is intended.
# `password` / `database` replace the deprecated `passwd` / `db` aliases.
db = pymysql.connect(host='localhost', port=3306, user='root', password='๋‚ด ๋น„๋ฐ€๋ฒˆํ˜ธ', database='bestproducts', charset='utf8')
try:
    cursor = db.cursor()
    # items must exist first because ranking declares a foreign key on it.
    cursor.execute(items_sql)    # create the items table
    cursor.execute(ranking_sql)  # create the ranking table
    db.commit()
finally:
    # Close the connection even when one of the CREATE statements fails.
    db.close()

2. gmarket 크롤링

1) best100 main_category(대분류) 정보 가져오기

  • 크롤링 주소 : http://corners.gmarket.co.kr/Bestsellers
  • best100 페이지에서 navigation 역할을 하는 영역 크롤링하여 주소와 텍스트 가져옴

✍🏻 python

import requests
from bs4 import BeautifulSoup
# Request the Best 100 landing page. The timeout keeps a stalled server from
# blocking the script forever.
res = requests.get('http://corners.gmarket.co.kr/Bestsellers', timeout=10)
soup = BeautifulSoup(res.content, 'html.parser')
# Anchors inside the main-category list of the page
categories = soup.select('div.gbest-cate ul.by-group li a')
for category in categories:
    # Print each main category's URL followed by its link text
    print('http://corners.gmarket.co.kr/' + category['href'], category.get_text())

2) main_category(๋Œ€๋ถ„๋ฅ˜)์™€ sub_category(์†Œ๋ถ„๋ฅ˜) ์ •๋ณด ๊ฐ€์ ธ์˜ค๊ธฐ

  • main_category ๋งํฌ ์ •๋ณด๋ฅผ ๋‹ค์‹œ requestsํ•ด ์†Œ๋ถ„๋ฅ˜ ์ •๋ณด ํด๋กœ๋ง
    • ๋งํฌ ์ •๋ณด์™€ ๋งํฌ๋ช…์„ get_catergory ํ•จ์ˆ˜์— ํŒŒ๋ผ๋ฏธํ„ฐ๋กœ ์ „๋‹ฌ
  • ํด๋กœ๋งํ•  ์†Œ๋ถ„๋ฅ˜ ์ •๋ณด๋Š” ์†Œ๋ถ„๋ฅ˜๋ณ„(sub_category) ๋งํฌ์ฃผ์†Œ์™€ ์†Œ๋ถ„๋ฅ˜ ์ด๋ฆ„

โœ๐Ÿป python

import requests
from bs4 import BeautifulSoup
# Step 2: receives the main_category link and the main_category name
def get_category(category_link, category_name, timeout=10):
    """Print every sub-category link found on one main-category page.

    Args:
        category_link: URL of the main-category page to request.
        category_name: main-category name, echoed on every printed line.
        timeout: seconds to wait for the HTTP response before giving up.

    Each printed line holds the main-category link, the main-category name,
    the sub-category name and the sub-category link.
    """
    # Without a timeout a stalled response would hang the crawl indefinitely.
    res = requests.get(category_link, timeout=timeout)
    soup = BeautifulSoup(res.content, 'html.parser')
    sub_categories = soup.select('div.navi.group ul li a')
    for sub_category in sub_categories:
        print(category_link, category_name, sub_category.get_text(), 'http://corners.gmarket.co.kr/' + sub_category['href'])
#
# Step 1: hand every main_category link and name over to get_category
# (the timeout keeps a stalled server from blocking the script forever)
res = requests.get('http://corners.gmarket.co.kr/Bestsellers', timeout=10)
soup = BeautifulSoup(res.content, 'html.parser')
categories = soup.select('div.gbest-cate ul.by-group li a')
for category in categories:
    get_category('http://corners.gmarket.co.kr/' + category['href'], category.get_text())

3) main_category, ์ƒํ’ˆ ์ •๋ณด, ์ƒํ’ˆ ์ฝ”๋“œ, ํŒ๋งค์ž ํด๋กœ๋ง

  • ๋ช‡๋ช‡ ์ƒํ’ˆ๋“ค์€ ํ•ด๋‹น ํƒ€๊ฒŸ ํƒœ๊ทธ๊ฐ€ ์กด์žฌํ•˜์ง€ ์•Š๊ฑฐ๋‚˜, ํƒœ๊ทธ ์•ˆ์— text๊ฐ€ ์—†์–ด ์˜ค๋ฅ˜๊ฐ€ ๋ฐœ์ƒ๋  ์ˆ˜ ์žˆ์Œ
  • ์˜ค๋ฅ˜๊ฐ€ ๋ฐœ์ƒํ•˜๋ฉด ํฌ๋กค๋ง์ด ์ง„ํ–‰๋˜์ง€ ์•Š๊ธฐ ๋•Œ๋ฌธ์— ๊ทธ๋Ÿฐ ์ƒํ™ฉ์„ ๋Œ€์ฒ˜ํ•  ์ˆ˜ ์žˆ๋Š” if๋ฌธ์œผ๋กœ ์ฒ˜๋ฆฌํ•ด์คŒ
  • ๐Ÿ”ฅ ํฌ๋กค๋ง target : ๋Œ€๋ถ„๋ฅ˜๋ช…, ์†Œ๋ถ„๋ฅ˜๋ช…, ์ˆœ์œ„, ์ƒํ’ˆ์ฝ”๋“œ, ํŒ๋งค์ž, ์ƒํ’ˆ๋ช…, ์›๊ฐ€๊ฒฉ, ํ• ์ธ๊ฐ€๊ฒฉ, ํ• ์ธ๋ฅ 
    • ๋ณ€์ˆ˜๋ช… : category_name, sub_category_name, ranking, item_code, provider, title, ori_price, dis_price, discount_percent

โœ๐Ÿป python

import requests
from bs4 import BeautifulSoup
def _text_to_int(tag):
    """Return the decimal digits of ``tag``'s text as an int (0 for None or no digits)."""
    if tag is None:
        return 0
    # Keeping only digits drops thousands separators, currency suffixes and
    # '%' in one pass, independent of how those characters are encoded.
    digits = ''.join(ch for ch in tag.get_text() if ch in '0123456789')
    return int(digits) if digits else 0


# Step 3: fetch the item information
def get_items(html, category_name, sub_category_name, timeout=10):
    """Print one line per best-seller entry of a parsed sub-category page.

    Args:
        html: parsed page; only its ``select`` method is used here.
        category_name: main-category name, echoed on every printed line.
        sub_category_name: sub-category name, echoed on every printed line.
        timeout: seconds to wait for each product-page request.

    For every ``li`` inside the second ``div.best-list`` element the product
    page is requested to read the seller name. Prices and the discount
    percent are printed as ints.
    """
    best_item = html.select('div.best-list')
    # Several elements carry the best-list class and the product list is the
    # second one. Pages without it are skipped instead of raising IndexError.
    if len(best_item) < 2:
        return
    for index, item in enumerate(best_item[1].select('li')):
        ranking = index + 1
        title_tag = item.select_one('a.itemname')
        # Link to the individual product page
        product_link = item.select_one('div.thumb > a')
        if title_tag is None or product_link is None:
            # Entry without a title or link: nothing usable to report.
            continue
        title = title_tag.get_text()
        ori_tag = item.select_one('div.o-price')
        dis_tag = item.select_one('div.s-price strong span')
        if dis_tag is None:
            # No selling price at all: both prices become 0.
            ori_price, dis_price = 0, 0
        else:
            dis_price = _text_to_int(dis_tag)
            if ori_tag is None or not ori_tag.get_text().strip():
                # No separate original price: reuse the selling price.
                ori_price = dis_price
            else:
                ori_price = _text_to_int(ori_tag)
        # Missing or empty discount element yields 0.
        discount_percent = _text_to_int(item.select_one('div.s-price em'))
        # Value after the first '=' of the link, cut at the next '&'
        item_code = product_link.attrs['href'].split('=')[1].split('&')[0]
        # Seller: follow the product link and read the seller element
        res = requests.get(product_link.attrs['href'], timeout=timeout)
        soup = BeautifulSoup(res.content, 'html.parser')
        provider_tag = soup.select_one('div.item-topinfo_headline > p > span.text__seller > a')
        provider = '' if provider_tag is None else provider_tag.get_text()
        print(category_name, sub_category_name, ranking, item_code, provider, title, ori_price, dis_price, discount_percent)
# Step 2: receives the main-category link and name, then visits each sub-category
def get_category(category_link, category_name, timeout=10):
    """Request every sub-category page of one main category and pass it to get_items.

    Args:
        category_link: URL of the main-category page.
        category_name: main-category name, forwarded to ``get_items``.
        timeout: seconds to wait for each HTTP response before giving up.
    """
    # Without a timeout a stalled response would hang the crawl indefinitely.
    res = requests.get(category_link, timeout=timeout)
    soup = BeautifulSoup(res.content, 'html.parser')
    # Sub-category navigation links
    sub_categories = soup.select('div.navi.group ul li > a')
    for sub_category in sub_categories:
        # Request the sub-category page itself
        sub_res = requests.get('http://corners.gmarket.co.kr/' + sub_category['href'], timeout=timeout)
        sub_soup = BeautifulSoup(sub_res.content, 'html.parser')
        # Pass the parsed page, the main-category name and the sub-category name on
        get_items(sub_soup, category_name, sub_category.get_text())
# Step 1: crawl the main-category links and names
# (the timeout keeps a stalled server from blocking the script forever)
res = requests.get('http://corners.gmarket.co.kr/Bestsellers', timeout=10)
soup = BeautifulSoup(res.content, 'html.parser')
categories = soup.select('div.gbest-cate ul.by-group li a')
for category in categories:
    get_category('http://corners.gmarket.co.kr/' + category['href'], category.get_text())

4) 크롤링 데이터 딕셔너리에 담기

✍🏻 python

import requests
from bs4 import BeautifulSoup
def _text_to_int(tag):
    """Return the decimal digits of ``tag``'s text as an int (0 for None or no digits)."""
    if tag is None:
        return 0
    # Keeping only digits drops thousands separators, currency suffixes and
    # '%' in one pass, independent of how those characters are encoded.
    digits = ''.join(ch for ch in tag.get_text() if ch in '0123456789')
    return int(digits) if digits else 0


def get_items(html, category_name, sub_category_name, timeout=10):
    """Print one dict per best-seller entry of a parsed sub-category page.

    Args:
        html: parsed page; only its ``select`` method is used here.
        category_name: main-category name stored in every dict.
        sub_category_name: sub-category name stored in every dict.
        timeout: seconds to wait for each product-page request.

    Each dict has the keys ``category_name``, ``sub_category_name``,
    ``ranking``, ``title``, ``ori_price``, ``dis_price``, ``dis_percent``,
    ``item_code`` and ``provider``. Prices and the percent are ints.
    """
    best_item = html.select('div.best-list')
    # Some sub-category pages do not show the regular product list. Those
    # are skipped here instead of raising IndexError on best_item[1].
    if len(best_item) < 2:
        return
    for index, item in enumerate(best_item[1].select('li')):
        title_tag = item.select_one('a.itemname')
        product_link = item.select_one('div.thumb > a')
        if title_tag is None or product_link is None:
            # Entry without a title or link: nothing usable to report.
            continue
        ori_tag = item.select_one('div.o-price')
        dis_tag = item.select_one('div.s-price strong span')
        if dis_tag is None:
            # No selling price at all: both prices become 0.
            ori_price, dis_price = 0, 0
        else:
            dis_price = _text_to_int(dis_tag)
            if ori_tag is None or not ori_tag.get_text().strip():
                # No separate original price: reuse the selling price.
                ori_price = dis_price
            else:
                ori_price = _text_to_int(ori_tag)
        # Seller name comes from the individual product page.
        res = requests.get(product_link.attrs['href'], timeout=timeout)
        soup = BeautifulSoup(res.content, 'html.parser')
        provider_tag = soup.select_one('div.item-topinfo_headline > p > span.text__seller > a')
        # Collect the crawled values in a dict
        data_dict = {
            'category_name': category_name,
            'sub_category_name': sub_category_name,
            'ranking': index + 1,
            'title': title_tag.get_text(),
            'ori_price': ori_price,
            'dis_price': dis_price,
            'dis_percent': _text_to_int(item.select_one('div.s-price em')),
            # Value after the first '=' of the link, cut at the next '&'
            'item_code': product_link.attrs['href'].split('=')[1].split('&')[0],
            'provider': '' if provider_tag is None else provider_tag.get_text(),
        }
        print(data_dict)
# Receives the main-category link and name; passes each parsed sub-category
# page together with the main- and sub-category names to get_items
def get_category(category_link, category_name, timeout=10):
    """Request every sub-category page of one main category and pass it to get_items.

    Args:
        category_link: URL of the main-category page (also printed).
        category_name: main-category name (also printed), forwarded to ``get_items``.
        timeout: seconds to wait for each HTTP response before giving up.
    """
    print(category_link, category_name)
    # Without a timeout a stalled response would hang the crawl indefinitely.
    res = requests.get(category_link, timeout=timeout)
    soup = BeautifulSoup(res.content, 'html.parser')
    sub_categories = soup.select('div.navi.group ul li > a')
    for sub_category in sub_categories:
        sub_res = requests.get('http://corners.gmarket.co.kr/' + sub_category['href'], timeout=timeout)
        sub_soup = BeautifulSoup(sub_res.content, 'html.parser')
        get_items(sub_soup, category_name, sub_category.get_text())
# Crawl the main-category list of the Best 100 page to get each
# main-category address and name
# (the timeout keeps a stalled server from blocking the script forever)
res = requests.get('http://corners.gmarket.co.kr/Bestsellers', timeout=10)
soup = BeautifulSoup(res.content, 'html.parser')
categories = soup.select('div.gbest-cate ul.by-group li a')
for category in categories:
    get_category('http://corners.gmarket.co.kr/' + category['href'], category.get_text())


3. ํฌ๋กค๋ง ๋ฐ์ดํ„ฐ mysql db ์‚ฝ์ž…

1) COUNT ํ•จ์ˆ˜

  • item_code๋Š” PRIMARU KEY์ด๊ธฐ ๋•Œ๋ฌธ์— ๋™์ผํ•œ item_code๋ฅผ ๊ฐ€์ง„ ๋ฐ์ดํ„ฐ๋Š” ์ €์žฅ ๋ถˆ๊ฐ€
  • ์ด๋Ÿฐ ๋ฌธ์ œ๋ฅผ ํ•ด๊ฒฐํ•  ์ˆ˜ ์žˆ๋Š” ๊ฒƒ์ด sql๋ฌธ์— COUNT ํ•จ์ˆ˜์ž„
  • ์ฆ‰, if๋ฌธ๊ณผ ํ•จ๊ป˜ ์‚ฌ์šฉํ•˜์—ฌ ์ €์žฅ๋œ ์ ์ด ์—†๋Š” item_code๋งŒ ๋ฐ์ดํ„ฐ๋กœ ์‚ฝ์ž…ํ•˜๊ฒŒ๋” ์ œ์–ด
  • COUNT ํ•จ์ˆ˜๋Š” ๋™์ผํ•œ ๋ฐ์ดํ„ฐ๊ฐ€ ํ•˜๋‚˜๋ผ๋„ ์žˆ๋Š”์ง€ ํ™•์ธํ•ด์„œ ์žˆ์œผ๋ฉด 1, ์—†์œผ๋ฉด 0์„ ๋Œ๋ ค์คŒ
  • SELECT COUNT(*) FROM [ํ…Œ์ด๋ธ”๋ช…] WHERE [์ปฌ๋Ÿผ๋ช…] = [๋ฐ์ดํ„ฐ๊ฐ’]
  • ๐Ÿ” SELECT COUNT(*) FROM items WHERE item_code = item_info['item_code'];

โœ๐Ÿป python

import pymysql
# Module-level connection and cursor used by save_data().
# `password` / `database` replace the deprecated `passwd` / `db` aliases.
db = pymysql.connect(host='localhost', port=3306, user='root', password='๋‚ด ๋น„๋ฐ€๋ฒˆํ˜ธ', database='gmarketbest', charset='utf8')
cursor = db.cursor()
# gmarket best ํฌ๋กค๋ง
import requests
from bs4 import BeautifulSoup
# Step 5: insert the crawled data into MySQL
def save_data(item_info):
    """Store one crawled product in the ``items`` and ``ranking`` tables.

    Args:
        item_info: dict with the keys ``item_code``, ``title``,
            ``ori_price``, ``dis_price``, ``dis_percent``, ``provider``,
            ``category_name``, ``sub_category_name`` and ``ranking``.

    Uses the module-level ``cursor`` and ``db`` and commits after the
    inserts. The connection is left open so the function can be called once
    per crawled item; the earlier version closed ``db`` here, which left
    every later call working on a closed connection.
    """
    # Step 5-3: item_code is the primary key of items, so the product row is
    # inserted only when no row with that code exists yet.
    cursor.execute("SELECT COUNT(*) FROM items WHERE item_code = %s", (item_info['item_code'],))
    result = cursor.fetchone()

    # Step 5-1: insert into the items table. Values go in as query
    # parameters so quotes inside a title or seller name cannot break (or
    # inject into) the statement.
    if result[0] == 0:
        sql = "INSERT INTO items VALUES (%s, %s, %s, %s, %s, %s)"
        params = (
            item_info['item_code'],
            item_info['title'],
            item_info['ori_price'],
            item_info['dis_price'],
            item_info['dis_percent'],
            item_info['provider'],
        )
        # mogrify returns the exact statement that execute() will send.
        print(cursor.mogrify(sql, params))
        cursor.execute(sql, params)

    # Step 5-2: insert into the ranking table (always, one row per call)
    sql = "INSERT INTO ranking (main_category, sub_category, item_ranking, item_code) VALUES (%s, %s, %s, %s)"
    params = (
        item_info['category_name'],
        item_info['sub_category_name'],
        item_info['ranking'],
        item_info['item_code'],
    )
    print(cursor.mogrify(sql, params))
    cursor.execute(sql, params)
    db.commit()

4. 최종 코드

✍🏻 python

import pymysql
# Module-level connection and cursor used by save_data().
# `password` / `database` replace the deprecated `passwd` / `db` aliases.
db = pymysql.connect(host='localhost', port=3306, user='root', password='๋‚ด ๋น„๋ฐ€๋ฒˆํ˜ธ', database='gmarketbest', charset='utf8')
cursor = db.cursor()
# gmarket best ํฌ๋กค๋ง
import requests
from bs4 import BeautifulSoup
# Step 5: insert the crawled data into MySQL
def save_data(item_info):
    """Store one crawled product in the ``items`` and ``ranking`` tables.

    Args:
        item_info: dict with the keys ``item_code``, ``title``,
            ``ori_price``, ``dis_price``, ``dis_percent``, ``provider``,
            ``category_name``, ``sub_category_name`` and ``ranking``.

    Uses the module-level ``cursor`` and ``db`` and commits after the
    inserts. The connection is left open so the function can be called once
    per crawled item; the earlier version closed ``db`` here, which left
    every later call working on a closed connection.
    """
    # Step 5-3: item_code is the primary key of items, so the product row is
    # inserted only when no row with that code exists yet.
    cursor.execute("SELECT COUNT(*) FROM items WHERE item_code = %s", (item_info['item_code'],))
    result = cursor.fetchone()

    # Step 5-1: insert into the items table. Values go in as query
    # parameters so quotes inside a title or seller name cannot break (or
    # inject into) the statement.
    if result[0] == 0:
        sql = "INSERT INTO items VALUES (%s, %s, %s, %s, %s, %s)"
        params = (
            item_info['item_code'],
            item_info['title'],
            item_info['ori_price'],
            item_info['dis_price'],
            item_info['dis_percent'],
            item_info['provider'],
        )
        # mogrify returns the exact statement that execute() will send.
        print(cursor.mogrify(sql, params))
        cursor.execute(sql, params)

    # Step 5-2: insert into the ranking table (always, one row per call)
    sql = "INSERT INTO ranking (main_category, sub_category, item_ranking, item_code) VALUES (%s, %s, %s, %s)"
    params = (
        item_info['category_name'],
        item_info['sub_category_name'],
        item_info['ranking'],
        item_info['item_code'],
    )
    print(cursor.mogrify(sql, params))
    cursor.execute(sql, params)
    db.commit()
def _text_to_int(tag):
    """Return the decimal digits of ``tag``'s text as an int (0 for None or no digits)."""
    if tag is None:
        return 0
    # Keeping only digits drops thousands separators, currency suffixes and
    # '%' in one pass, independent of how those characters are encoded.
    digits = ''.join(ch for ch in tag.get_text() if ch in '0123456789')
    return int(digits) if digits else 0


# Step 3: extract the data (main category, sub category, ranking, item code,
# seller, title, original price, discounted price, discount percent)
def get_items(html, category_name, sub_category_name, timeout=10):
    """Extract every best-seller entry of a parsed page and pass it to save_data.

    Args:
        html: parsed page; only its ``select`` method is used here.
        category_name: main-category name stored with every item.
        sub_category_name: sub-category name stored with every item.
        timeout: seconds to wait for each product-page request.

    ``save_data`` is called once per entry with a dict holding the keys
    ``category_name``, ``sub_category_name``, ``ranking``, ``title``,
    ``ori_price``, ``dis_price``, ``dis_percent``, ``item_code`` and
    ``provider``. Prices and the percent are ints.
    """
    best_item = html.select('div.best-list')
    # The product list is the second best-list element. Pages without it
    # are skipped instead of raising IndexError.
    if len(best_item) < 2:
        return
    for index, item in enumerate(best_item[1].select('li')):
        # Step 3-1: title, prices and discount percent from the list entry
        title_tag = item.select_one('a.itemname')
        product_link = item.select_one('div.thumb > a')
        if title_tag is None or product_link is None:
            # Entry without a title or link: nothing usable to store.
            continue
        ori_tag = item.select_one('div.o-price')
        dis_tag = item.select_one('div.s-price strong span')
        if dis_tag is None:
            # No selling price at all: both prices become 0.
            ori_price, dis_price = 0, 0
        else:
            dis_price = _text_to_int(dis_tag)
            if ori_tag is None or not ori_tag.get_text().strip():
                # No separate original price: reuse the selling price.
                ori_price = dis_price
            else:
                ori_price = _text_to_int(ori_tag)
        # A missing element or one with empty text both yield 0 (the earlier
        # check compared the tag itself to '', which never matched).
        dis_percent = _text_to_int(item.select_one('div.s-price em'))

        # Step 3-2: item code from the link, seller from the product page
        item_code = product_link.attrs['href'].split('=')[1].split('&')[0]
        res = requests.get(product_link.attrs['href'], timeout=timeout)
        soup = BeautifulSoup(res.content, 'html.parser')
        provider_tag = soup.select_one('div.item-topinfo_headline > p > span.text__seller > a')
        provider = '' if provider_tag is None else provider_tag.get_text()

        # Step 4: bundle everything in a dict and store it
        data_dict = {
            'category_name': category_name,
            'sub_category_name': sub_category_name,
            'ranking': index + 1,
            'title': title_tag.get_text(),
            'ori_price': ori_price,
            'dis_price': dis_price,
            'dis_percent': dis_percent,
            'item_code': item_code,
            'provider': provider,
        }
        save_data(data_dict)
# Step 2: crawl the sub-categories of one main category
def get_category(category_link, category_name, timeout=10):
    """Request every sub-category page of one main category and pass it to get_items.

    Args:
        category_link: URL of the main-category page.
        category_name: main-category name, forwarded to ``get_items``.
        timeout: seconds to wait for each HTTP response before giving up.
    """
    # Without a timeout a stalled response would hang the crawl indefinitely.
    res = requests.get(category_link, timeout=timeout)
    soup = BeautifulSoup(res.content, 'html.parser')
    sub_categories = soup.select('div.navi.group ul li a')
    for sub_category in sub_categories:
        sub_res = requests.get('http://corners.gmarket.co.kr/' + sub_category['href'], timeout=timeout)
        sub_soup = BeautifulSoup(sub_res.content, 'html.parser')
        get_items(sub_soup, category_name, sub_category.get_text())
# Step 1: crawl the main-category links and names, then process each one
# (the timeout keeps a stalled server from blocking the script forever)
res = requests.get('http://corners.gmarket.co.kr/Bestsellers', timeout=10)
soup = BeautifulSoup(res.content, 'html.parser')
categories = soup.select('div.gbest-cate ul.by-group li a')
try:
    for category in categories:
        get_category('http://corners.gmarket.co.kr/'+category['href'], category.get_text())
finally:
    # Release the MySQL connection once the crawl ends or fails. The
    # `open` check avoids closing a connection that is already closed.
    if db.open:
        db.close()
profile
Keep Going, Keep Coding!
post-custom-banner

0개의 댓글