๐ฅ mysql ์คํค๋ง ๋ง๋ค๊ธฐ
๐ฅ gmarket ํฌ๋กค๋ง
๐ฅ ํฌ๋กค๋ง ๋ฐ์ดํฐ mysql db ์ฝ์
๐ฅ ์ต์ข ์ฝ๋
โ๐ป python
## mysql & crawling import pymysql db = pymysql.connect(host='localhost', port=3306, user='root', passwd='๋ด ๋น๋ฐ๋ฒํธ', db='bestproducts', charset='utf8') cursor = db.cursor() # # ์คํค๋ง ์์ฑ # ์คํค๋ง ์ ์ : items ํ ์ด๋ธ sql=''' CREATE TABLE items( item_code VARCHAR(20) NOT NULL PRIMARY KEY, title VARCHAR(200) NOT NULL, ori_price INT NOT NULL, dis_price INT NOT NULL, dis_percent INT NOT NULL, provider VARCHAR(100) ); ''' cursor.execute(sql) # items ํ ์ด๋ธ ์์ฑ sql ์คํ # # ์คํค๋ง ์ ์ : ranking ํ ์ด๋ธ sql = ''' CREATE TABLE ranking( num INT AUTO_INCREMENT NOT NULL PRIMARY KEY, main_category VARCHAR(50) NOT NULL, sub_category VARCHAR(50) NOT NULL, item_ranking TINYINT UNSIGNED NOT NULL, item_code VARCHAR(20) NOT NULL, FOREIGN KEY (item_code) REFERENCES items(item_code) ); ''' cursor.execute(sql) # items ํ ์ด๋ธ ์์ฑ sql ์คํ db.commit() db.close()
1) best100 main_category(๋๋ถ๋ฅ) ์ ๋ณด ๊ฐ์ ธ์ค๊ธฐ
- ํฌ๋กค๋ง ์ฃผ์ : http://corners.gmarket.co.kr/Bestsellers
- best100 ํ์ด์ง์์ navigation ์ญํ ์ ํ๋ ์์ญ ํด๋กค๋งํ์ฌ ์ฃผ์์ ํ ์คํธ ๊ฐ์ ธ์ด
โ๐ป python
import requests from bs4 import BeautifulSoup res = requests.get('http://corners.gmarket.co.kr/Bestsellers') soup = BeautifulSoup(res.content, 'html.parser') categories = soup.select('div.gbest-cate ul.by-group li a') for category in categories: print ('http://corners.gmarket.co.kr/' + category['href'], category.get_text())
2) main_category(๋๋ถ๋ฅ)์ sub_category(์๋ถ๋ฅ) ์ ๋ณด ๊ฐ์ ธ์ค๊ธฐ
- main_category ๋งํฌ ์ ๋ณด๋ฅผ ๋ค์ requestsํด ์๋ถ๋ฅ ์ ๋ณด ํด๋ก๋ง
- ๋งํฌ ์ ๋ณด์ ๋งํฌ๋ช ์ get_catergory ํจ์์ ํ๋ผ๋ฏธํฐ๋ก ์ ๋ฌ
- ํด๋ก๋งํ ์๋ถ๋ฅ ์ ๋ณด๋ ์๋ถ๋ฅ๋ณ(sub_category) ๋งํฌ์ฃผ์์ ์๋ถ๋ฅ ์ด๋ฆ
โ๐ป python
import requests from bs4 import BeautifulSoup # 2๋จ๊ณ : main_category์ ๋งํฌ์ฃผ์์ main_category ์ด๋ฆ์ ํ๋ผ๋ฏธํฐ๋ก ๋ฐ์ def get_category(category_link, category_name): res = requests.get(category_link) soup = BeautifulSoup(res.content, 'html.parser') sub_categories = soup.select('div.navi.group ul li a') # main_category ๋งํฌ์ฃผ์, main_category ์ด๋ฆ, sub_category ์ด๋ฆ, sub_category ๋งํฌ์ฃผ์ ํฌ๋กค๋ง for sub_category in sub_categories: print (category_link, category_name, sub_category.get_text(), 'http://corners.gmarket.co.kr/' + sub_category['href']) # # 1๋จ๊ณ : main_category ๋งํฌ์ฃผ์์ main_category ์ด๋ฆ์ get_catergory ํจ์๋ก ๋๊ฒจ์ค res = requests.get('http://corners.gmarket.co.kr/Bestsellers') soup = BeautifulSoup(res.content, 'html.parser') categories = soup.select('div.gbest-cate ul.by-group li a') for category in categories: get_category('http://corners.gmarket.co.kr/' + category['href'], category.get_text())
3) main_category, ์ํ ์ ๋ณด, ์ํ ์ฝ๋, ํ๋งค์ ํด๋ก๋ง
- ๋ช๋ช ์ํ๋ค์ ํด๋น ํ๊ฒ ํ๊ทธ๊ฐ ์กด์ฌํ์ง ์๊ฑฐ๋, ํ๊ทธ ์์ text๊ฐ ์์ด ์ค๋ฅ๊ฐ ๋ฐ์๋ ์ ์์
- ์ค๋ฅ๊ฐ ๋ฐ์ํ๋ฉด ํฌ๋กค๋ง์ด ์งํ๋์ง ์๊ธฐ ๋๋ฌธ์ ๊ทธ๋ฐ ์ํฉ์ ๋์ฒํ ์ ์๋ if๋ฌธ์ผ๋ก ์ฒ๋ฆฌํด์ค
- ๐ฅ ํฌ๋กค๋ง target : ๋๋ถ๋ฅ๋ช , ์๋ถ๋ฅ๋ช , ์์, ์ํ์ฝ๋, ํ๋งค์, ์ํ๋ช , ์๊ฐ๊ฒฉ, ํ ์ธ๊ฐ๊ฒฉ, ํ ์ธ๋ฅ
- ๋ณ์๋ช : category_name, sub_category_name, ranking, item_code, provider, title, ori_price, dis_price, discount_percent
โ๐ป python
import requests from bs4 import BeautifulSoup # 3๋จ๊ณ : item ์ ๋ณด๋ฅผ ๊ฐ์ ธ์ด def get_items(html, category_name, sub_category_name): best_item = html.select('div.best-list') # best_item์ด๋ผ๋ class๋ช ์ด ๋ค์์. ๋๋ฒ์งธ best_item ํด๋์ค ๋ด liํ๊ทธ์ ์ํ ์ ๋ณด๊ฐ ์์ if len(best_item[1].select('li')) > 0: for index, item in enumerate(best_item[1].select('li')): ranking = index + 1 title = item.select_one('a.itemname').get_text() ori_price = item.select_one('div.o-price') dis_price = item.select_one('div.s-price strong span') discount_percent = item.select_one('div.s-price em') # ์๋ ๊ฐ๊ฒฉ์ด ๋น์ด์๊ฑฐ๋, ํ ์คํธ๊ฐ ์์ ๊ฒฝ์ฐ if ori_price == None or ori_price.get_text() == '': ori_price = dis_price # ํ ์ธ ๊ฐ๊ฒฉ์ด ๋น์ด์์ ๊ฒฝ์ฐ, ์๋ ๊ฐ๊ฒฉ๊ณผ ํ ์ธ๊ฐ๊ฒฉ์ 0 ๋ฃ์(๋ฌด๋ฃ์ผ ๊ฒฝ์ฐ) if dis_price == None: ori_price, dis_price = 0, 0 else: ori_price = ori_price.get_text().replace(',', '').replace('์', '') dis_price = dis_price.get_text().replace(',', '').replace('์', '') # ํ ์ธ์จ์ด ๋น์ด์๊ฑฐ๋, ํ ์คํธ๊ฐ ์์ ๊ฒฝ์ฐ if discount_percent == None or discount_percent.get_text() == '': discount_percent = 0 else: discount_percent = discount_percent.get_text().replace('%', '') # ๊ฐ๊ฐ์ ์ํ ๋งํฌ product_link = item.select_one('div.thumb > a') item_code = product_link.attrs['href'].split('=')[1] # ํ๋งค์ crawling(๊ฐ ์ํ ์ ๋ณด์ ๋งํฌ๋ฅผ ํ๊ณ ๋ค์ด๊ฐ ํ๋งค์ ์ ๋ณด ํด๋ก๋ง) res = requests.get(product_link.attrs['href']) soup = BeautifulSoup(res.content, 'html.parser') provider = soup.select_one('div.item-topinfo_headline > p > span.text__seller > a') if provider == None: provider = '' else: provider = provider.get_text() print (category_name, sub_category_name, ranking, item_code, provider, title, ori_price, dis_price, discount_percent) # 2๋จ๊ณ : ๋๋ถ๋ฅ ๋งํฌ์ฃผ์์ ๋๋ถ๋ฅ ์ด๋ฆ์ get_category ํ๋ผ๋ฏธํฐ๋ก ๋๊น def get_category(category_link, category_name): res = requests.get(category_link) soup = BeautifulSoup(res.content, 'html.parser') # ์๋ถ๋ฅ ๋ค๋น๊ฒ์ด์ ํด๋ก๋ง sub_categories = soup.select('div.navi.group ul li > a') for sub_category in sub_categories: res = requests.get('http://corners.gmarket.co.kr/' + sub_category['href']) # ์๋ถ๋ฅ ๋งํฌ๋ฅผ requests soup = BeautifulSoup(res.content, 'html.parser') # ์๋ถ๋ฅ ์ฃผ์, ๋๋ถ๋ฅ ์ด๋ฆ, ์๋ถ๋ฅ ์ด๋ฆ์ get_items()์ ํ๋ผ๋ฏธํฐ๋ก ๋๊น get_items(soup, category_name, sub_category.get_text()) #1๋จ๊ณ : ๋๋ถ๋ฅ ๋งํฌ ์ฃผ์์ ๋๋ถ๋ฅ๋ช ์ ํฌ๋กค๋ง res = requests.get('http://corners.gmarket.co.kr/Bestsellers') soup = BeautifulSoup(res.content, 'html.parser') categories = soup.select('div.gbest-cate ul.by-group li a') for category in categories: get_category('http://corners.gmarket.co.kr/' + category['href'], category.get_text())
3) ํฌ๋กค๋ง ๋ฐ์ดํฐ ๋์ ๋๋ฆฌ์ ๋ด๊ธฐ
โ๐ป python
import requests from bs4 import BeautifulSoup def get_items(html, category_name, sub_category_name): # ์ผ๋ถ ์๋ธ ์นดํ ๊ณ ๋ฆฌ์ ๊ฒฝ์ฐ, ์ด๋ฒคํธ ํ์ด์ง์ ๊ฐ์ด ํ์๋ ์ ์์ผ๋ฏ๋ก, ํด๋น ์ผ์ด์ค๋ค์ skip ํจ (2020.09.30) best_item = html.select('div.best-list') if len(best_item[1].select('li')) > 0: for index, item in enumerate(best_item[1].select('li')): # ํด๋ก๋งํ ๋ฐ์ดํฐ๋ฅผ ๋์ ๋๋ฆฌ ํํ๋ก ๋ด๊ธฐ์ํด ๋์ ๋๋ฆฌ ์ ์ธ data_dict = dict() ranking = index + 1 title = item.select_one('a.itemname').get_text() ori_price = item.select_one('div.o-price') dis_price = item.select_one('div.s-price strong span') dis_percent = item.select_one('div.s-price em') if ori_price == None or ori_price.get_text() == '': ori_price = dis_price if dis_price == None: ori_price, dis_price = 0, 0 else: ori_price = ori_price.get_text().replace(',', '').replace('์', '') dis_price = dis_price.get_text().replace(',', '').replace('์', '') if dis_percent == None or dis_percent.get_text() == '': dis_percent = 0 else: dis_percent = dis_percent.get_text().replace('%', '') product_link = item.select_one('div.thumb > a') item_code = product_link.attrs['href'].split('=')[1] res = requests.get(product_link.attrs['href']) soup = BeautifulSoup(res.content, 'html.parser') provider = soup.select_one('div.item-topinfo_headline > p > span.text__seller > a') if provider == None: provider = '' else: provider = provider.get_text() # ์ ์ธํ ๋์ ๋๋ฆฌ์ ํด๋ก๋งํ ๋ฐ์ดํฐ ๋ด๊ธฐ data_dict['category_name'] = category_name data_dict['sub_category_name'] = sub_category_name data_dict['ranking'] = ranking data_dict['title'] = title data_dict['ori_price'] = ori_price data_dict['dis_price'] = dis_price data_dict['dis_percent'] = dis_percent data_dict['item_code'] = item_code data_dict['provider'] = provider print(data_dict) # print (category_name, sub_category_name, ranking, item_code, provider, title, ori_price, dis_price, dis_percent) # ๋๋ถ๋ฅ ๋งํฌ์ฃผ์, ๋๋ถ๋ฅ ์ด๋ฆ ํ๋ผ๋ฏธํฐ๋ก ๋ฐ์ ์๋ถ๋ฅ ํ์ฑ ๊ฒฐ๊ณผ์ ๋๋ถ๋ฅ ์ด๋ฆ, ์๋ถ๋ฅ ์ด๋ฆ ์ ๋ฌ def get_category(category_link, category_name): print(category_link, category_name) res = requests.get(category_link) soup = BeautifulSoup(res.content, 'html.parser') sub_categories = soup.select('div.navi.group ul li > a') for sub_category in sub_categories: res = requests.get('http://corners.gmarket.co.kr/' + sub_category['href']) soup = BeautifulSoup(res.content, 'html.parser') get_items(soup, category_name, sub_category.get_text()) # ๋ฒ ์คํธ100 ํ๋ฉด์์ ๋ค๋น๊ฒ์ด์ ์์ญ ํด๋ก๋งํ์ฌ ๋๋ถ๋ฅ ์ฃผ์์ ๋๋ถ๋ฅ ์ด๋ฆ ํฌ๋กค๋ง res = requests.get('http://corners.gmarket.co.kr/Bestsellers') soup = BeautifulSoup(res.content, 'html.parser') categories = soup.select('div.gbest-cate ul.by-group li a') for category in categories: get_category('http://corners.gmarket.co.kr/' + category['href'], category.get_text())
1) COUNT ํจ์
- item_code๋ PRIMARU KEY์ด๊ธฐ ๋๋ฌธ์ ๋์ผํ item_code๋ฅผ ๊ฐ์ง ๋ฐ์ดํฐ๋ ์ ์ฅ ๋ถ๊ฐ
- ์ด๋ฐ ๋ฌธ์ ๋ฅผ ํด๊ฒฐํ ์ ์๋ ๊ฒ์ด sql๋ฌธ์ COUNT ํจ์์
- ์ฆ, if๋ฌธ๊ณผ ํจ๊ป ์ฌ์ฉํ์ฌ ์ ์ฅ๋ ์ ์ด ์๋ item_code๋ง ๋ฐ์ดํฐ๋ก ์ฝ์ ํ๊ฒ๋ ์ ์ด
- COUNT ํจ์๋ ๋์ผํ ๋ฐ์ดํฐ๊ฐ ํ๋๋ผ๋ ์๋์ง ํ์ธํด์ ์์ผ๋ฉด 1, ์์ผ๋ฉด 0์ ๋๋ ค์ค
- SELECT COUNT(*) FROM [ํ ์ด๋ธ๋ช ] WHERE [์ปฌ๋ผ๋ช ] = [๋ฐ์ดํฐ๊ฐ]
- ๐ SELECT COUNT(*) FROM items WHERE item_code = item_info['item_code'];
โ๐ป python
import pymysql db = pymysql.connect(host='localhost', port=3306, user='root', passwd='๋ด ๋น๋ฐ๋ฒํธ', db='gmarketbest', charset='utf8') cursor = db.cursor() # gmarket best ํฌ๋กค๋ง import requests from bs4 import BeautifulSoup # 5๋จ๊ณ : mysql ๋ฐ์ดํฐ insert def save_data(item_info): # print (item_info) # 5-3๋จ๊ณ : COUNT ํจ์๋ก item_code ์ค๋ณต์ผ๋ก ์ธํ ์ค๋ฅ ์๋ฐฉ : item_code๊ฐ ์ด๋ฏธ ์กด์ฌํ๋ฉด ํด๋น ์ํ์ insertํ์ง ์์ sql = """SELECT COUNT(*) FROM items WHERE item_code = '""" + item_info['item_code'] + """';""" cursor.execute(sql) result = cursor.fetchone() # print (result[0]) # 5-1๋จ๊ณ : items ํ ์ด๋ธ date insert if result[0] == 0: sql = """INSERT INTO items VALUES('""" + item_info['item_code'] + """', '""" + item_info['title'] + """', """ + str(item_info['ori_price']) + """, """ + str(item_info['dis_price']) + """, """ + str(item_info['dis_percent']) + """, '""" + item_info['provider'] + """')""" print (sql) cursor.execute(sql) # 5-2๋จ๊ณ : ranking ํ ์ด๋ธ date insert sql = """INSERT INTO ranking (main_category, sub_category, item_ranking, item_code) VALUES('""" + item_info['category_name'] + """', '""" + item_info['sub_category_name'] + """', '""" + str(item_info['ranking']) + """', '""" + item_info['item_code'] + """')""" print (sql) cursor.execute(sql) db.commit() db.close()
โ๐ป python
import pymysql db = pymysql.connect(host='localhost', port=3306, user='root', passwd='๋ด ๋น๋ฐ๋ฒํธ', db='gmarketbest', charset='utf8') cursor = db.cursor() # gmarket best ํฌ๋กค๋ง import requests from bs4 import BeautifulSoup # 5๋จ๊ณ : mysql ๋ฐ์ดํฐ insert def save_data(item_info): # print (item_info) # 5-3๋จ๊ณ : COUNT ํจ์๋ก item_code ์ค๋ณต์ผ๋ก ์ธํ ์ค๋ฅ ์๋ฐฉ : item_code๊ฐ ์ด๋ฏธ ์กด์ฌํ๋ฉด ํด๋น ์ํ์ insertํ์ง ์์ sql = """SELECT COUNT(*) FROM items WHERE item_code = '""" + item_info['item_code'] + """';""" cursor.execute(sql) result = cursor.fetchone() # print (result[0]) # 5-1๋จ๊ณ : items ํ ์ด๋ธ date insert if result[0] == 0: sql = """INSERT INTO items VALUES('""" + item_info['item_code'] + """', '""" + item_info['title'] + """', """ + str(item_info['ori_price']) + """, """ + str(item_info['dis_price']) + """, """ + str(item_info['dis_percent']) + """, '""" + item_info['provider'] + """')""" print (sql) cursor.execute(sql) # 5-2๋จ๊ณ : ranking ํ ์ด๋ธ date insert sql = """INSERT INTO ranking (main_category, sub_category, item_ranking, item_code) VALUES('""" + item_info['category_name'] + """', '""" + item_info['sub_category_name'] + """', '""" + str(item_info['ranking']) + """', '""" + item_info['item_code'] + """')""" print (sql) cursor.execute(sql) db.commit() db.close() # 3๋จ๊ณ : data ์ถ์ถ(๋๋ถ๋ฅ๋ช , ์๋ถ๋ฅ๋ช , ์์, ์ํ์ฝ๋, ํ๋งค์, ์ํ๋ช , ์๊ฐ๊ฒฉ, ํ ์ธ๊ฐ๊ฒฉ, ํ ์ธ๋ฅ ) def get_items(html, category_name, sub_category_name): best_item = html.select('div.best-list') if len(best_item[1].select('li')) > 0: # 3-1๋จ๊ณ : data ์ถ์ถ(๋๋ถ๋ฅ๋ช , ์๋ถ๋ฅ๋ช , ์์, ์ํ๋ช , ์๊ฐ๊ฒฉ, ํ ์ธ๊ฐ๊ฒฉ, ํ ์ธ๋ฅ ) for index, item in enumerate(best_item[1].select('li')): data_dict = dict() ranking = index + 1 title = item.select_one('a.itemname').get_text() ori_price = item.select_one('div.o-price') dis_price = item.select_one('div.s-price strong span') dis_percent = item.select_one('div.s-price em') if ori_price == None or ori_price.get_text() == '': ori_price = dis_price if dis_price == None: ori_price, dis_price = 0, 0 else: ori_price = ori_price.get_text().replace(',', '').replace('์', '') dis_price = dis_price.get_text().replace(',', '').replace('์', '') if dis_percent == None or dis_percent == '': dis_percent = 0 else: dis_percent = dis_percent.get_text().replace('%', '') # 3-2๋จ๊ณ : data ์ถ์ถ(์ํ์ฝ๋, ํ๋งค์) product_link = item.select_one('div.thumb > a') item_code = product_link.attrs['href'].split('=')[1].split('&')[0] res = requests.get(product_link.attrs['href']) soup = BeautifulSoup(res.content, 'html.parser') provider = soup.select_one('div.item-topinfo_headline > p > span.text__seller > a') if provider == None: provider = '' else: provider = provider.get_text() # print(category_name, sub_category_name, ranking, item_code, provider, title, ori_price, dis_price, dis_percent) # 4๋จ๊ณ : dict ํ์์ผ๋ก ์ ์ฅ data_dict['category_name'] = category_name data_dict['sub_category_name'] = sub_category_name data_dict['ranking'] = ranking data_dict['title'] = title data_dict['ori_price'] = ori_price data_dict['dis_price'] = dis_price data_dict['dis_percent'] = dis_percent data_dict['item_code'] = item_code data_dict['provider'] = provider save_data(data_dict) # 2๋จ๊ณ : ์ค๋ถ๋ฅ ํด๋ก๋ง def get_category(category_link, category_name): res = requests.get(category_link) soup = BeautifulSoup(res.content, 'html.parser') sub_categories = soup.select('div.navi.group ul li a') for sub_category in sub_categories: res = requests.get('http://corners.gmarket.co.kr/'+sub_category['href']) soup = BeautifulSoup(res.content, 'html.parser') get_items(soup, category_name, sub_category.get_text()) # 1๋จ๊ณ : ๋๋ถ๋ฅ ๋งํฌ ์ฃผ์ ๋ฐ ๋๋ถ๋ฅ๋ช ํฌ๋กค๋ง res = requests.get('http://corners.gmarket.co.kr/Bestsellers') soup = BeautifulSoup(res.content, 'html.parser') categories = soup.select('div.gbest-cate ul.by-group li a') for category in categories: get_category('http://corners.gmarket.co.kr/'+category['href'], category.get_text())