스타벅스와 이디야 데이터 가져오기(SQL)

TaeHwi Kang·2022년 11월 16일
0

1. AWS RDS에 데이터 저장을 위한 작업하기

1) Database 생성 후, 접근 가능한 사용자 계정을 만들기

# Connect to the AWS RDS instance as the admin account (no default schema yet),
# then create the working database plus a dedicated, fully-granted user.
import mysql.connector

conn = mysql.connector.connect(
    host = "<database-1>",
    port = 3306,
    user = 'admin',
    password = '<password>',
)

cursor = conn.cursor(buffered=True)

# DDL / account statements auto-commit in MySQL, so no explicit commit needed.
for statement in (
    "CREATE DATABASE oneday",
    "CREATE USER 'oneday'@'%' identified by '1234'",
    "GRANT ALL ON oneday.* to 'oneday'@'%'",
):
    cursor.execute(statement)

2) Database oneday에 스타벅스 이디야 데이터를 저장할 테이블 생성

cursor.execute("use oneday")

# Brand lookup table: one row per coffee brand (1 = Starbucks, 2 = Ediya).
# FIX: the original multi-line single-quoted strings were a SyntaxError;
# triple-quoted strings carry the identical SQL.
sql_1 = """CREATE TABLE COFFEE_BRAND
        (
        id int NOT NULL AUTO_INCREMENT PRIMARY KEY,
        name varchar(10)
        )"""
cursor.execute(sql_1)

# Store table: one row per shop, foreign key to COFFEE_BRAND.
sql_2 = """CREATE TABLE COFFEE_STORE
        (
        id int NOT NULL AUTO_INCREMENT PRIMARY KEY,
        brand int,
        name varchar(32) NOT NULL,
        gu_name varchar(5) NOT NULL,
        address varchar(128) NOT NULL,
        lat decimal(16,14) NOT NULL,
        lng decimal(17,14) NOT NULL,
        FOREIGN KEY (brand) REFERENCES COFFEE_BRAND(id)
        )"""
cursor.execute(sql_2)

conn.close()

2. 데이터 가져오기

1) COFFEE_BRAND에 스타벅스와 이디야 값 넣기

# Seed COFFEE_BRAND with the two brands used throughout the analysis.
import mysql.connector

conn = mysql.connector.connect(
    host = "<database-1>",
    port = 3306,
    user = 'oneday',
    password = '1234',
    database = 'oneday'
)

cursor = conn.cursor(buffered=True)
# BUG FIX: the original inserted 'STABUCK', but every later query filters on
# b.name = 'STARBUCKS' and would therefore return no rows.
cursor.execute("INSERT INTO COFFEE_BRAND VALUES (1,'STARBUCKS'),(2,'EDIYA')")
conn.commit()
conn.close()

2)_1 셀레니움으로 스타벅스 홈페이지에서 매장 데이터 가져오기

# Scrape Seoul Starbucks store data with Selenium + BeautifulSoup.
from selenium import webdriver
import time
import warnings
from bs4 import BeautifulSoup 
from tqdm import tqdm_notebook
warnings.simplefilter(action='ignore')

# Open the Starbucks store-locator page.
# NOTE(review): find_element_by_* was removed in Selenium 4 and the
# positional chromedriver path is deprecated — this assumes Selenium 3.x
# with a local Windows chromedriver; confirm the pinned version.
url = "https://www.starbucks.co.kr/store/store_map.do"
driver = webdriver.Chrome('./driver/chromedriver.exe')
driver.get(url)

# Close the holiday-notice popup if present; otherwise just log and continue.
from selenium.common.exceptions import NoSuchElementException

try:
   driver.find_element_by_css_selector('.holiday_notice_close a').click()
except NoSuchElementException as e:
    print(e)

# Open the "search by region" tab.
xpath_one ='//*[@id="container"]/div/form/fieldset/div/section/article[1]/article/header[2]'
some_tag_one = driver.find_element_by_xpath(xpath_one)
some_tag_one.click()

# Select "Seoul" from the region list.
xpath_seoul ='//*[@id="container"]/div/form/fieldset/div/section/article[1]/article/article[2]/div[1]/div[2]/ul/li[1]/a'
some_tag_seoul = driver.find_element_by_xpath(xpath_seoul)
some_tag_seoul.click()

# Select "All of Seoul" so every store is listed at once.
xpath_seoul_all ='//*[@id="mCSB_2_container"]/ul/li[1]/a'
some_tag_seoul_all = driver.find_element_by_xpath(xpath_seoul_all)
some_tag_seoul_all.click()

# Parse the rendered DOM; each store is an <li class="quickResultLstCon">
# carrying data-name / data-lat / data-long attributes (used downstream).
star_req = driver.page_source
star_soup = BeautifulSoup(star_req, "html.parser")

test_soup = star_soup.find_all("li", {"class":"quickResultLstCon"})

driver.close()

2)_2 가져온 스타벅스 데이터를 AWS RDS(sql) COFFEE_STORE에 바로 입력

# Insert the scraped Starbucks stores (test_soup, from the Selenium step)
# into COFFEE_STORE under brand id 1.
import mysql.connector

conn = mysql.connector.connect(
    host = "<database-1>",
    port = 3306,
    user = 'oneday',
    password = '1234',
    database = 'oneday'
)
cursor = conn.cursor(buffered=True)

# FIX: implicit string concatenation replaces the original multi-line
# single-quoted string, which was a SyntaxError.
sql = ("INSERT INTO COFFEE_STORE (brand, name, gu_name, address, lat, lng) "
       "VALUES (1 ,%s,%s,%s,%s,%s)")

# BUG FIX: the original looped range(1, len(test_soup)) and silently dropped
# the first store; iterate every parsed <li>.
for store in tqdm_notebook(test_soup):
    name = store["data-name"].strip()
    lat = store["data-lat"].strip()
    lng = store["data-long"].strip()
    # The <p> text ends with a 9-character suffix (presumably the phone
    # number — TODO confirm against the live page); slice it off so only
    # the street address remains.
    p_text = store.find("p").get_text()
    address = p_text[:-9]
    gu_name = address.split()[1]  # "서울특별시 <구> ..." → second token

    cursor.execute(sql, (name, gu_name, address, lat, lng))
    conn.commit()

conn.close()

3)_1 셀레니움으로 이디야 홈페이지에서 매장 데이터 가져오기

# Open the Ediya store-finder page.
url = "https://www.ediya.com/contents/find_store.html#c"
driver = webdriver.Chrome('./driver/chromedriver.exe')
driver.get(url)

# Switch the finder to address-based search.
xpath_address = '//*[@id="contentWrap"]/div[3]/div/div[1]/ul/li[2]/a'
some_tag_address = driver.find_element_by_xpath(xpath_address)
some_tag_address.click()

# Searching by gu name alone returns too many hits, so build search keywords
# of the form "서울 <gu>" from the distinct gu names already stored for
# Starbucks.
import mysql.connector

conn = mysql.connector.connect(
    host = "<database-1>",
    port = 3306,
    user = 'oneday',
    password = '1234',
    database = 'oneday'
)

# BUG FIX: the original reused the cursor of the previous, already-closed
# connection; a cursor must be created from the live connection.
cursor = conn.cursor(buffered=True)

cursor.execute("select distinct(gu_name) from COFFEE_STORE")
result = cursor.fetchall()

gu_list = ["서울 " + row[0] for row in result]

3)_2 이디야 데이터를 AWS RDS(sql) COFFEE_STORE에 바로 입력

# Search each "서울 <구>" keyword on the Ediya finder, parse the results, and
# insert them into COFFEE_STORE under brand id 2.
import googlemaps

gmaps_key = "GOOGLE_KEY"
gmaps = googlemaps.Client(key=gmaps_key)

# FIX: implicit string concatenation replaces the original multi-line
# single-quoted string, which was a SyntaxError.
sql = ("INSERT INTO COFFEE_STORE (brand, name, gu_name, address, lat, lng) "
       "VALUES (2 ,%s,%s,%s,%s,%s)")

for keyword in tqdm_notebook(gu_list):
    search_box = driver.find_element_by_xpath('//*[@id="keyword"]')
    search_box.send_keys(keyword)
    time.sleep(1)
    driver.find_element_by_xpath('//*[@id="keyword_div"]/form/button').click()
    time.sleep(1)
    edi_soup = BeautifulSoup(driver.page_source, "html.parser")

    # Hoisted: the original re-ran find_all three times per result item.
    items = edi_soup.find_all("li", {"class": "item"})
    for item in items:
        name = item.find("dt").text          # store name
        text = item.get_text()
        parts = text.split()
        gu_name = parts[2]                   # gu token within the item text

        # Rebuild the address from the item's tokens; the short form pads
        # with the (truncated) first token when fewer pieces are present.
        if len(parts) > 4:
            address = parts[1] + parts[2] + parts[3] + parts[4]
        else:
            address = parts[1] + parts[2] + parts[3] + parts[0][0:-1]

        # PERF FIX: geocode once per store (the original hit the Google API
        # twice — once for lat, once for lng).
        location = gmaps.geocode(address)[0].get("geometry")["location"]
        lat = location["lat"]
        lng = location["lng"]

        # BUG FIX: the original stored the raw item text in the address
        # column; store the cleaned address, consistent with the Starbucks
        # insert and the varchar(128) address column.
        cursor.execute(sql, (name, gu_name, address, lat, lng))
        conn.commit()

    time.sleep(1)
    search_box.clear()

driver.close()

3. 가져온 데이터로 쿼리로 조회

1) 스타벅스 매장 주요 분포 지역 검색

# Top 9 gu by Starbucks store count (gu name, count).
# FIX: triple-quoted string replaces the original multi-line single-quoted
# string, which was a SyntaxError.
sql = """select s.gu_name, count(s.brand) from COFFEE_BRAND b, COFFEE_STORE s
        where b.id = s.brand AND b.name='STARBUCKS'
        GROUP BY s.gu_name
        order by count(s.brand) desc
        limit 9"""

cursor.execute(sql)
result = cursor.fetchall()

for row in result:
    print(row)
 
 # 검색 결과
('강남구', 88)
('중구', 55)
('서초구', 48)
('마포구', 41)
('영등포구', 41)
('종로구', 40)
('송파구', 34)
('용산구', 25)
('강서구', 25)

2) 이디야 매장 주요 분포 지역 검색

# Top 9 gu by Ediya store count (gu name, count).
# FIX: triple-quoted string replaces the original multi-line single-quoted
# string, which was a SyntaxError.
sql = """select s.gu_name, count(s.brand) from COFFEE_BRAND b, COFFEE_STORE s
        where b.id = s.brand AND b.name='EDIYA'
        GROUP BY s.gu_name
        order by count(s.brand) desc
        limit 9"""

cursor.execute(sql)
result = cursor.fetchall()

for row in result:
    print(row)
 
 # 검색 결과
('강서구', 45)
('강남구', 44)
('영등포구', 41)
('송파구', 39)
('마포구', 37)
('종로구', 31)
('성북구', 31)
('중구', 30)
('노원구', 30)

3) 구별 브랜드 각각의 매장 개수 조회

# Per-gu store counts for both brands side by side (gu, Starbucks, Ediya).
# FIX: triple-quoted string replaces the original multi-line single-quoted
# string, which was a SyntaxError.
sql = """select s.gu_name, s.count STARBUCKS, e.count EDIYA
       from (select gu_name, count(brand) count from COFFEE_STORE WHERE brand = 1 GROUP BY gu_name) s,
            (select gu_name, count(brand) count from COFFEE_STORE WHERE brand = 2 GROUP BY gu_name) e
       where s.gu_name = e.gu_name"""

cursor.execute(sql)
result = cursor.fetchall()

for row in result:
    print(row)
    
# 검색 결과 (구, 스타벅스, 이디야)
('은평구', 12, 26)
('서대문구', 23, 21)
('마포구', 41, 37)
('강남구', 88, 44)
('강북구', 6, 20)
('강서구', 25, 45)
('관악구', 12, 28)
('광진구', 19, 24)
('금천구', 13, 23)
('노원구', 14, 30)
('도봉구', 4, 24)
('동작구', 11, 23)
('서초구', 48, 29)
('성북구', 15, 31)
('송파구', 34, 39)
('양천구', 17, 27)
('영등포구', 41, 41)
('종로구', 40, 31)
('중구', 55, 30)
('강동구', 17, 27)
('구로구', 13, 29)
('동대문구', 9, 25)
('성동구', 14, 22)
('용산구', 25, 14)
('중랑구', 8, 28)   

4. 시각화 프로젝트를 위하여 CSV 파일로 저장

# Dump the joined data to CSV for the visualization project.
import pandas as pd

# FIX: triple-quoted string replaces the original multi-line single-quoted
# string, which was a SyntaxError.
# NOTE(review): this is a cross join — every Starbucks row is paired with
# every Ediya row (no join condition between st and ed), matching the
# original query; confirm that is intended for the visualization step.
sql = """select *
        from (select s.id s_id, b.name s_brand, s.name s_name, s.gu_name s_gu, s.address s_address, s.lat s_lat, s.lng s_lng
              from COFFEE_STORE s, COFFEE_BRAND b
              where b.id = s.brand and b.name like 'STARBUCKS') st,
             (select s.id e_id, b.name e_brand, s.name e_name, s.gu_name e_gu, s.address e_address, s.lat e_lat, s.lng e_lng
             from COFFEE_STORE s, COFFEE_BRAND b
             where b.id = s.brand and b.name like 'EDIYA') ed
        order by st.s_id, ed.e_id"""

cursor.execute(sql)
result = cursor.fetchall()
# Column labels come from the cursor metadata (first element of each entry).
field_names = [col[0] for col in cursor.description]

df = pd.DataFrame(result, columns=field_names)

df.to_csv('coffee_output.csv', index=False, encoding="utf-8")
conn.close()
profile
스터디 노트

0개의 댓글