1. AWS RDS에 데이터 저장을 위한 작업하기
1) Database 생성 후, 접근 가능한 사용자 계정을 만들기
# Connect to AWS RDS with the admin account to bootstrap the project database.
import mysql.connector

conn = mysql.connector.connect(
    host="<database-1>",
    port=3306,
    user='admin',
    password='<password>',
)
cursor = conn.cursor(buffered=True)

# Create the working database, then a dedicated user that is granted
# full rights on that database only (used by all later steps).
cursor.execute("CREATE DATABASE oneday")
cursor.execute("CREATE USER 'oneday'@'%' identified by '1234'")
cursor.execute("GRANT ALL ON oneday.* to 'oneday'@'%'")
2) Database oneday에 스타벅스 이디야 데이터를 저장할 테이블 생성
# Create the two tables used by the scrape: a brand lookup table and the
# store table that references it.
cursor.execute("use oneday")

# BUG FIX: the original assigned a string literal that spanned several lines
# with plain double quotes, which is a Python SyntaxError; triple-quoted
# strings preserve the same SQL text.

# Lookup table holding the brand names.
sql_1 = """CREATE TABLE COFFEE_BRAND
(
id int NOT NULL AUTO_INCREMENT PRIMARY KEY,
name varchar(10)
)"""
cursor.execute(sql_1)

# One row per store; `brand` is a foreign key into COFFEE_BRAND.
sql_2 = """CREATE TABLE COFFEE_STORE
(
id int NOT NULL AUTO_INCREMENT PRIMARY KEY,
brand int,
name varchar(32) NOT NULL,
gu_name varchar(5) NOT NULL,
address varchar(128) NOT NULL,
lat decimal(16,14) NOT NULL,
lng decimal(17,14) NOT NULL,
FOREIGN KEY (brand) REFERENCES COFFEE_BRAND(id)
)"""
cursor.execute(sql_2)
conn.close()
2. 데이터 가져오기
1) COFFEE_BRAND에 스타벅스와 이디야 값 넣기
# Reconnect with the dedicated 'oneday' user and seed the brand lookup table.
import mysql.connector

conn = mysql.connector.connect(
    host="<database-1>",
    port=3306,
    user='oneday',
    password='1234',
    database='oneday'
)
cursor = conn.cursor(buffered=True)

# BUG FIX: the original inserted the misspelled brand 'STABUCK', but the
# later report queries filter on b.name='STARBUCKS' and would match no rows.
# 'STARBUCKS' (9 chars) still fits the varchar(10) column.
cursor.execute("INSERT INTO COFFEE_BRAND VALUES (1,'STARBUCKS'),(2,'EDIYA')")
conn.commit()
conn.close()
2)_1 셀레니움으로 스타벅스 홈페이지에서 매장 데이터 가져오기
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
import time
import warnings
from bs4 import BeautifulSoup
from tqdm import tqdm_notebook

warnings.simplefilter(action='ignore')

# Open the Starbucks store-locator page.
url = "https://www.starbucks.co.kr/store/store_map.do"
driver = webdriver.Chrome('./driver/chromedriver.exe')
driver.get(url)

# Dismiss the holiday-notice popup if it happens to be shown.
try:
    driver.find_element_by_css_selector('.holiday_notice_close a').click()
except NoSuchElementException as e:
    print(e)

# Drill down through the locator UI:
#   region search tab -> "Seoul" -> "all of Seoul".
navigation_xpaths = (
    '//*[@id="container"]/div/form/fieldset/div/section/article[1]/article/header[2]',
    '//*[@id="container"]/div/form/fieldset/div/section/article[1]/article/article[2]/div[1]/div[2]/ul/li[1]/a',
    '//*[@id="mCSB_2_container"]/ul/li[1]/a',
)
for xpath in navigation_xpaths:
    driver.find_element_by_xpath(xpath).click()

# Parse the rendered page; each store is one <li class="quickResultLstCon">.
star_soup = BeautifulSoup(driver.page_source, "html.parser")
test_soup = star_soup.find_all("li", {"class":"quickResultLstCon"})
driver.close()
2)_2 가져온 스타벅스 데이터를 AWS RDS(sql) COFFEE_STORE에 바로 입력
# Insert the scraped Starbucks stores (test_soup from the previous step)
# into COFFEE_STORE with brand id 1.
import mysql.connector

conn = mysql.connector.connect(
    host="<database-1>",
    port=3306,
    user='oneday',
    password='1234',
    database='oneday'
)
cursor = conn.cursor(buffered=True)

# BUG FIX: the original string literal spanned two lines without triple
# quotes, which is a Python SyntaxError; the SQL text itself is unchanged.
sql = """INSERT INTO COFFEE_STORE (brand, name, gu_name, address, lat, lng)
VALUES (1 ,%s,%s,%s,%s,%s)"""

# NOTE(review): index 0 of test_soup is skipped here, as in the original
# (range started at 1) — confirm the first <li> is not a real store.
for item in tqdm_notebook(test_soup[1:]):
    name = item["data-name"].strip()
    lat = item["data-lat"].strip()
    lng = item["data-long"].strip()
    # The <p> text carries a trailing 9-character suffix; drop it.
    address = item.find("p").get_text()[:-9]
    # presumably "서울특별시 <gu> ..." — the second token is the gu name
    gu_name = address.split()[1]
    cursor.execute(sql, (name, gu_name, address, lat, lng))
conn.commit()
conn.close()
3)_1 셀레니움으로 이디야 홈페이지에서 매장 데이터 가져오기
# Open the Ediya store-finder page.
url = "https://www.ediya.com/contents/find_store.html#c"
driver = webdriver.Chrome('./driver/chromedriver.exe')
driver.get(url)

# Switch the finder to address-based search.
xpath_address = '//*[@id="contentWrap"]/div[3]/div/div[1]/ul/li[2]/a'
driver.find_element_by_xpath(xpath_address).click()

# Searching by gu name alone returns too many results, so the queries are
# built as "서울 <gu name>" from the gu names already stored for Starbucks.
import mysql.connector

conn = mysql.connector.connect(
    host="<database-1>",
    port=3306,
    user='oneday',
    password='1234',
    database='oneday'
)
# BUG FIX: the original reused the cursor of the connection that was closed
# in the previous step; a cursor must be created on this new connection.
cursor = conn.cursor(buffered=True)

cursor.execute("select distinct(gu_name) from COFFEE_STORE")
gu_list = ["서울 " + row[0] for row in cursor.fetchall()]
3)_2 이디야 데이터를 AWS RDS(sql) COFFEE_STORE에 바로 입력
# Search each "서울 <gu>" query on the Ediya page, geocode every returned
# store with the Google Maps API, and insert it with brand id 2.
import googlemaps

gmaps_key = "GOOGLE_KEY"
gmaps = googlemaps.Client(key=gmaps_key)

# BUG FIX: the original string literal spanned two lines without triple
# quotes, which is a Python SyntaxError; the SQL text itself is unchanged.
sql = """INSERT INTO COFFEE_STORE (brand, name, gu_name, address, lat, lng)
VALUES (2 ,%s,%s,%s,%s,%s)"""

for query in tqdm_notebook(gu_list):
    # Type the query into the search box and submit it.
    search_box = driver.find_element_by_xpath('//*[@id="keyword"]')
    search_box.send_keys(query)
    time.sleep(1)
    driver.find_element_by_xpath('//*[@id="keyword_div"]/form/button').click()
    time.sleep(1)

    edi_soup = BeautifulSoup(driver.page_source, "html.parser")
    # PERF: the original re-ran find_all() several times per store;
    # run it once per results page instead.
    for item in edi_soup.find_all("li", {"class":"item"}):
        name = item.find("dt").text          # store name
        e = item.get_text()                  # full item text (name + address)
        parts = e.split()
        gu_name = parts[2]                   # gu token of the address
        # Build a compact address string for geocoding.
        if len(parts) > 4:
            geocode_addr = parts[1] + parts[2] + parts[3] + parts[4]
        else:
            geocode_addr = parts[1] + parts[2] + parts[3] + parts[0][0:-1]
        # PERF: geocode once per store instead of twice (the original
        # issued a separate API call each for lat and lng).
        location = gmaps.geocode(geocode_addr)[0].get("geometry")["location"]
        lat = location["lat"]
        lng = location["lng"]
        # NOTE(review): the raw item text `e` is stored as the address while
        # the cleaned-up string is only used for geocoding (matches the
        # original behavior) — confirm this is intended.
        cursor.execute(sql, (name, gu_name, e, lat, lng))
        conn.commit()
    time.sleep(1)
    search_box.clear()
driver.close()
3. 가져온 데이터로 쿼리로 조회
1) 스타벅스 매장 주요 분포 지역 검색
# 매장수를 구이름, 매장 개수 출력
# Top 9 gu by Starbucks store count.
# BUG FIX: the original multi-line string used plain double quotes across
# lines, which is a Python SyntaxError; the SQL text itself is unchanged.
sql = """select s.gu_name, count(s.brand) from COFFEE_BRAND b, COFFEE_STORE s
where b.id = s.brand AND b.name='STARBUCKS'
GROUP BY s.gu_name
order by count(s.brand) desc
limit 9"""
cursor.execute(sql)
result = cursor.fetchall()
for row in result:
    print(row)
# 검색 결과
('강남구', 88)
('중구', 55)
('서초구', 48)
('마포구', 41)
('영등포구', 41)
('종로구', 40)
('송파구', 34)
('용산구', 25)
('강서구', 25)
2) 이디야 매장 주요 분포 지역 검색
# 매장수를 구이름, 매장 개수 출력
# Top 9 gu by Ediya store count.
# BUG FIX: the original multi-line string used plain double quotes across
# lines, which is a Python SyntaxError; the SQL text itself is unchanged.
sql = """select s.gu_name, count(s.brand) from COFFEE_BRAND b, COFFEE_STORE s
where b.id = s.brand AND b.name='EDIYA'
GROUP BY s.gu_name
order by count(s.brand) desc
limit 9"""
cursor.execute(sql)
result = cursor.fetchall()
for row in result:
    print(row)
# 검색 결과
('강서구', 45)
('강남구', 44)
('영등포구', 41)
('송파구', 39)
('마포구', 37)
('종로구', 31)
('성북구', 31)
('중구', 30)
('노원구', 30)
3) 구별 브랜드 각각의 매장 개수 조회
# Side-by-side store counts per gu: (gu, starbucks count, ediya count).
# BUG FIX: the original multi-line string used plain double quotes across
# lines, which is a Python SyntaxError; the SQL text itself is unchanged.
sql = """select s.gu_name, s.count STARBUCKS, e.count EDIYA
from (select gu_name, count(brand) count from COFFEE_STORE WHERE brand = 1 GROUP BY gu_name) s,
(select gu_name, count(brand) count from COFFEE_STORE WHERE brand = 2 GROUP BY gu_name) e
where s.gu_name = e.gu_name"""
cursor.execute(sql)
result = cursor.fetchall()
for row in result:
    print(row)
# 검색 결과 (구, 스타벅스, 이디야)
('은평구', 12, 26)
('서대문구', 23, 21)
('마포구', 41, 37)
('강남구', 88, 44)
('강북구', 6, 20)
('강서구', 25, 45)
('관악구', 12, 28)
('광진구', 19, 24)
('금천구', 13, 23)
('노원구', 14, 30)
('도봉구', 4, 24)
('동작구', 11, 23)
('서초구', 48, 29)
('성북구', 15, 31)
('송파구', 34, 39)
('양천구', 17, 27)
('영등포구', 41, 41)
('종로구', 40, 31)
('중구', 55, 30)
('강동구', 17, 27)
('구로구', 13, 29)
('동대문구', 9, 25)
('성동구', 14, 22)
('용산구', 25, 14)
('중랑구', 8, 28)
4. 시각화 프로젝트를 위하여 CSV 파일로 저장
# Export the joined data to CSV for the visualization project.
import pandas as pd

# BUG FIX: the original multi-line string used plain double quotes across
# lines, which is a Python SyntaxError; the SQL text itself is unchanged.
# NOTE(review): the two sub-selects are joined with no join condition, so
# this produces the cartesian product of every Starbucks row with every
# Ediya row — confirm that is the intended shape for the CSV.
sql = """select *
from (select s.id s_id, b.name s_brand, s.name s_name, s.gu_name s_gu, s.address s_address, s.lat s_lat, s.lng s_lng
from COFFEE_STORE s, COFFEE_BRAND b
where b.id = s.brand and b.name like 'STARBUCKS') st,
(select s.id e_id, b.name e_brand, s.name e_name, s.gu_name e_gu, s.address e_address, s.lat e_lat, s.lng e_lng
from COFFEE_STORE s, COFFEE_BRAND b
where b.id = s.brand and b.name like 'EDIYA') ed
order by st.s_id, ed.e_id"""
cursor.execute(sql)
result = cursor.fetchall()
# Column names come straight from the cursor metadata.
field_names = [col[0] for col in cursor.description]
df = pd.DataFrame(result, columns=field_names)
df.to_csv('coffee_output.csv', index=False, encoding="utf-8")
conn.close()