Coffee shops analysis (MySQL)

J·2024년 9월 4일

EDA ENG aws rds python sql

EDA (Exploratory Data Analysis)

목록 보기

12/17

Problem 1.

Create a project-related database in AWS RDS (MySQL) and create an accessible user account.

Database Name: oneday
User Name / Password: oneday / 1234

Call required module

pip install mysql-connector-python
import mysql.connector

import mysql.connector
mydb = mysql.connector.connect(
    host = "your aws rds's host name",
    port = 3306,
    user = "your ID",
    password = "your password"
)

cursor = mydb.cursor(buffered = True)

Account setting

sql = 'create database oneday default character set utf8mb4'
cursor.execute(sql)

cursor.execute("create user 'oneday'@'%' identified by '1234'")
cursor.execute("grant all on oneday.* to 'oneday'@'%'")

Checking

Database creation statement query result: SHOW CREATE DATABASE oneday;
User permission check result: SHOW GRANT FOR ‘oneday’@‘%’

result1 = "show create database oneday"
cursor.execute(result1)

result1 = cursor.fetchall()
for i in result1:
    print(i)

result2 = "show grants for 'oneday'@'%'"
cursor.execute(result2)

result2 = cursor.fetchall()
for i in result2:
    print(i)

Problem 2.

Create a table to store Starbucks Ediya data

cursor.execute('use oneday')

cBrand = "create table COFFEE_BRAND(id int not null auto_increment primary key, name varchar(12))"
cursor.execute(cBrand)

cStore = "create table COFFEE_STORE(id int not null auto_increment primary key, brand int, name varchar(32) not null, gu_name varchar(5) not null, address varchar(128) not null, lat decimal(16,14) not null, lng decimal(17,14) not null, foreign key (brand) references COFFEE_BRAND(id))"
cursor.execute(cStore)

Checking

Table creation result: Desc COFFEE_BRAND; Desc COFFEE_STORE;
COFFEE_BRAND query result: SELECT * FROM COFFEE_BRAND;

result3 = 'desc COFFEE_BRAND'
cursor.execute(result3)

result3 = cursor.fetchall()
for i in result3:
    print(i)

result4 = 'desc COFFEE_STORE'
cursor.execute(result4)

result4 = cursor.fetchall()
for i in result4:
    print(i)

Problem 3.

Enter and check the COFFEE_BRAND data with Python code as follows.

cursor = mydb.cursor(buffered = True)
cursor.execute("insert into COFFEE_BRAND values (1, 'STARBUCKS'), (2, 'EDIYA')")
conn.commit()

Checking

result5 = "select * from COFFEE_BRAND"
cursor.execute(result5)

result5 = cursor.fetchall()
for i in result5:
    print(i)

Problem 4.

When importing data from the Starbucks page with Python code, modify it to enter directly into the COFFEE_STORE table.

url = "https://www.starbucks.co.kr/store/store_map.do"
driver = webdriver.Chrome()
driver.get(url)

driver.find_element(By.CSS_SELECTOR,'#container > div > form > fieldset > div > section > article.find_store_cont > article > header.loca_search > h3 > a').click()
time.sleep(0.5)
driver.find_element(By.CSS_SELECTOR, '.set_sido_cd_btn').click()
time.sleep(0.5)

xpath = '//*[@id="mCSB_2_container"]/ul/li[1]/a'
tag = driver.find_element(By.XPATH, xpath)
tag.click()
time.sleep(0.5)

soup = BeautifulSoup(driver.page_source, "html.parser")
seoul_list = driver.find_elements(By.CSS_SELECTOR, '#mCSB_3_container ul li')
soup.select_one(f'#mCSB_3_container > ul > li:nth-child(100) > p').text[:-9]

Read entire data

cursor = mydb.cursor(buffered=True)

sql = "insert into COFFEE_STORE (brand, name, gu_name, address, lat, lng) values (1, %s, %s, %s, %s, %s)"

cnt = 1

for content in tqdm_notebook(seoul_list):
    name = content.get_attribute('data-name')
    address = soup.select_one(f'#mCSB_3_container > ul > li:nth-child({cnt}) > p').text[:-9]
    lat = content.get_attribute('data-lat')
    lng = content.get_attribute('data-long')
    gu_name = address.split()[1] if address else ''

    cnt += 1

    cursor.execute(sql, (name, gu_name, address, lat, lng))
    conn.commit()
    
    driver.close()

Count Records

count_query = 'SELECT COUNT(*) FROM COFFEE_STORE where brand = 1;'
cursor.execute(count_query)

record_count = cursor.fetchone()[0]
record_count

Checking 10 records

check = "select * from COFFEE_STORE where brand = 1 limit 10"
cursor.execute(check)

result = cursor.fetchall()
for i in result:
    print(i)

Problem 5.

When importing data from the Ediya page with Python code, modify it to enter directly into the COFFEE_STORE table.

Collecting Data

driver = webdriver.Chrome() 
driver.get('https://www.ediya.com/contents/find_store.html')
driver.find_element(By.CSS_SELECTOR, '#contentWrap > div.contents > div > div.store_search_pop > ul > li:nth-child(2) > a').click()

sql = "INSERT INTO COFFEE_STORE (brand, name,gu_name, address, lat, lng) VALUES (2, %s, %s, %s, %s, %s)"

for gu in tqdm_notebook(gu_list):

    keyword = driver.find_element(By.CSS_SELECTOR, '#keyword')
    keyword.clear()
    keyword.send_keys(gu)

    driver.find_element(By.CSS_SELECTOR, '#keyword_div > form > button').click()

    time.sleep(1)  

    html = driver.page_source
    soup_ed = BeautifulSoup(html, 'html.parser')
    contents = soup_ed.select('#placesList li')

    for content in contents:
        name = content.select_one('dt').text
        address = content.select_one('dd').text
        gu_name = address.split(' ')[1]

        print(f'{name}--{address}--{gu_name}')

        cursor.execute(sql,(name, gu_name, address, lat, lng))
        conn.commit()
        
        driver.close()

Recoords count

count_query = 'SELECT COUNT(*) FROM COFFEE_STORE where brand = 2;'
cursor.execute(count_query)

record_count = cursor.fetchone()[0]
record_count

Checking 10 records

cursor.execute("select * from COFFEE_STORE where brand = 2 limit 10;")
result = cursor.fetchall()
for row in result:
    print(row)

Checking

Main distribution areas of Starbucks stores (names of top 5 districts with the most stores, output of number of stores)

strb = "select s.gu_name, count(s.brand) from COFFEE_BRAND as b, COFFEE_STORE as s where b.id = s.brand and b.name='STARBUCKS' group by s.gu_name order by count(s.brand) desc limit 5"

cursor.execute(strb)
result = cursor.fetchall()

for row in result:
    print(row)

Ediya store main distribution area (name of top 5 districts with most stores, output number of stores)

edy = "select s.gu_name, count(s.brand) from COFFEE_BRAND as b, COFFEE_STORE as s where b.id = s.brand and b.name='EDIYA' group by s.gu_name order by count(s.brand) desc limit 5"

cursor.execute(edy)
result = cursor.fetchall()

for row in result:
    print(row)

Search the number of stores for each distinct brand (output old name, brand name, number of stores)

  gu_each_st = "select gu_name, '스타벅스' as brand, count(brand) as count from COFFEE_STORE where brand = 1 group by gu_name"

  cursor.execute(gu_each_st)
  result = cursor.fetchall()

  for row in result:
      print(row)

  gu_each_ed = "select gu_name, '이디야' as brand, count(brand) as count from COFFEE_STORE where brand = 2 group by gu_name"

  cursor.execute(gu_each_ed)
  result = cursor.fetchall()

  for row in result:
      print(row)

Check the number of stores for each brand (output old name, number of Starbucks stores, number of Ediya stores)

count = ("select s.gu_name, "
         "sum(s.brand=1) as count1, "
         "sum(s.brand=2) as count2 "
         "from COFFEE_BRAND b, COFFEE_STORE s " 
         "where b.id = s.brand " 
         "group by s.gu_name " 
         "order by s.gu_name;")

cursor.execute(count)
result = cursor.fetchall()

for row in result:
    gu_name, count1, count2 = row
    print(f'({gu_name}, {count1}, {count2})')

Problem 6.

Save as CSV file. (Working with Python code)

final = ("select A.*, B.* from (select * from COFFEE_STORE where brand=1) as A join (select * from COFFEE_STORE where brand=2) as B on B.gu_name = A.gu_name")

cursor.execute(final)
result = cursor.fetchall()

df = pd.DataFrame(result)
df.columns = ['s_id', 's_brand', 's_name', 's_gu', 's_address', 's_lat', 's_lng', 
              'e_id', 'e_brand', 'e_name', 'e_gu', 'e_address', 'e_lat', 'e_lng']

df.to_csv('./starbucks_ediya.csv', index = False, encoding = "euc-kr")

Full of adventure

이전 포스트

Summer Olympics data analysis

다음 포스트

Coffee shops analysis (MySQL)

EDA (Exploratory Data Analysis)

Problem 1.

Problem 2.

Problem 3.

Problem 4.

Problem 5.

Problem 6.

Summer Olympics data analysis

Internet usage by country analysis

0개의 댓글