<수업 내용>
import pandas as pd
customer_master = pd.read_csv('https://raw.githubusercontent.com/jin0choi1216/dataset/main/table_data_anlaysis/shopping_mall/customer_master.csv')
item_master = pd.read_csv('https://raw.githubusercontent.com/jin0choi1216/dataset/main/table_data_anlaysis/shopping_mall/item_master.csv')
transaction_1 =pd.read_csv('https://raw.githubusercontent.com/jin0choi1216/dataset/main/table_data_anlaysis/shopping_mall/transaction_1.csv')
transaction_2 =pd.read_csv('https://raw.githubusercontent.com/jin0choi1216/dataset/main/table_data_anlaysis/shopping_mall/transaction_2.csv')
transaction_detail_1 =pd.read_csv('https://raw.githubusercontent.com/jin0choi1216/dataset/main/table_data_anlaysis/shopping_mall/transaction_detail_1.csv')
transaction_detail_2 =pd.read_csv('https://raw.githubusercontent.com/jin0choi1216/dataset/main/table_data_anlaysis/shopping_mall/transaction_detail_2.csv')
customer_master.head()
item_master.head()
transaction_1.tail()
transaction_2.head()
transaction_detail_1.head()
데이터 결합
transaction=pd.concat([transaction_1,transaction_2], ignore_index=True)
transaction
transaction_detail=pd.concat([transaction_detail_1,transaction_detail_2], ignore_index=True)
transaction_detail
merge1=pd.merge(left=transaction_detail, right=transaction, on= 'transaction_id', how='left')
merge1
- 위의 merge결합의 문제점 : quantity와 price열에 오해가 있을 수 있다. price열을 drop하는 것이 오해를 없앨 수 있다. 하단 이미지 참고
merge1=pd.merge(left=transaction_detail, right=transaction.drop('price',axis=1), on= 'transaction_id', how='left')
merge1
- 위에서 생성한 merge1데이터 프레임과 customer_master,item_master를 merge하기
merge2=pd.merge(left=customer_master,right=merge1,on='customer_id', how='left')
merge3=pd.merge(left=merge2,right=item_master.drop("item_price",axis=1),on='item_id', how='left')
merge3
merge3['price']=merge3['quantity']*merge3['item_price']
merge3
print(merge3['price'].sum())
print(transaction['price'].sum(0))
merge3['birth']=pd.to_datetime(merge3['birth'])
merge3['payment_date']=pd.to_datetime(merge3['payment_date'])
merge3['registration_date']=pd.to_datetime(merge3['registration_date'])
merge3.info()
실습
- 성별에 따른 매출 총합을 비교하고 어떤 성별이 매출이 더 높은지 구해보세요.
merge4=merge3[['gender','price']].groupby('gender').sum()
merge4
- 태어난 년도(birth) 별 매출을 구해보세요.
merge3['year']=merge3['birth'].apply(lambda x : x.strftime("%Y"))
merge5=merge3[['year','price']].groupby('year').sum()
merge5
- merge3['year']=merge3['birth'].dt.year활용이 더 간단하다
- payment_date칼럼을 활용하여 구매년월(payment_month) 칼럼을 생성하고 구매년월, 아이템 이름에 따른 매출 총합과 수량 총합을 구해보세요.
merge3.dropna(axis=0,inplace= True)
merge3['payment_month']=merge3['payment_date'].apply(lambda x : x.strftime("%Y%m"))
merge6=merge3[['payment_month','item_name','price','quantity']].groupby(['payment_month','item_name']).sum()
merge6
- merge3['payment_month']=merge3['payment_date'].dt.strftime('%Y%m')활용 가능
데이터 시각화
import matplotlib.pyplot as plt
import numpy as np
x= np.arange(100,300,10)
y= np.random.randn(20)
plt.plot(x,y) 입력하세요
plt.figure(figsize=(15,5))
plt.plot(x,y)
plt.plot(x,y)
plt.xlabel('X-axis')
plt.ylabel('Y-axis')
plt.title('Title')
plt.plot(x,y,color='r',linestyle='--',marker='s')
여러개의 그래프 그리기
fig=plt.figure(figsize=(16,10))
axes1=fig.add_subplot(2,2,1)
axes1.plot(x,y,color='r',linestyle='--',marker='s')
axes2=fig.add_subplot(2,2,2)
axes2.hist(y)
axes3=fig.add_subplot(2,2,3)
axes3.scatter(x,y)
fig,axes=plt.subplots(2,2,figsize=(15,10))
axes[0,0].plot(x,y)
axes[0,1].hist(y)
axes[1,0].scatter(x,y)
axes[1,1].plot(x,y,'r--s')
x1=np.arange(100,300, 10)
y1=np.random.randn(20)
x2=np.arange(200,400,10)
y2=np.random.randn(20)
plt.plot(x1,y1,label="x1")
plt.plot(x2,y2,label="x2")
plt.legend(loc=1)
여러 그래프의 label, title 다루기
fig=plt.figure(figsize=(16,10))
axes1=fig.add_subplot(2,2,1)
axes1.plot(x,y,color='r',linestyle='--',marker='s')
axes1.set_title('axes1_title')
axes2=fig.add_subplot(2,2,2)
axes2.hist(y)
axes2.set_title('axes2_title')
axes3=fig.add_subplot(2,2,3)
axes3.scatter(x,y)
axes3.set_title('axes3_title')
fig.suptitle('Figure Title')