[!Important]+ Goals
일반사단법인 데이터사이언스 협회가 GitHub 에 공개한 " 데이터 사이언스 100 개 노크 (구조화 데이터 가공편)" 의 문제연습을 하는 코스입니다.
이 코스에서는 애매한 조건, 소트, 집계, 샘플링 등에 대해 배울 수 있습니다.
비즈니스 현장에서의 분석 실무에서는 데이터의 대부분이 구조화 데이터입니다. 구조화 데이터를 자유롭게 가공, 집계하는 스킬을 닦아, 데이터 사이언스의 실전력을 올립시다.
[!abstract]+ Curriculum
1. 애매한 조건
2. 소트
3. 집계
4. 서브쿼리
5. 샘플링
6. 뺄셈 에러 대응
7. name identification
8. 데이터 분할
#pd/query/str/startswith
import pandas as pd
# データをcsvファイルから読み込みます。書き換える必要はありません
df_customer = pd.read_csv('./100knocks-preprocess/customer.csv')
df_category = pd.read_csv('./100knocks-preprocess/category.csv')
df_product = pd.read_csv('./100knocks-preprocess/product.csv')
df_receipt =pd.read_csv('./100knocks-preprocess/receipt.csv')
df_store = pd.read_csv('./100knocks-preprocess/store.csv')
df_geocode = pd.read_csv('./100knocks-preprocess/geocode.csv')
print(df_store.query("store_cd.str.startswith('S14')", engine='python').head(10))
#regular_expression #regex #pd/query/str/contains
임의의 한 글자 | . |
---|---|
문자열 선두 | \^ |
문자열 끝 | $ |
동일문자 반복 | * + \? |
범위지정 | - |
대괄호에 포함되는 어떤 한 문자에 매치 | [] |
대괄호에 포함되는 문자 이외에 매치 | [\^] |
import pandas as pd
# データをcsvファイルから読み込みます。書き換える必要はありません
df_customer = pd.read_csv('./100knocks-preprocess/customer.csv')
df_category = pd.read_csv('./100knocks-preprocess/category.csv')
df_product = pd.read_csv('./100knocks-preprocess/product.csv')
df_receipt =pd.read_csv('./100knocks-preprocess/receipt.csv')
df_store = pd.read_csv('./100knocks-preprocess/store.csv')
df_geocode = pd.read_csv('./100knocks-preprocess/geocode.csv')
print(df_customer.query("status_cd.str.contains('^[A-F]', regex=True)", engine='python').head(10))
#pd/query/str/endswith
import pandas as pd
# データをcsvファイルから読み込みます。書き換える必要はありません
df_customer = pd.read_csv('./100knocks-preprocess/customer.csv')
df_category = pd.read_csv('./100knocks-preprocess/category.csv')
df_product = pd.read_csv('./100knocks-preprocess/product.csv')
df_receipt =pd.read_csv('./100knocks-preprocess/receipt.csv')
df_store = pd.read_csv('./100knocks-preprocess/store.csv')
df_geocode = pd.read_csv('./100knocks-preprocess/geocode.csv')
print(df_customer.query("customer_id.str.endswith('1')", engine='python').head(10))
#pd/query/str/contains #regex #regular_expression
import pandas as pd
# データをcsvファイルから読み込みます。書き換える必要はありません
df_customer = pd.read_csv('./100knocks-preprocess/customer.csv')
df_category = pd.read_csv('./100knocks-preprocess/category.csv')
df_product = pd.read_csv('./100knocks-preprocess/product.csv')
df_receipt =pd.read_csv('./100knocks-preprocess/receipt.csv')
df_store = pd.read_csv('./100knocks-preprocess/store.csv')
df_geocode = pd.read_csv('./100knocks-preprocess/geocode.csv')
print(df_customer.query("status_cd.str.contains('[1-9]$', regex=True)", engine='python').head(10))
#pd/query/str/contains
import pandas as pd
# データをcsvファイルから読み込みます。書き換える必要はありません
df_customer = pd.read_csv('./100knocks-preprocess/customer.csv')
df_category = pd.read_csv('./100knocks-preprocess/category.csv')
df_product = pd.read_csv('./100knocks-preprocess/product.csv')
df_receipt =pd.read_csv('./100knocks-preprocess/receipt.csv')
df_store = pd.read_csv('./100knocks-preprocess/store.csv')
df_geocode = pd.read_csv('./100knocks-preprocess/geocode.csv')
print(df_store.query("address.str.contains('横浜市')", engine='python'))
#pd/query/str/contains
import pandas as pd
# データをcsvファイルから読み込みます。書き換える必要はありません
df_customer = pd.read_csv('./100knocks-preprocess/customer.csv')
df_category = pd.read_csv('./100knocks-preprocess/category.csv')
df_product = pd.read_csv('./100knocks-preprocess/product.csv')
df_receipt =pd.read_csv('./100knocks-preprocess/receipt.csv')
df_store = pd.read_csv('./100knocks-preprocess/store.csv')
df_geocode = pd.read_csv('./100knocks-preprocess/geocode.csv')
print(df_customer.query("status_cd.str.contains('^[A-F].*[1-9]$', regex=True)", engine='python').head(10))
#pd/sort
import pandas as pd
# データをcsvファイルから読み込みます。書き換える必要はありません
df_customer = pd.read_csv('./100knocks-preprocess/customer.csv')
df_category = pd.read_csv('./100knocks-preprocess/category.csv')
df_product = pd.read_csv('./100knocks-preprocess/product.csv')
df_receipt =pd.read_csv('./100knocks-preprocess/receipt.csv')
df_store = pd.read_csv('./100knocks-preprocess/store.csv')
df_geocode = pd.read_csv('./100knocks-preprocess/geocode.csv')
print(df_customer.sort_values('birth_day', ascending=True).head(10))
#pd/sort
import pandas as pd
# データをcsvファイルから読み込みます。書き換える必要はありません
df_customer = pd.read_csv('./100knocks-preprocess/customer.csv')
df_category = pd.read_csv('./100knocks-preprocess/category.csv')
df_product = pd.read_csv('./100knocks-preprocess/product.csv')
df_receipt =pd.read_csv('./100knocks-preprocess/receipt.csv')
df_store = pd.read_csv('./100knocks-preprocess/store.csv')
df_geocode = pd.read_csv('./100knocks-preprocess/geocode.csv')
print(df_customer.sort_values('birth_day', ascending=False).head(10))
#pd/rank #pd/concat #pd/sort
import pandas as pd
# データをcsvファイルから読み込みます。書き換える必要はありません
df_customer = pd.read_csv('./100knocks-preprocess/customer.csv')
df_category = pd.read_csv('./100knocks-preprocess/category.csv')
df_product = pd.read_csv('./100knocks-preprocess/product.csv')
df_receipt =pd.read_csv('./100knocks-preprocess/receipt.csv')
df_store = pd.read_csv('./100knocks-preprocess/store.csv')
df_geocode = pd.read_csv('./100knocks-preprocess/geocode.csv')
# "df_tmp"に処理後のデータを代入してください
df_tmp = pd.concat(
[df_receipt[['customer_id', 'amount']]
, df_receipt["amount"].rank(method='min', ascending=False)
]
, axis=1)
# カラム名を指定します。書き換える必要はありません
df_tmp.columns = ['customer_id', 'amount', 'ranking']
print(df_tmp.sort_values('ranking', ascending=True).head(10))
#pd/rank #pd/concat #pd/sort
import pandas as pd
# データをcsvファイルから読み込みます。書き換える必要はありません
df_customer = pd.read_csv('./100knocks-preprocess/customer.csv')
df_category = pd.read_csv('./100knocks-preprocess/category.csv')
df_product = pd.read_csv('./100knocks-preprocess/product.csv')
df_receipt =pd.read_csv('./100knocks-preprocess/receipt.csv')
df_store = pd.read_csv('./100knocks-preprocess/store.csv')
df_geocode = pd.read_csv('./100knocks-preprocess/geocode.csv')
# "df_tmp"に処理後のデータを代入してください
df_tmp = pd.concat(
[df_receipt[['customer_id', 'amount']]
, df_receipt["amount"].rank(method='first', ascending=False)
]
, axis=1)
# カラム名を指定します。書き換える必要はありません
df_tmp.columns = ['customer_id', 'amount', 'ranking']
print(df_tmp.sort_values('ranking', ascending=True).head(10))
#len #pd/unique
import pandas as pd
# データをcsvファイルから読み込みます。書き換える必要はありません
df_customer = pd.read_csv('./100knocks-preprocess/customer.csv')
df_category = pd.read_csv('./100knocks-preprocess/category.csv')
df_product = pd.read_csv('./100knocks-preprocess/product.csv')
df_receipt =pd.read_csv('./100knocks-preprocess/receipt.csv')
df_store = pd.read_csv('./100knocks-preprocess/store.csv')
df_geocode = pd.read_csv('./100knocks-preprocess/geocode.csv')
print(len(pd.unique(df_receipt.customer_id)))
#pd/groupby/agg/sum
import pandas as pd
# データをcsvファイルから読み込みます。書き換える必要はありません
df_customer = pd.read_csv('./100knocks-preprocess/customer.csv')
df_category = pd.read_csv('./100knocks-preprocess/category.csv')
df_product = pd.read_csv('./100knocks-preprocess/product.csv')
df_receipt =pd.read_csv('./100knocks-preprocess/receipt.csv')
df_store = pd.read_csv('./100knocks-preprocess/store.csv')
df_geocode = pd.read_csv('./100knocks-preprocess/geocode.csv')
print(df_receipt.groupby('store_cd').agg({'amount':'sum', 'quantity':'sum'}).reset_index().head())
#pd/groupby/agg/max
import pandas as pd
# データをcsvファイルから読み込みます。書き換える必要はありません
df_customer = pd.read_csv('./100knocks-preprocess/customer.csv')
df_category = pd.read_csv('./100knocks-preprocess/category.csv')
df_product = pd.read_csv('./100knocks-preprocess/product.csv')
df_receipt =pd.read_csv('./100knocks-preprocess/receipt.csv')
df_store = pd.read_csv('./100knocks-preprocess/store.csv')
df_geocode = pd.read_csv('./100knocks-preprocess/geocode.csv')
print(df_receipt.groupby('customer_id').agg({'sales_ymd':'max'}).reset_index().head(10))
#pd/groupby/agg/min
import pandas as pd
# データをcsvファイルから読み込みます。書き換える必要はありません
df_customer = pd.read_csv('./100knocks-preprocess/customer.csv')
df_category = pd.read_csv('./100knocks-preprocess/category.csv')
df_product = pd.read_csv('./100knocks-preprocess/product.csv')
df_receipt =pd.read_csv('./100knocks-preprocess/receipt.csv')
df_store = pd.read_csv('./100knocks-preprocess/store.csv')
df_geocode = pd.read_csv('./100knocks-preprocess/geocode.csv')
print(df_receipt.groupby('customer_id').agg({'sales_ymd':'min'}).reset_index().head(10))
#pd/groupby/agg
import pandas as pd
# データをcsvファイルから読み込みます。書き換える必要はありません
df_customer = pd.read_csv('./100knocks-preprocess/customer.csv')
df_category = pd.read_csv('./100knocks-preprocess/category.csv')
df_product = pd.read_csv('./100knocks-preprocess/product.csv')
df_receipt =pd.read_csv('./100knocks-preprocess/receipt.csv')
df_store = pd.read_csv('./100knocks-preprocess/store.csv')
df_geocode = pd.read_csv('./100knocks-preprocess/geocode.csv')
df_tmp = df_receipt.groupby('customer_id').agg({'sales_ymd':['max','min']}).reset_index()
# カラム名を指定します。書き換える必要はありません
df_tmp.columns = ["_".join(pair) for pair in df_tmp.columns]
print(df_tmp.query('sales_ymd_max != sales_ymd_min').head(10))
#pd/groupby/agg/mean #pd/sort
import pandas as pd
# データをcsvファイルから読み込みます。書き換える必要はありません
df_customer = pd.read_csv('./100knocks-preprocess/customer.csv')
df_category = pd.read_csv('./100knocks-preprocess/category.csv')
df_product = pd.read_csv('./100knocks-preprocess/product.csv')
df_receipt =pd.read_csv('./100knocks-preprocess/receipt.csv')
df_store = pd.read_csv('./100knocks-preprocess/store.csv')
df_geocode = pd.read_csv('./100knocks-preprocess/geocode.csv')
print(df_receipt.groupby("store_cd").agg({"amount":"mean"}).reset_index().sort_values("amount", ascending=False).head())
#pd/groupby/agg/median #pd/sort
import pandas as pd
# データをcsvファイルから読み込みます。書き換える必要はありません
df_customer = pd.read_csv('./100knocks-preprocess/customer.csv')
df_category = pd.read_csv('./100knocks-preprocess/category.csv')
df_product = pd.read_csv('./100knocks-preprocess/product.csv')
df_receipt =pd.read_csv('./100knocks-preprocess/receipt.csv')
df_store = pd.read_csv('./100knocks-preprocess/store.csv')
df_geocode = pd.read_csv('./100knocks-preprocess/geocode.csv')
print(df_receipt.groupby("store_cd").agg({"amount":"median"}).reset_index().sort_values("amount", ascending=False).head())
#pd/groupby/apply #lambda
import pandas as pd
# データをcsvファイルから読み込みます。書き換える必要はありません
df_customer = pd.read_csv('./100knocks-preprocess/customer.csv')
df_category = pd.read_csv('./100knocks-preprocess/category.csv')
df_product = pd.read_csv('./100knocks-preprocess/product.csv')
df_receipt =pd.read_csv('./100knocks-preprocess/receipt.csv')
df_store = pd.read_csv('./100knocks-preprocess/store.csv')
df_geocode = pd.read_csv('./100knocks-preprocess/geocode.csv')
print(df_receipt.groupby('store_cd').product_cd.apply(lambda x: x.mode()).reset_index())
#pd/groupby/var #std_var
import pandas as pd
# データをcsvファイルから読み込みます。書き換える必要はありません
df_customer = pd.read_csv('./100knocks-preprocess/customer.csv')
df_category = pd.read_csv('./100knocks-preprocess/category.csv')
df_product = pd.read_csv('./100knocks-preprocess/product.csv')
df_receipt =pd.read_csv('./100knocks-preprocess/receipt.csv')
df_store = pd.read_csv('./100knocks-preprocess/store.csv')
df_geocode = pd.read_csv('./100knocks-preprocess/geocode.csv')
print(
df_receipt
.groupby("store_cd")
.amount.var(ddof=0)
.reset_index()
.sort_values("amount", ascending=False)
.head()
)
#pd/groupby/std #pd/std_dev
import pandas as pd
# データをcsvファイルから読み込みます。書き換える必要はありません
df_customer = pd.read_csv('./100knocks-preprocess/customer.csv')
df_category = pd.read_csv('./100knocks-preprocess/category.csv')
df_product = pd.read_csv('./100knocks-preprocess/product.csv')
df_receipt =pd.read_csv('./100knocks-preprocess/receipt.csv')
df_store = pd.read_csv('./100knocks-preprocess/store.csv')
df_geocode = pd.read_csv('./100knocks-preprocess/geocode.csv')
print(
df_receipt
.groupby("store_cd").amount
.std(ddof=0)
.reset_index()
.sort_values("amount", ascending=False)
.head()
)
#np/percentile
import pandas as pd
import numpy as np
# データをcsvファイルから読み込みます。書き換える必要はありません
df_customer = pd.read_csv('./100knocks-preprocess/customer.csv')
df_category = pd.read_csv('./100knocks-preprocess/category.csv')
df_product = pd.read_csv('./100knocks-preprocess/product.csv')
df_receipt =pd.read_csv('./100knocks-preprocess/receipt.csv')
df_store = pd.read_csv('./100knocks-preprocess/store.csv')
df_geocode = pd.read_csv('./100knocks-preprocess/geocode.csv')
print(
np.percentile(df_receipt.amount, [25,50,75,100])
)
#pd/query #pd/groupby/agg/mean
import pandas as pd
# データをcsvファイルから読み込みます。書き換える必要はありません
df_customer = pd.read_csv('./100knocks-preprocess/customer.csv')
df_category = pd.read_csv('./100knocks-preprocess/category.csv')
df_product = pd.read_csv('./100knocks-preprocess/product.csv')
df_receipt =pd.read_csv('./100knocks-preprocess/receipt.csv')
df_store = pd.read_csv('./100knocks-preprocess/store.csv')
df_geocode = pd.read_csv('./100knocks-preprocess/geocode.csv')
print(
df_receipt.groupby("store_cd")
.agg({"amount":'mean'})
.reset_index()
.query("amount >= 330")
)
#pd/query
import pandas as pd
# データをcsvファイルから読み込みます。書き換える必要はありません
df_customer = pd.read_csv('./100knocks-preprocess/customer.csv')
df_category = pd.read_csv('./100knocks-preprocess/category.csv')
df_product = pd.read_csv('./100knocks-preprocess/product.csv')
df_receipt =pd.read_csv('./100knocks-preprocess/receipt.csv')
df_store = pd.read_csv('./100knocks-preprocess/store.csv')
df_geocode = pd.read_csv('./100knocks-preprocess/geocode.csv')
print(
df_receipt
.query('not customer_id.str.startswith("Z")', engine='python')
.groupby("customer_id")
.amount
.sum()
.mean()
)
#pd/query
import pandas as pd
# データをcsvファイルから読み込みます。書き換える必要はありません
df_customer = pd.read_csv('./100knocks-preprocess/customer.csv')
df_category = pd.read_csv('./100knocks-preprocess/category.csv')
df_product = pd.read_csv('./100knocks-preprocess/product.csv')
df_receipt =pd.read_csv('./100knocks-preprocess/receipt.csv')
df_store = pd.read_csv('./100knocks-preprocess/store.csv')
df_geocode = pd.read_csv('./100knocks-preprocess/geocode.csv')
amount_mean = df_receipt.query('not customer_id.str.startswith("Z")', engine='python').groupby("customer_id").amount.sum().mean()
amount_sum = df_receipt.query('not customer_id.str.startswith("Z")', engine='python').groupby("customer_id").amount.sum().reset_index()
print(amount_sum[amount_sum.amount >= amount_mean].head(10))
#df/sample
import pandas as pd
# データをcsvファイルから読み込みます。書き換える必要はありません
df_customer = pd.read_csv('./100knocks-preprocess/customer.csv')
df_category = pd.read_csv('./100knocks-preprocess/category.csv')
df_product = pd.read_csv('./100knocks-preprocess/product.csv')
df_receipt =pd.read_csv('./100knocks-preprocess/receipt.csv')
df_store = pd.read_csv('./100knocks-preprocess/store.csv')
df_geocode = pd.read_csv('./100knocks-preprocess/geocode.csv')
# random_stateを42としてランダムに1%のデータを抽出し, "df_sampleに代入してください"
df_sample = df_customer.sample(frac=0.01, random_state=42)
print(df_sample.head(10))
#sample/stratify
import pandas as pd
from sklearn.model_selection import train_test_split
# データをcsvファイルから読み込みます。書き換える必要はありません
df_customer = pd.read_csv('./100knocks-preprocess/customer.csv')
df_category = pd.read_csv('./100knocks-preprocess/category.csv')
df_product = pd.read_csv('./100knocks-preprocess/product.csv')
df_receipt =pd.read_csv('./100knocks-preprocess/receipt.csv')
df_store = pd.read_csv('./100knocks-preprocess/store.csv')
df_geocode = pd.read_csv('./100knocks-preprocess/geocode.csv')
# sklearn.model_selection.train_test_splitを使用して層化を行ってください
_, sample_df_customer = train_test_split(df_customer, test_size=0.1, stratify =df_customer.gender_cd, random_state=42)
print(sample_df_customer.groupby("gender_cd").agg({'customer_id' : 'count'}))
#pd/merge #fillna
import pandas as pd
# データをcsvファイルから読み込みます。書き換える必要はありません
df_customer = pd.read_csv('./100knocks-preprocess/customer.csv')
df_category = pd.read_csv('./100knocks-preprocess/category.csv')
df_product = pd.read_csv('./100knocks-preprocess/product.csv')
df_receipt =pd.read_csv('./100knocks-preprocess/receipt.csv')
df_store = pd.read_csv('./100knocks-preprocess/store.csv')
df_geocode = pd.read_csv('./100knocks-preprocess/geocode.csv')
# 1.レシート明細データフレーム(df_receipt)からqueryメソッドにて該当の期間のデータを抽出する
df_tmp_1 = df_receipt.query('20190101 <= sales_ymd <= 20191231')
# 2. "1"で抽出したデータを顧客データフレーム(df_customer)に結合する
df_tmp_1 = pd.merge(df_customer['customer_id'], df_tmp_1[['customer_id', 'amount']], how='left', on='customer_id'). \
groupby('customer_id').sum().reset_index().rename(columns={'amount':'amount_2019'})
# 3. レシート明細データフレーム(df_receipt)を顧客データフレーム(df_customer)に結合する
df_tmp_2 = pd.merge(df_customer['customer_id'], df_receipt[['customer_id', 'amount']], how='left', on='customer_id'). \
groupby('customer_id').sum().reset_index()
# 4. "2"と"3"で得たデータを内部結合する
df_tmp = pd.merge(df_tmp_1, df_tmp_2, how='inner', on='customer_id')
# 5. "4"の結合時に生じた欠損値を補完する
df_tmp['amount_2019'] = df_tmp['amount_2019'].fillna(0)
df_tmp['amount'] = df_tmp['amount'].fillna(0)
# 6. 2019の売り上げ金額 / 全期間の売上金額を行い割合をデータフレームに追加する
df_tmp['amount_rate'] = df_tmp['amount_2019'] / df_tmp['amount']
# 7. "6"で生じた欠損値を補完する
df_tmp['amount_rate'] = df_tmp['amount_rate'].fillna(0)
# 8. queryメソッドにて条件に基づいて取得する
print( df_tmp.query('amount_rate > 0').head(10) )
#pd/merge #pd/sort #pd/drop_duplicates
import pandas as pd
# データをcsvファイルから読み込みます。書き換える必要はありません
df_customer = pd.read_csv('./100knocks-preprocess/customer.csv')
df_category = pd.read_csv('./100knocks-preprocess/category.csv')
df_product = pd.read_csv('./100knocks-preprocess/product.csv')
df_receipt =pd.read_csv('./100knocks-preprocess/receipt.csv')
df_store = pd.read_csv('./100knocks-preprocess/store.csv')
df_geocode = pd.read_csv('./100knocks-preprocess/geocode.csv')
# 顧客ごとの売上金額合計を算出する
df_tmp = df_receipt.groupby('customer_id').agg({'amount':'sum'}).reset_index()
# 顧客データフレーム(df_customer)に売上金額合計を追加し、売上金額合計、顧客IDでソートする
df_customer_u = pd.merge(df_customer, df_tmp, how='left', on='customer_id').sort_values(['amount', 'customer_id'], ascending=[False, True])
# 同一顧客に対しては売上金額合計が最も高いものを残すように削除する
df_customer_u.drop_duplicates(subset=['customer_name', 'postal_cd'], keep='first', inplace=True)
print('減少数: ', len(df_customer) - len(df_customer_u))
import pandas as pd
# データをcsvファイルから読み込みます。書き換える必要はありません
df_customer = pd.read_csv('./100knocks-preprocess/customer.csv')
df_category = pd.read_csv('./100knocks-preprocess/category.csv')
df_product = pd.read_csv('./100knocks-preprocess/product.csv')
df_receipt =pd.read_csv('./100knocks-preprocess/receipt.csv')
df_store = pd.read_csv('./100knocks-preprocess/store.csv')
df_geocode = pd.read_csv('./100knocks-preprocess/geocode.csv')
df_customer_u = pd.read_csv('./100knocks-preprocess/87.csv', index_col=0)
# 顧客データフレーム(df_customer)と名寄顧客データフレーム(df_customer_u)を内部結合する
df_customer_n = pd.merge(df_customer, df_customer_u[['customer_name', 'postal_cd', 'customer_id']],
how='inner', on =['customer_name', 'postal_cd'])
# カラム名を変更する
df_customer_n.rename(columns={'customer_id_x':'customer_id', 'customer_id_y':'integration_id'}, inplace=True)
print('ID数の差', len(df_customer_n['customer_id'].unique()) - len(df_customer_n['integration_id'].unique()))
import pandas as pd
from sklearn.model_selection import train_test_split
# データをcsvファイルから読み込みます。書き換える必要はありません
df_customer = pd.read_csv('./100knocks-preprocess/customer.csv')
df_category = pd.read_csv('./100knocks-preprocess/category.csv')
df_product = pd.read_csv('./100knocks-preprocess/product.csv')
df_receipt =pd.read_csv('./100knocks-preprocess/receipt.csv')
df_store = pd.read_csv('./100knocks-preprocess/store.csv')
df_geocode = pd.read_csv('./100knocks-preprocess/geocode.csv')
# 顧客ごとの売上金額合計を算出します
df_sales= df_receipt.groupby('customer_id').agg({'amount':'sum'}).reset_index()
# df_salesにある顧客のみを抽出します
df_tmp = pd.merge(df_customer, df_sales['customer_id'], how='inner', on='customer_id')
# 8:2の割合でランダムにデータを分割します
df_train, df_test = train_test_split(df_tmp, test_size=0.2, random_state=71)
print('訓練データ割合: ', len(df_train) / len(df_tmp))
print('検証データ割合: ', len(df_test) / len(df_tmp))
import pandas as pd
# データをcsvファイルから読み込みます。書き換える必要はありません
df_customer = pd.read_csv('./100knocks-preprocess/customer.csv')
df_category = pd.read_csv('./100knocks-preprocess/category.csv')
df_product = pd.read_csv('./100knocks-preprocess/product.csv')
df_receipt =pd.read_csv('./100knocks-preprocess/receipt.csv')
df_store = pd.read_csv('./100knocks-preprocess/store.csv')
df_geocode = pd.read_csv('./100knocks-preprocess/geocode.csv')
df_tmp = df_receipt[['sales_ymd', 'amount']].copy()
# 西暦と月のみにし、"sales_ym"に代入します
df_tmp['sales_ym'] = df_tmp['sales_ymd'].astype('str').str[0:6]
# 月毎の"amount"を算出します
df_tmp = df_tmp.groupby('sales_ym').agg({'amount':'sum'}).reset_index()
# 「train_size, test_size」はデータの長さ, 「slide_window,start_point」はtrainデータの始まりを決定するのに使用します
def split_data(df, train_size, test_size, slide_window, start_point):
train_start = start_point * slide_window
test_start = train_start + train_size
return df[train_start : test_start], df[test_start : test_start + test_size]
df_train_1, df_test_1 = split_data(df_tmp, train_size=12, test_size=6, slide_window=6, start_point=0)
df_train_2, df_test_2 = split_data(df_tmp, train_size=12, test_size=6, slide_window=6, start_point=1)
df_train_3, df_test_3 = split_data(df_tmp, train_size=12, test_size=6, slide_window=6, start_point=2)
print(df_train_3)