Brazilian E-Commerce Public Dataset by Olist
from kaggleolist_2022
geolocation.csv
와 customers.csv
zipcode_prefix
와 city
, state
가 같음payment installments
import pandas as pd
# 파일 불러오기
product_df = pd.read_csv('olist_products_dataset.csv', encoding='utf-8')
translation_df = pd.read_csv('product_category_name_translation.csv', encoding='utf-8')
# 특정 열 가져오기
product_series = product_df['product_category_name']
name_series = translation_df['product_category_name']
eng_series = translation_df['product_category_name_english']
# 전처리하기 위해 list로 타입 변경하기
product_list = product_series.tolist()
name_list = name_series.tolist()
eng_list = eng_series.tolist()
# 없는 값 확인하기
non_list = []
for product in product_list:
if product not in name_list:
non_list.append(product_list.index(product))
set(non_list) # 중복 제외 # 105, 1628, 5821
# 대응되는 인덱스만 추출해서 변환하기
for product in product_list:
if product in name_list:
product_list[product_list.index(product)] = eng_list[name_list.index(product)]
# 변환한 값 적용하기
product_series = pd.Series(product_list)
product_df['product_category_name'] = product_series
# 변환한 dataframe 저장하기
product_df.to_csv("olist_products_dataset.csv", mode='w', index=False)
comment_title
과 comment_message
열에 NULL 값이 많다# review 테이블 가져오기
review_df = pd.read_csv('olist_order_reviews_dataset.csv', encoding='utf-8')
# comment_title 및 comment_message 열 제거
review_df = review_df.drop(['review_comment_title','review_comment_message'], axis=1)
# review 갱신하기
review_df.to_csv("olist_order_reviews_dataset.csv", mode='w', index=False)
orders
테이블 timestamp
컬럼 타입 변환# order 테이블 불러오기
order_df = pd.read_csv('olist_orders_dataset.csv', encoding='utf-8')
# 컬럼 type 확인하기
purchase_time = order_df['order_purchase_timestamp']
type(purchase_time[0]) # str
# datetime으로 변환하기
from datetime import datetime
date_list = []
for i in range(len(purchase_time)):
date = purchase_time[i].split(' ')
date_list.append(datetime.strptime(date[0], "%Y-%m-%d"))
purchase_time = pd.Series(date_list)
# 바뀐 데이터 적용하기
order_df['order_purchase_timestamp'] = purchase_time
timestamp
가 null인 컬럼들.fillna
적용 안되는 문제inplace=True
옵션 추가# 결측치 개수 확인
order_df.isnull().sum()
order_id 0
customer_id 0
order_status 0
order_purchase_timestamp 0
order_approved_at 160
order_delivered_carrier_date 1783
order_delivered_customer_date 2965
order_estimated_delivery_date 0
dtype: int64
# 결측치 대체
order_df.fillna('2000-01-01 00:00:00', inplace=True)
order_df.isnull().sum()
order_id 0
customer_id 0
order_status 0
order_purchase_timestamp 0
order_approved_at 0
order_delivered_carrier_date 0
order_delivered_customer_date 0
order_estimated_delivery_date 0
dtype: int64
# 결측치 복원
order_df.replace('2000-01-01', '', inplace=True)
주제 선정
“How fast each order gets delivered to customers”
orders
order_item
: [product_id], shipping_limit_dateproduct
: [product_id], product_category_namereviews
: [order_id], review_scorecustomers
: [customer_id], ...zip_code_prefix, ...city, ...stategeolocation
: ...zip_code_prefix, ...city, ...statesellers
: ...zip_code_prefix, ...city, ...state# customers
customer_df.shape
(99441, 5)
# geo location
geo_df.shape
(1000163, 5)
# order items
order_item_df.shape
(112650, 7)
# order payments
order_payment_df.shape
(103886, 5)
# order reviews
order_review_df.shape
(99224, 7)
# order
order_df.shape
(99441, 8)
order_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99441 entries, 0 to 99440
Data columns (total 8 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 order_id 99441 non-null object
1 customer_id 99441 non-null object
2 order_status 99441 non-null object
3 order_purchase_timestamp 99441 non-null object
4 order_approved_at 99281 non-null object ***
5 order_delivered_carrier_date 97658 non-null object ***
6 order_delivered_customer_date 96476 non-null object ***
7 order_estimated_delivery_date 99441 non-null object
dtypes: object(8)
memory usage: 6.1+ MB
# order products
product_df.shape
(32951, 9)
product_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32951 entries, 0 to 32950
Data columns (total 9 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 product_id 32951 non-null object
1 product_category_name 32341 non-null object ***
2 product_name_lenght 32341 non-null float64 ***
3 product_description_lenght 32341 non-null float64 ***
4 product_photos_qty 32341 non-null float64 ***
5 product_weight_g 32949 non-null float64 ***
6 product_length_cm 32949 non-null float64 ***
7 product_height_cm 32949 non-null float64 ***
8 product_width_cm 32949 non-null float64 ***
dtypes: float64(7), object(2)
memory usage: 2.3+ MB
# sellers
seller_df.shape
(3095, 4)