해당 내용은 Udemy 강의 'Pandas 및 Python을 이용한 데이터 분석:마스터 클래스'를 수강 후 정리한 내용입니다.
IMPORT AND EXPLORE DATASET
# Data contains transactions details between 01/12/2010 and 09/12/2011 for a UK-based non-store online retail.
# The company specializes in selling unique gifts
# Convert Invoice date to datetime format
sales_df['InvoiceDate'] = pd.to_datetime(sales_df['InvoiceDate'])
# Check the number of Null values in the data
sales_df.isnull().sum()
MINI CHALLENGE #1:
# 항목 출력
sales_df['Country'].unique()
# 항목들의 합 출력
sales_df['Country'].nunique()
GROUPBY
# A groupby operation involves some combination of splitting the object, applying a function, and combining the results.
# This can be used to group large amounts of data and compute operations on these groups.
pandas.DataFrame.groupby — pandas 2.2.3 documentation
sales_df.groupby('Country')['UnitPrice'].mean()
sales_df.groupby('Country')['UnitPrice'].max()
sales_df.groupby('Country')['UnitPrice'].min()
두개의 열을 groupby도 가능
sales_df.groupby(['Country', 'InvoiceDate'])['UnitPrice'].mean()
MINI CHALLENGE #2:
# 2010-12-01 08:34:00의 평균 가격
sales_df.groupby('InvoiceDate')['UnitPrice'].mean()['2010-12-01 08:34:00']
# 2010-12-01 08:34:00의 최대 가격
sales_df.groupby('InvoiceDate')['UnitPrice'].max()['2010-12-01 08:34:00']
# 2010-12-01 08:34:00의 최소 가격
sales_df.groupby('InvoiceDate')['UnitPrice'].min()['2010-12-01 08:34:00']
CREATE MULTI-INDEX DATAFRAME
# You can select any column to be the index for the DataFrame
# Use one column only as follows:
sales_df.set_index(keys=['InvoiceDate'], inplace=True)
sales_df
# We can have multiple keys (indexes) using Pandas Multi-indexing
# Take the columns with the least number of unique values and use it for the outermost index
sales_df = pd.read_csv('ecommerce_sales.csv', encoding='unicode_escape')
sales_df.set_index(keys=['Country', 'InvoiceDate'], inplace=True)
sales_df
# Sort countries to start with alphabetical order
sales_df.sort_index(inplace=True)
sales_df
MINI CHALLENGE #3:
sales_df.sort_index(inplace=True, ascending=False)
sales_df
4. MULTI-INDEXING OPERATIONS - PART #1
# Obtain the first level of indexing (countries)
sales_df.index.get_level_values(0)
# Alternatively, you can select the name of the column/index
sales_df.index.get_level_values('Country')
# Obtain the second level of indexing (Dates)
sales_df.index.get_level_values(1)
# Same here, you can select the name of the coulumn/index
sales_df.index.get_level_values('InvoiceDate')
# You can change the names of the DataFrame by invoking the set_names method
sales_df.index.set_names(names=['Transaction Date', 'Transaction Location'], inplace=True)
sales_df
MINI CHALLENGE #4:
sales_df = pd.read_csv('ecommerce_sales.csv', encoding='unicode_escape')
sales_df.set_index(keys=['InvoiceDate', 'Country'], inplace=True)
sales_df
sales_df.index.set_names(names=['Date', 'location'], inplace=True)
sales_df
5. MULTI-INDEXING OPERATIONS - PART #2
sales_df = pd.read_csv('ecommerce_sales.csv', encoding='unicode_escape')
sales_df.set_index(keys=['Country', 'InvoiceDate'], inplace=True)
# Sort countries to start with alphabetical order
sales_df.sort_index(inplace=True)
sales_df
# you can use a multi-index reference to access specific elements
# Alternatively, you can use a column name instead
sales_df.loc['Australia', '1/11/2011 9:47']
# feed index as a tuple (important to avoid confusion)
# first argument references rows and the second argument references a column
sales_df.loc[('Australia', '1/11/2011 9:47'), 'UnitPrice']
# You can use transpose to Transpose indexes and columns
# reflect the DataFrame over its main diagonal by writing rows as columns and vice-versa.
sales_df = sales_df.transpose()
sales_df.head(10)
sales_df.loc['UnitPrice', ('Australia', '1/10/2011 9:58'):('Bellgium', '1/10/2011 9:58')]
# You can perform swaplevel as follows:
sales_df = sales_df.swaplevel()
sales_df
(Country, InvoiceDate) → (InvoiceDate, Country)
MINI CHALLENGE #5:
sales_df.loc[('United Kingdom', '12/1/2010 8:26'), 'UnitPrice']
sales_df.loc[('United Kingdom', '12/1/2010 8:26'), 'UnitPrice'].mean()