Data original source
Target Data(CSV): Information about Summer Olympic medalists (1976–2008) medalists
Source: Kaggle
DownLoad: archive.zip
1-1 Import Target Data
import pandas as pd
df_target = pd.read_csv('../data/Summer-Olympic-medals-1976-to-2008.csv', encoding='latin1') # latin1 or cp1252
df_target.head()

1-2 Preprocessing : missing data
Check basic data information (make sure there is no missing data)
df_target.info()

Check for null values
df_target.isnull().sum()

Data cleansing
df_target.dropna(inplace=True)
df_target.isnull().sum()

df_target.reset_index(inplace=True)
df_target.drop(columns='index', inplace=True)
1-3 Preprocessing: Organizing Data Type
Condition 1: Float data to int.
Check data type

Data type conversion
df_target['Year'] = df_target['Year'].astype(int)
Checking

2-1 Find the 2008 Korea Medalist
Condition 1: Create a DataFrame containing only the gold medalists in archery at the 2008 Beijing Olympics.
import copy
df_copy = copy.deepcopy(df_clean)
df_archery = df_copy
df_archery = df_archery[(df_copy['Year'] == 2008) &
(df_copy['Sport'] == 'Archery') &
(df_copy['Medal'] == 'Gold') &
(df_copy['Country_Code'] == 'KOR')
]

2-2 South Korea Summer Olympic medal history(1976-2008)
df_kor = df_target.copy()
df_kor = df_target[df_target['Country_Code'] == 'KOR']
df_kor = df_kor.drop_duplicates(subset = ['City','Year', 'Sport', 'Discipline',
'Event', 'Country_Code', 'Country',
'Event_gender', 'Medal'])
df_kor.tail(2)

df_kor = df_kor.pivot_table(index=['Year', 'Medal'], values = 'City', aggfunc='count')
df_kor = df_kor.reindex(['Gold', 'Silver', 'Bronze'], level=1)
df_kor

2-3) Top 10 countries by total number of medals at the 1996 Atlanta Olympics.
Check medals by country
import copy
df_rank = df_target.copy()
df_rank = df_rank[df_rank['Year']==1996]
df_rank = df_rank.drop_duplicates(subset = ['City','Year', 'Sport', 'Discipline',
'Event', 'Country_Code', 'Country',
'Event_gender', 'Medal'])
df_rank = pd.pivot_table(df_rank, index='Country', values='Medal', aggfunc='count')
df_rank

Top 10 countries
df_rank = df_rank.reset_index()
df_rank.sort_values('Medal', ascending=False, inplace=True)
df_rank_10 = df_rank.head(10)
df_rank_10

2-4) Top 10 countries by number of gold medals at the 1996 Atlanta Olympics
Medal Status Reset
import copy
df_rank = df_target.copy()
df_rank[['Gold', 'Silver', 'Bronze']] = 0
df_rank.tail(1)

Add required conditions
df_rank = df_rank[(df_rank['Year'] == 1996) & (df_rank['City'] == 'Atlanta')]
df_rank = df_rank.drop_duplicates(subset = ['City','Year', 'Sport', 'Discipline',
'Event', 'Country_Code', 'Country',
'Event_gender', 'Medal'])
df_rank = df_rank.reset_index()
df_rank.info()
df_rank.tail(1)


Medal count
del df_rank['index']
for idx in range(0, len(df_rank)):
if df_rank['Medal'].loc[idx] == 'Gold':
df_rank['Gold'].loc[idx] += 1
elif df_rank['Medal'].loc[idx] == 'Silver':
df_rank['Silver'].loc[idx] += 1
elif df_rank['Medal'].loc[idx] == 'Bronze':
df_rank['Bronze'].loc[idx] += 1
df_rank.tail(2)

Find the top 10 countries with gold medals
df_rank = df_rank.groupby('Country').agg({'Gold': 'sum', 'Silver': 'sum', 'Bronze': 'sum'}).reset_index()
df_rank = df_rank.sort_values(by=['Gold', 'Silver', 'Bronze'], ascending=False)
df_rank = df_rank[['Country','Gold', 'Silver', 'Bronze']]
df_rank_10 = df_rank.head(10)
df_rank_10
