[빅데이터시스템] pandas

이상현·2020년 9월 29일

빅데이터시스템_강의노트

목록 보기

3/10

Pandas

데이터 분석을 위한 데이터 처리 라이브러리
대용량 데이터를 처리하기 위한 함수, 메소드들이 제공

Series (one-dimensional array)

: index와 data로 이루어진 pandas의 데이터구조

DataFrame (two-dimensional array)

: 각각의 인덱스가 series 로 이루어져 있는 pandas의 데이터구조

Pandas Tutorial

import pandas as pd
import numpy as np
import matplotlib as plt

#check pandas version
pd.__version__

'0.24.1'

#creating a series from a list
arr = [0,1,2,3,4]
print(type(arr))
s1 = pd.Series(arr)
s1

<class 'list'>





0    0
1    1
2    2
3    3
4    4
dtype: int64

s1.index

RangeIndex(start=0, stop=5, step=1)

npArr = np.arange(5)
print(type(npArr))
s1_1=pd.Series(npArr)
s1_1

<class 'numpy.ndarray'>





0    0
1    1
2    2
3    3
4    4
dtype: int64

np.array([0,1,2,3,4])

array([0, 1, 2, 3, 4])

#change index
arr = [0,1,2,3,4]
s1 = pd.Series(arr)
s1

0    0
1    1
2    2
3    3
4    4
dtype: int64

order = [1,2,3,4,5]
s2 = pd.Series(arr, index = order) # index 값을 할당해서 인덱스 변경
s2

1    0
2    1
3    2
4    3
5    4
dtype: int64

n = np.random.randn(5)
n

array([ 0.86042598, -0.96561393, -0.34827275, -0.41878316,  0.24770315])

pd.Series(n)

0    0.860426
1   -0.965614
2   -0.348273
3   -0.418783
4    0.247703
dtype: float64

index = ['a','b','c','d','e']
s2 = pd.Series(n, index = index)
s2

a    0.860426
b   -0.965614
c   -0.348273
d   -0.418783
e    0.247703
dtype: float64

# create a series from dictionary
d = {'a':1, 'b':2, 'c':3, 'd':4, 'e':5}
s3 = pd.Series(d)
s3

a    1
b    2
c    3
d    4
e    5
dtype: int64

# you can modify the index of series
print(s1)
print(s1.index)

0    0
1    1
2    2
3    3
4    4
dtype: int64
RangeIndex(start=0, stop=5, step=1)

s1.index = ['A','B','C','D','E']
s1

A    0
B    1
C    2
D    3
E    4
dtype: int64

print(s1.index)

Index(['A', 'B', 'C', 'D', 'E'], dtype='object')

# slicing
a = s1[:3]
a

A    0
B    1
C    2
dtype: int64

s1[:-1]

A    0
B    1
C    2
D    3
dtype: int64

s1[-2:]

D    3
E    4
dtype: int64

print(s1)
print(s3)
s4 = s1.append(s3)
print(s4)

A    0
B    1
C    2
D    3
E    4
dtype: int64
a    1
b    2
c    3
d    4
e    5
dtype: int64
A    0
B    1
C    2
D    3
E    4
a    1
b    2
c    3
d    4
e    5
dtype: int64

# drop
s4

A    0
B    1
C    2
D    3
E    4
a    1
b    2
c    3
d    4
e    5
dtype: int64

s4.drop('e') # e를 제외하고 출력

A    0
B    1
C    2
D    3
E    4
a    1
b    2
c    3
d    4
dtype: int64

s4 # s4 자체는 변하지 않음

A    0
B    1
C    2
D    3
E    4
a    1
b    2
c    3
d    4
e    5
dtype: int64

s4 = s4.drop('e') # s4를 변화
s4

A    0
B    1
C    2
D    3
E    4
a    1
b    2
c    3
d    4
dtype: int64

Series operation

arr1 = [0,1,2,3,4,5,7]
arr2 = [6,7,8,9,5]

s5 = pd.Series(arr2)
s5

0    6
1    7
2    8
3    9
4    5
dtype: int64

s6 = pd.Series(arr1)
s6

0    0
1    1
2    2
3    3
4    4
5    5
6    7
dtype: int64

s5.add(s6) # 같은 인덱스까리 더해주고, 없는 것들은 NaN

0     6.0
1     8.0
2    10.0
3    12.0
4     9.0
5     NaN
6     NaN
dtype: float64

s5.sub(s6)

0    6.0
1    6.0
2    6.0
3    6.0
4    1.0
5    NaN
6    NaN
dtype: float64

s5.mul(s6)

0     0.0
1     7.0
2    16.0
3    27.0
4    20.0
5     NaN
6     NaN
dtype: float64

s5.div(s6) #  inf

0     inf
1    7.00
2    4.00
3    3.00
4    1.25
5     NaN
6     NaN
dtype: float64

s6

0    0
1    1
2    2
3    3
4    4
5    5
6    7
dtype: int64

print("median", s6.median())
print("max", s6.max())
print("min", s6.min())

median 3.0
max 7
min 0

s7 = s5.mul(s6)
s7

0     0.0
1     7.0
2    16.0
3    27.0
4    20.0
5     NaN
6     NaN
dtype: float64

# NaN은 제외하고 계산
print("median", s5.median())
print("max", s5.max())
print("min", s5.min())

median 7.0
max 9
min 5

Create Dataframe

dates = pd.date_range('today', periods=6)
dates

DatetimeIndex(['2020-09-29 19:59:15.088382', '2020-09-30 19:59:15.088382',
               '2020-10-01 19:59:15.088382', '2020-10-02 19:59:15.088382',
               '2020-10-03 19:59:15.088382', '2020-10-04 19:59:15.088382'],
              dtype='datetime64[ns]', freq='D')

num_arr = np.random.randn(6,4)
num_arr

array([[ 0.38406967, -1.02847711,  0.83911484,  0.79294576],
       [-0.23224823, -0.78362413, -3.17592451,  0.61708198],
       [-1.59406003,  1.646996  ,  0.45997278,  0.28394742],
       [ 0.0244321 , -0.15328843, -1.70151417,  0.64868737],
       [-0.56298435, -0.83298394,  1.26453586, -1.59056282],
       [ 0.01777079,  0.50983863,  2.29359854,  2.04781485]])

columns = ['A','B','C','D']
df1 = pd.DataFrame(num_arr, index = dates, columns = columns)
df1

	A	B	C	D
2020-09-29 19:59:15.088382	0.384070	-1.028477	0.839115	0.792946
2020-09-30 19:59:15.088382	-0.232248	-0.783624	-3.175925	0.617082
2020-10-01 19:59:15.088382	-1.594060	1.646996	0.459973	0.283947
2020-10-02 19:59:15.088382	0.024432	-0.153288	-1.701514	0.648687
2020-10-03 19:59:15.088382	-0.562984	-0.832984	1.264536	-1.590563
2020-10-04 19:59:15.088382	0.017771	0.509839	2.293599	2.047815

pd.DataFrame(num_arr)

	0	1	2	3
0	0.384070	-1.028477	0.839115	0.792946
1	-0.232248	-0.783624	-3.175925	0.617082
2	-1.594060	1.646996	0.459973	0.283947
3	0.024432	-0.153288	-1.701514	0.648687
4	-0.562984	-0.832984	1.264536	-1.590563
5	0.017771	0.509839	2.293599	2.047815

df1 = pd.DataFrame(num_arr, index = [1,2,3,4,5,6], columns = columns)
df1

	A	B	C	D
1	0.384070	-1.028477	0.839115	0.792946
2	-0.232248	-0.783624	-3.175925	0.617082
3	-1.594060	1.646996	0.459973	0.283947
4	0.024432	-0.153288	-1.701514	0.648687
5	-0.562984	-0.832984	1.264536	-1.590563
6	0.017771	0.509839	2.293599	2.047815

# create dataframe with dictionary array

data = {'animal':['cat', 'cat', 'snake', 'dog', 'dog', 'cat', 'snake', 'cat', 'dog','dog'],
       'age':[2.5, 3, 0.5, np.nan, 5, 2, 4.5 , np.nan, 7, 3],
       'visits':[1,3,2,3,2,3,1,1,2,1],
       'priority':['yes', 'yes', 'no', 'no', 'yes', 'yes', 'no', 'no', 'yes', 'no']}
df2 = pd.DataFrame(data)
df2

	animal	age	visits	priority
0	cat	2.5	1	yes
1	cat	3.0	3	yes
2	snake	0.5	2	no
3	dog	NaN	3	no
4	dog	5.0	2	yes
5	cat	2.0	3	yes
6	snake	4.5	1	no
7	cat	NaN	1	no
8	dog	7.0	2	yes
9	dog	3.0	1	no

data = {'animal':['cat', 'cat', 'snake', 'dog', 'dog', 'cat', 'snake', 'cat', 'dog','dog'],
       'age':[2.5, 3, 0.5, np.nan, 5, 2, 4.5 , np.nan, 7, 3],
       'visits':[1,3,2,3,2,3,1,1,2,1],
       'priority':['yes', 'yes', 'no', 'no', 'yes', 'yes', 'no', 'no', 'yes', 'no']}

labels = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j']
df2 = pd.DataFrame(data, index = labels)
df2

	animal	age	visits	priority
a	cat	2.5	1	yes
b	cat	3.0	3	yes
c	snake	0.5	2	no
d	dog	NaN	3	no
e	dog	5.0	2	yes
f	cat	2.0	3	yes
g	snake	4.5	1	no
h	cat	NaN	1	no
i	dog	7.0	2	yes
j	dog	3.0	1	no

df2.dtypes

animal       object
age         float64
visits        int64
priority     object
dtype: object

df2.head() # 데이터프레임의 앞 5개만 나옴

	animal	age	visits	priority
a	cat	2.5	1	yes
b	cat	3.0	3	yes
c	snake	0.5	2	no
d	dog	NaN	3	no
e	dog	5.0	2	yes

df2.head(3)

	animal	age	visits	priority
a	cat	2.5	1	yes
b	cat	3.0	3	yes
c	snake	0.5	2	no

df3 = df2.head(3)
df3

	animal	age	visits	priority
a	cat	2.5	1	yes
b	cat	3.0	3	yes
c	snake	0.5	2	no

df2.tail() # tail은 뒤에 5개만 나옴

	animal	age	visits	priority
f	cat	2.0	3	yes
g	snake	4.5	1	no
h	cat	NaN	1	no
i	dog	7.0	2	yes
j	dog	3.0	1	no

df2.index

Index(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j'], dtype='object')

df2.columns

Index(['animal', 'age', 'visits', 'priority'], dtype='object')

df2.values

array([['cat', 2.5, 1, 'yes'],
       ['cat', 3.0, 3, 'yes'],
       ['snake', 0.5, 2, 'no'],
       ['dog', nan, 3, 'no'],
       ['dog', 5.0, 2, 'yes'],
       ['cat', 2.0, 3, 'yes'],
       ['snake', 4.5, 1, 'no'],
       ['cat', nan, 1, 'no'],
       ['dog', 7.0, 2, 'yes'],
       ['dog', 3.0, 1, 'no']], dtype=object)

type(df2.values)

numpy.ndarray

df2

	animal	age	visits	priority
a	cat	2.5	1	yes
b	cat	3.0	3	yes
c	snake	0.5	2	no
d	dog	NaN	3	no
e	dog	5.0	2	yes
f	cat	2.0	3	yes
g	snake	4.5	1	no
h	cat	NaN	1	no
i	dog	7.0	2	yes
j	dog	3.0	1	no

df2.describe() # 데이터프레임을 바탕을 여러가지 통계값을 출력한다.

	age	visits
count	8.000000	10.000000
mean	3.437500	1.900000
std	2.007797	0.875595
min	0.500000	1.000000
25%	2.375000	1.000000
50%	3.000000	2.000000
75%	4.625000	2.750000
max	7.000000	3.000000

df2.T # transpose

	a	b	c	d	e	f	g	h	i	j
animal	cat	cat	snake	dog	dog	cat	snake	cat	dog	dog
age	2.5	3	0.5	NaN	5	2	4.5	NaN	7	3
visits	1	3	2	3	2	3	1	1	2	1
priority	yes	yes	no	no	yes	yes	no	no	yes	no

df2.sort_values(by='age') # age에 따라서 인덱스들이 정렬된다.

	animal	age	visits	priority
c	snake	0.5	2	no
f	cat	2.0	3	yes
a	cat	2.5	1	yes
b	cat	3.0	3	yes
j	dog	3.0	1	no
g	snake	4.5	1	no
e	dog	5.0	2	yes
i	dog	7.0	2	yes
d	dog	NaN	3	no
h	cat	NaN	1	no

#slicing dataframe
df2[1:3]

	animal	age	visits	priority
b	cat	3.0	3	yes
c	snake	0.5	2	no

df2.sort_values(by='age')[1:3]

	animal	age	visits	priority
f	cat	2.0	3	yes
a	cat	2.5	1	yes

# query dataframe by tag
df2

	animal	age	visits	priority
a	cat	2.5	1	yes
b	cat	3.0	3	yes
c	snake	0.5	2	no
d	dog	NaN	3	no
e	dog	5.0	2	yes
f	cat	2.0	3	yes
g	snake	4.5	1	no
h	cat	NaN	1	no
i	dog	7.0	2	yes
j	dog	3.0	1	no

# query dataframe by tag
df2[['age', 'visits']] # [] 대괄호 두개가 필요

	age	visits
a	2.5	1
b	3.0	3
c	0.5	2
d	NaN	3
e	5.0	2
f	2.0	3
g	4.5	1
h	NaN	1
i	7.0	2
j	3.0	1

type(df2[['age','visits']])

pandas.core.frame.DataFrame

df2

	animal	age	visits	priority
a	cat	2.5	1	yes
b	cat	3.0	3	yes
c	snake	0.5	2	no
d	dog	NaN	3	no
e	dog	5.0	2	yes
f	cat	2.0	3	yes
g	snake	4.5	1	no
h	cat	NaN	1	no
i	dog	7.0	2	yes
j	dog	3.0	1	no

#iloc  slice와 같음
df2.iloc[1:3]

	animal	age	visits	priority
b	cat	3.0	3	yes
c	snake	0.5	2	no

df2[1:3]

	animal	age	visits	priority
b	cat	3.0	3	yes
c	snake	0.5	2	no

print(type(df2.iloc[1:3]))
print(type(df2[1:3]))

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>

df3 = df2.copy()
df3

	animal	age	visits	priority
a	cat	2.5	1	yes
b	cat	3.0	3	yes
c	snake	0.5	2	no
d	dog	NaN	3	no
e	dog	5.0	2	yes
f	cat	2.0	3	yes
g	snake	4.5	1	no
h	cat	NaN	1	no
i	dog	7.0	2	yes
j	dog	3.0	1	no

df3.isnull() # Nan에 True임을 확인할 수 있다

	animal	age	visits	priority
a	False	False	False	False
b	False	False	False	False
c	False	False	False	False
d	False	True	False	False
e	False	False	False	False
f	False	False	False	False
g	False	False	False	False
h	False	True	False	False
i	False	False	False	False
j	False	False	False	False

df3

	animal	age	visits	priority
a	cat	2.5	1	yes
b	cat	3.0	3	yes
c	snake	0.5	2	no
d	dog	NaN	3	no
e	dog	5.0	2	yes
f	cat	2.0	3	yes
g	snake	4.5	1	no
h	cat	NaN	1	no
i	dog	7.0	2	yes
j	dog	3.0	1	no

#change value
df3.loc['f','age']

2.0

df3.loc['f','age'] = 1.5
df3 # f의 age가 1.5로 변경

	animal	age	visits	priority
a	cat	2.5	1.0	yes
b	cat	3.0	3.0	yes
c	snake	0.5	2.0	no
d	dog	NaN	3.0	no
e	dog	5.0	2.0	yes
f	cat	1.5	3.0	yes
g	snake	4.5	1.0	no
h	cat	NaN	1.0	no
i	dog	7.0	2.0	yes
j	dog	3.0	1.0	no
fage	1.5	1.5	1.5	1.5

# mean 숫자가 들어있는 row의 mean을 구함
df3.mean()

age       3.166667
visits    1.863636
dtype: float64

df3[['age']].mean()

age    3.166667
dtype: float64

df3['age'].mean()

3.1666666666666665

# 대괄호를 두개 입력해야함. 타입이 달라짐
print(type(df3[['age']].mean()))
print(type(df3['age'].mean()))

<class 'pandas.core.series.Series'>
<class 'float'>

print(df3['age'])
type(df3['age'])

a       2.5
b       3.0
c       0.5
d       NaN
e       5.0
f       1.5
g       4.5
h       NaN
i       7.0
j       3.0
fage    1.5
Name: age, dtype: float64





pandas.core.series.Series

print(df3[['age']])
type(df3[['age']])

      age
a     2.5
b     3.0
c     0.5
d     NaN
e     5.0
f     1.5
g     4.5
h     NaN
i     7.0
j     3.0
fage  1.5





pandas.core.frame.DataFrame

print(df3['visits'].sum())
print(type(df3['visits'].sum()))

20.5
<class 'numpy.float64'>

df3[['visits']].sum()
print(type(df3[['visits']].sum()))

<class 'pandas.core.series.Series'>

# lowwer, upper
string = pd.Series(['A','C','D','Aaa','BaCa',np.nan,'CBA', 'cow', 'owl'])
string

0       A
1       C
2       D
3     Aaa
4    BaCa
5     NaN
6     CBA
7     cow
8     owl
dtype: object

string.str.upper()

0       A
1       C
2       D
3     AAA
4    BACA
5     NaN
6     CBA
7     COW
8     OWL
dtype: object

string.str.lower()

0       a
1       c
2       d
3     aaa
4    baca
5     NaN
6     cba
7     cow
8     owl
dtype: object

Operations for DataFrame missing values

df4 = df3.copy()
df4

	animal	age	visits	priority
a	cat	2.5	1.0	yes
b	cat	3.0	3.0	yes
c	snake	0.5	2.0	no
d	dog	NaN	3.0	no
e	dog	5.0	2.0	yes
f	cat	1.5	3.0	yes
g	snake	4.5	1.0	no
h	cat	NaN	1.0	no
i	dog	7.0	2.0	yes
j	dog	3.0	1.0	no
fage	1.5	1.5	1.5	1.5

df4.fillna(4, inplace=True) # NaN 값에 4를 넣어줌
df4

	animal	age	visits	priority
a	cat	2.5	1.0	yes
b	cat	3.0	3.0	yes
c	snake	0.5	2.0	no
d	dog	4.0	3.0	no
e	dog	5.0	2.0	yes
f	cat	1.5	3.0	yes
g	snake	4.5	1.0	no
h	cat	4.0	1.0	no
i	dog	7.0	2.0	yes
j	dog	3.0	1.0	no
fage	1.5	1.5	1.5	1.5

CSV file

CSV (comma separated values) file read
read.csv("csv file")
head

import pandas as pd

df = pd.read_csv("./8.intropandas/olympics.csv")
df.head()

	0	1	2	3	4	5	6	7	8	9	10	11	12	13	14	15
0	NaN	№ Summer	01 !	02 !	03 !	Total	№ Winter	01 !	02 !	03 !	Total	№ Games	01 !	02 !	03 !	Combined total
1	Afghanistan (AFG)	13	0	0	2	2	0	0	0	0	0	13	0	0	2	2
2	Algeria (ALG)	12	5	2	8	15	3	0	0	0	0	15	5	2	8	15
3	Argentina (ARG)	23	18	24	28	70	18	0	0	0	0	41	18	24	28	70
4	Armenia (ARM)	5	1	2	9	12	6	0	0	0	0	11	1	2	9	12

Index and Skiprows

df = pd.read_csv("./8.intropandas/olympics.csv", index_col=0) # index를 컬럼 0으로 지정
df.head()

	1	2	3	4	5	6	7	8	9	10	11	12	13	14	15
0
NaN	№ Summer	01 !	02 !	03 !	Total	№ Winter	01 !	02 !	03 !	Total	№ Games	01 !	02 !	03 !	Combined total
Afghanistan (AFG)	13	0	0	2	2	0	0	0	0	0	13	0	0	2	2
Algeria (ALG)	12	5	2	8	15	3	0	0	0	0	15	5	2	8	15
Argentina (ARG)	23	18	24	28	70	18	0	0	0	0	41	18	24	28	70
Armenia (ARM)	5	1	2	9	12	6	0	0	0	0	11	1	2	9	12

df = pd.read_csv("./8.intropandas/olympics.csv", index_col=0, skiprows = 1) # 시작 row를 1번으로 지정
df.head()

	№ Summer	01 !	02 !	03 !	Total	№ Winter	№ Games	01 !.2	02 !.2	03 !.2	Combined total
Afghanistan (AFG)	13	0	0	2	2	0	13	0	0	2	2
Algeria (ALG)	12	5	2	8	15	3	15	5	2	8	15
Argentina (ARG)	23	18	24	28	70	18	41	18	24	28	70
Armenia (ARM)	5	1	2	9	12	6	11	1	2	9	12
Australasia (ANZ) [ANZ]	2	3	4	5	12	0	2	3	4	5	12

Columns and indexs

df.columns

Index(['№ Summer', '01 !', '02 !', '03 !', 'Total', '№ Winter', '01 !.1',
       '02 !.1', '03 !.1', 'Total.1', '№ Games', '01 !.2', '02 !.2', '03 !.2',
       'Combined total'],
      dtype='object')

df.index

Index(['Afghanistan (AFG)', 'Algeria (ALG)', 'Argentina (ARG)',
       'Armenia (ARM)', 'Australasia (ANZ) [ANZ]', 'Australia (AUS) [AUS] [Z]',
       'Austria (AUT)', 'Azerbaijan (AZE)', 'Bahamas (BAH)', 'Bahrain (BRN)',
       ...
       'Uzbekistan (UZB)', 'Venezuela (VEN)', 'Vietnam (VIE)',
       'Virgin Islands (ISV)', 'Yugoslavia (YUG) [YUG]',
       'Independent Olympic Participants (IOP) [IOP]', 'Zambia (ZAM) [ZAM]',
       'Zimbabwe (ZIM) [ZIM]', 'Mixed team (ZZX) [ZZX]', 'Totals'],
      dtype='object', length=147)

Cleaning Dataframes

df.columns 를 보면, 01!, 02!, 03! 으로 금,은,동메달을 표현하는 등 데이터 저장을 위한 표기법을 사용한 경우가 있음. 데이터를 시각화 할 때 이와같은 데이터들을 보기좋게 표현할 수 있음.

for col in df.columns:
    if col[:2] == '01' :
        df.rename(columns={col:"Gold"+col[4:]}, inplace = True)
    if col[:2] == '02' : 
        df.rename(columns={col:"Silver"+col[4:]}, inplace = True)
    if col[:2] == '03' :
        df.rename(columns={col:"Bronze"+col[4:]}, inplace = True)
    if col[:1] == '№' :
        df.rename(columns={col:"#"+col[2:]}, inplace = True)
df.head()

	#Summer	Gold	Silver	Bronze	Total	#Winter	#Games	Gold.2	Silver.2	Bronze.2	Combined total
Afghanistan (AFG)	13	0	0	2	2	0	13	0	0	2	2
Algeria (ALG)	12	5	2	8	15	3	15	5	2	8	15
Argentina (ARG)	23	18	24	28	70	18	41	18	24	28	70
Armenia (ARM)	5	1	2	9	12	6	11	1	2	9	12
Australasia (ANZ) [ANZ]	2	3	4	5	12	0	2	3	4	5	12

Boolean Masking

df['Gold'].head()

Afghanistan (AFG)           0
Algeria (ALG)               5
Argentina (ARG)            18
Armenia (ARM)               1
Australasia (ANZ) [ANZ]     3
Name: Gold, dtype: int64

(df['Gold'] > 0).head()

Afghanistan (AFG)          False
Algeria (ALG)               True
Argentina (ARG)             True
Armenia (ARM)               True
Australasia (ANZ) [ANZ]     True
Name: Gold, dtype: bool

gold = df.where(df["Gold"]>0)
gold.head()
# True는 값이 나오고 false는 NaN 출력

	#Summer	Gold	Silver	Bronze	Total	#Winter	Gold.1	Silver.1	Bronze.1	Total.1	#Games	Gold.2	Silver.2	Bronze.2	Combined total
Afghanistan (AFG)	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
Algeria (ALG)	12.0	5.0	2.0	8.0	15.0	3.0	0.0	0.0	0.0	0.0	15.0	5.0	2.0	8.0	15.0
Argentina (ARG)	23.0	18.0	24.0	28.0	70.0	18.0	0.0	0.0	0.0	0.0	41.0	18.0	24.0	28.0	70.0
Armenia (ARM)	5.0	1.0	2.0	9.0	12.0	6.0	0.0	0.0	0.0	0.0	11.0	1.0	2.0	9.0	12.0
Australasia (ANZ) [ANZ]	2.0	3.0	4.0	5.0	12.0	0.0	0.0	0.0	0.0	0.0	2.0	3.0	4.0	5.0	12.0

#drop NaN -> dropna
gold["Gold"].count()

df["Gold"].count()

gold1 = gold.dropna()
gold1.head()

	#Summer	Gold	Silver	Bronze	Total	#Winter	Gold.1	Silver.1	Bronze.1	Total.1	#Games	Gold.2	Silver.2	Bronze.2	Combined total
Algeria (ALG)	12.0	5.0	2.0	8.0	15.0	3.0	0.0	0.0	0.0	0.0	15.0	5.0	2.0	8.0	15.0
Argentina (ARG)	23.0	18.0	24.0	28.0	70.0	18.0	0.0	0.0	0.0	0.0	41.0	18.0	24.0	28.0	70.0
Armenia (ARM)	5.0	1.0	2.0	9.0	12.0	6.0	0.0	0.0	0.0	0.0	11.0	1.0	2.0	9.0	12.0
Australasia (ANZ) [ANZ]	2.0	3.0	4.0	5.0	12.0	0.0	0.0	0.0	0.0	0.0	2.0	3.0	4.0	5.0	12.0
Australia (AUS) [AUS] [Z]	25.0	139.0	152.0	177.0	468.0	18.0	5.0	3.0	4.0	12.0	43.0	144.0	155.0	181.0	480.0

gold1["Gold"].count()

Boolean indexing

df.head()

	#Summer	Gold	Silver	Bronze	Total	#Winter	#Games	Gold.2	Silver.2	Bronze.2	Combined total
Afghanistan (AFG)	13	0	0	2	2	0	13	0	0	2	2
Algeria (ALG)	12	5	2	8	15	3	15	5	2	8	15
Argentina (ARG)	23	18	24	28	70	18	41	18	24	28	70
Armenia (ARM)	5	1	2	9	12	6	11	1	2	9	12
Australasia (ANZ) [ANZ]	2	3	4	5	12	0	2	3	4	5	12

a = df["Gold"]>0
a.head()

Afghanistan (AFG)          False
Algeria (ALG)               True
Argentina (ARG)             True
Armenia (ARM)               True
Australasia (ANZ) [ANZ]     True
Name: Gold, dtype: bool

b = df[a]
b.head()

	#Summer	Gold	Silver	Bronze	Total	#Winter	Gold.1	Silver.1	Bronze.1	Total.1	#Games	Gold.2	Silver.2	Bronze.2	Combined total
Algeria (ALG)	12	5	2	8	15	3	0	0	0	0	15	5	2	8	15
Argentina (ARG)	23	18	24	28	70	18	0	0	0	0	41	18	24	28	70
Armenia (ARM)	5	1	2	9	12	6	0	0	0	0	11	1	2	9	12
Australasia (ANZ) [ANZ]	2	3	4	5	12	0	0	0	0	0	2	3	4	5	12
Australia (AUS) [AUS] [Z]	25	139	152	177	468	18	5	3	4	12	43	144	155	181	480

And Or

가능한 ( )를 사용하여 operation 순서를 정해주는 것이 필요함

df[ (df["Gold"]>0) | \
    (df["Gold.1"]>0) \
  ]

	#Summer	Gold	Silver	Bronze	Total	#Winter	Gold.1	Silver.1	Bronze.1	Total.1	#Games	Gold.2	Silver.2	Bronze.2	Combined total
Algeria (ALG)	12	5	2	8	15	3	0	0	0	0	15	5	2	8	15
Argentina (ARG)	23	18	24	28	70	18	0	0	0	0	41	18	24	28	70
Armenia (ARM)	5	1	2	9	12	6	0	0	0	0	11	1	2	9	12
Australasia (ANZ) [ANZ]	2	3	4	5	12	0	0	0	0	0	2	3	4	5	12
Australia (AUS) [AUS] [Z]	25	139	152	177	468	18	5	3	4	12	43	144	155	181	480
Austria (AUT)	26	18	33	35	86	22	59	78	81	218	48	77	111	116	304
Azerbaijan (AZE)	5	6	5	15	26	5	0	0	0	0	10	6	5	15	26
Bahamas (BAH)	15	5	2	5	12	0	0	0	0	0	15	5	2	5	12
Belarus (BLR)	5	12	24	39	75	6	6	4	5	15	11	18	28	44	90
Belgium (BEL)	25	37	52	53	142	20	1	1	3	5	45	38	53	56	147
Brazil (BRA)	21	23	30	55	108	7	0	0	0	0	28	23	30	55	108
Bulgaria (BUL) [H]	19	51	85	78	214	19	1	2	3	6	38	52	87	81	220
Burundi (BDI)	5	1	0	0	1	0	0	0	0	0	5	1	0	0	1
Cameroon (CMR)	13	3	1	1	5	1	0	0	0	0	14	3	1	1	5
Canada (CAN)	25	59	99	121	279	22	62	56	52	170	47	121	155	173	449
Chile (CHI) [I]	22	2	7	4	13	16	0	0	0	0	38	2	7	4	13
China (CHN) [CHN]	9	201	146	126	473	10	12	22	19	53	19	213	168	145	526
Colombia (COL)	18	2	6	11	19	1	0	0	0	0	19	2	6	11	19
Costa Rica (CRC)	14	1	1	2	4	6	0	0	0	0	20	1	1	2	4
Croatia (CRO)	6	6	7	10	23	7	4	6	1	11	13	10	13	11	34
Cuba (CUB) [Z]	19	72	67	70	209	0	0	0	0	0	19	72	67	70	209
Czech Republic (CZE) [CZE]	5	14	15	15	44	6	7	9	8	24	11	21	24	23	68
Czechoslovakia (TCH) [TCH]	16	49	49	45	143	16	2	8	15	25	32	51	57	60	168
Denmark (DEN) [Z]	26	43	68	68	179	13	0	1	0	1	39	43	69	68	180
Dominican Republic (DOM)	13	3	2	1	6	0	0	0	0	0	13	3	2	1	6
Ecuador (ECU)	13	1	1	0	2	0	0	0	0	0	13	1	1	0	2
Egypt (EGY) [EGY] [Z]	21	7	9	10	26	1	0	0	0	0	22	7	9	10	26
Estonia (EST)	11	9	9	15	33	9	4	2	1	7	20	13	11	16	40
Ethiopia (ETH)	12	21	7	17	45	2	0	0	0	0	14	21	7	17	45
Finland (FIN)	24	101	84	117	302	22	42	62	57	161	46	143	146	174	463
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
Russia (RUS) [RUS]	5	132	121	142	395	6	49	40	35	124	11	181	161	177	519
Russian Empire (RU1) [RU1]	3	1	4	3	8	0	0	0	0	0	3	1	4	3	8
Soviet Union (URS) [URS]	9	395	319	296	1010	9	78	57	59	194	18	473	376	355	1204
Unified Team (EUN) [EUN]	1	45	38	29	112	1	9	6	8	23	2	54	44	37	135
Serbia (SRB) [SRB]	3	1	2	4	7	2	0	0	0	0	5	1	2	4	7
Serbia and Montenegro (SCG) [SCG]	3	2	4	3	9	3	0	0	0	0	6	2	4	3	9
Slovakia (SVK) [SVK]	5	7	9	8	24	6	2	2	1	5	11	9	11	9	29
Slovenia (SLO)	6	4	6	9	19	7	2	4	9	15	13	6	10	18	34
South Africa (RSA)	18	23	26	27	76	6	0	0	0	0	24	23	26	27	76
Spain (ESP) [Z]	22	37	59	35	131	19	1	0	1	2	41	38	59	36	133
Suriname (SUR) [E]	11	1	0	1	2	0	0	0	0	0	11	1	0	1	2
Sweden (SWE) [Z]	26	143	164	176	483	22	50	40	54	144	48	193	204	230	627
Switzerland (SUI)	27	47	73	65	185	22	50	40	48	138	49	97	113	113	323
Syria (SYR)	12	1	1	1	3	0	0	0	0	0	12	1	1	1	3
Chinese Taipei (TPE) [TPE] [TPE2]	13	2	7	12	21	11	0	0	0	0	24	2	7	12	21
Thailand (THA)	15	7	6	11	24	3	0	0	0	0	18	7	6	11	24
Trinidad and Tobago (TRI) [TRI]	16	2	5	11	18	3	0	0	0	0	19	2	5	11	18
Tunisia (TUN)	13	3	3	4	10	0	0	0	0	0	13	3	3	4	10
Turkey (TUR)	21	39	25	24	88	16	0	0	0	0	37	39	25	24	88
Uganda (UGA)	14	2	3	2	7	0	0	0	0	0	14	2	3	2	7
Ukraine (UKR)	5	33	27	55	115	6	2	1	4	7	11	35	28	59	122
United Arab Emirates (UAE)	8	1	0	0	1	0	0	0	0	0	8	1	0	0	1
United States (USA) [P] [Q] [R] [Z]	26	976	757	666	2399	22	96	102	84	282	48	1072	859	750	2681
Uruguay (URU)	20	2	2	6	10	1	0	0	0	0	21	2	2	6	10
Uzbekistan (UZB)	5	5	5	10	20	6	1	0	0	1	11	6	5	10	21
Venezuela (VEN)	17	2	2	8	12	4	0	0	0	0	21	2	2	8	12
Yugoslavia (YUG) [YUG]	16	26	29	28	83	14	0	3	1	4	30	26	32	29	87
Zimbabwe (ZIM) [ZIM]	12	3	4	1	8	1	0	0	0	0	13	3	4	1	8
Mixed team (ZZX) [ZZX]	3	8	5	4	17	0	0	0	0	0	3	8	5	4	17
Totals	27	4809	4775	5130	14714	22	959	958	948	2865	49	5768	5733	6078	17579

101 rows × 15 columns

df[ (df["Gold.1"]>0) & \
    (df["Gold"] == 0) \
     ]

	#Summer	Gold	Silver	Bronze	Total	#Winter	Gold.1	Silver.1	Bronze.1	Total.1	#Games	Gold.2	Silver.2	Bronze.2	Combined total
Liechtenstein (LIE)	16	0	0	0	0	18	2	2	5	9	34	2	2	5	9

Add column

df.head()

	#Summer	Gold	Silver	Bronze	Total	#Winter	#Games	Gold.2	Silver.2	Bronze.2	Combined total
Afghanistan (AFG)	13	0	0	2	2	0	13	0	0	2	2
Algeria (ALG)	12	5	2	8	15	3	15	5	2	8	15
Argentina (ARG)	23	18	24	28	70	18	41	18	24	28	70
Armenia (ARM)	5	1	2	9	12	6	11	1	2	9	12
Australasia (ANZ) [ANZ]	2	3	4	5	12	0	2	3	4	5	12

df.index

Index(['Afghanistan (AFG)', 'Algeria (ALG)', 'Argentina (ARG)',
       'Armenia (ARM)', 'Australasia (ANZ) [ANZ]', 'Australia (AUS) [AUS] [Z]',
       'Austria (AUT)', 'Azerbaijan (AZE)', 'Bahamas (BAH)', 'Bahrain (BRN)',
       ...
       'Uzbekistan (UZB)', 'Venezuela (VEN)', 'Vietnam (VIE)',
       'Virgin Islands (ISV)', 'Yugoslavia (YUG) [YUG]',
       'Independent Olympic Participants (IOP) [IOP]', 'Zambia (ZAM) [ZAM]',
       'Zimbabwe (ZIM) [ZIM]', 'Mixed team (ZZX) [ZZX]', 'Totals'],
      dtype='object', length=147)

df["Country"] = df.index
df.head()

	#Summer	Gold	Silver	Bronze	Total	#Winter	#Games	Gold.2	Silver.2	Bronze.2	Combined total	Country
Afghanistan (AFG)	13	0	0	2	2	0	13	0	0	2	2	Afghanistan (AFG)
Algeria (ALG)	12	5	2	8	15	3	15	5	2	8	15	Algeria (ALG)
Argentina (ARG)	23	18	24	28	70	18	41	18	24	28	70	Argentina (ARG)
Armenia (ARM)	5	1	2	9	12	6	11	1	2	9	12	Armenia (ARM)
Australasia (ANZ) [ANZ]	2	3	4	5	12	0	2	3	4	5	12	Australasia (ANZ) [ANZ]

set_index, reset_index

특정한 컬럼을 인덱스로 할 수 있음

df = df.set_index("Gold") # Gold 컬럼을 인덱스로 설정. 
df.head()

	#Summer	Silver	Bronze	Total	#Winter	Gold.1	Silver.1	Bronze.1	Total.1	#Games	Gold.2	Silver.2	Bronze.2	Combined total	Country
Gold
0	13	0	2	2	0	0	0	0	0	13	0	0	2	2	Afghanistan (AFG)
5	12	2	8	15	3	0	0	0	0	15	5	2	8	15	Algeria (ALG)
18	23	24	28	70	18	0	0	0	0	41	18	24	28	70	Argentina (ARG)
1	5	2	9	12	6	0	0	0	0	11	1	2	9	12	Armenia (ARM)
3	2	4	5	12	0	0	0	0	0	2	3	4	5	12	Australasia (ANZ) [ANZ]

이상현

'당신을 한 줄로 소개해보세요'를 이 블로그로 대신 해볼까합니다.