import pandas as pd
import numpy as np
DataFrame은 2차원 테이블이고 테이블의 한 줄(행 or 열)을 Series라고하며 Series의 모임이 DataFrame이 된다.
np.random.seed(42)
df = pd.DataFrame(data=np.random.randn(12, 4),
index=np.arange(12),
columns=['X1', 'X2', 'X3', 'X4'])
df
X1 X2 X3 X40 0.496714 -0.138264 0.647689 1.523030
1 -0.234153 -0.234137 1.579213 0.767435
2 -0.469474 0.542560 -0.463418 -0.465730
3 0.241962 -1.913280 -1.724918 -0.562288
4 -1.012831 0.314247 -0.908024 -1.412304
5 1.465649 -0.225776 0.067528 -1.424748
6 -0.544383 0.110923 -1.150994 0.375698
7 -0.600639 -0.291694 -0.601707 1.852278
8 -0.013497 -1.057711 0.822545 -1.220844
9 0.208864 -1.959670 -1.328186 0.196861
10 0.738467 0.171368 -0.115648 -0.301104
11 -1.478522 -0.719844 -0.460639 1.057122
df.index
Int64Index([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], dtype='int64')
df.columns
Index(['X1', 'X2', 'X3', 'X4'], dtype='object')
# DataFrame의 데이터를 np.array로 가져온다.
df.values
array([[ 0.49671415, -0.1382643 , 0.64768854, 1.52302986],
[-0.23415337, -0.23413696, 1.57921282, 0.76743473],
[-0.46947439, 0.54256004, -0.46341769, -0.46572975],
[ 0.24196227, -1.91328024, -1.72491783, -0.56228753],
[-1.01283112, 0.31424733, -0.90802408, -1.4123037 ],
[ 1.46564877, -0.2257763 , 0.0675282 , -1.42474819],
[-0.54438272, 0.11092259, -1.15099358, 0.37569802],
[-0.60063869, -0.29169375, -0.60170661, 1.85227818],
[-0.01349722, -1.05771093, 0.82254491, -1.22084365],
[ 0.2088636 , -1.95967012, -1.32818605, 0.19686124],
[ 0.73846658, 0.17136828, -0.11564828, -0.3011037 ],
[-1.47852199, -0.71984421, -0.46063877, 1.05712223]])
df['X1']
0 0.496714
1 -0.234153
2 -0.469474
3 0.241962
4 -1.012831
5 1.465649
6 -0.544383
7 -0.600639
8 -0.013497
9 0.208864
10 0.738467
11 -1.478522
Name: X1, dtype: float64
df['X1'] + 2
0 2.496714
1 1.765847
2 1.530526
3 2.241962
4 0.987169
5 3.465649
6 1.455617
7 1.399361
8 1.986503
9 2.208864
10 2.738467
11 0.521478
Name: X1, dtype: float64
df.head()
X1 X2 X3 X40 0.496714 -0.138264 0.647689 1.523030
1 -0.234153 -0.234137 1.579213 0.767435
2 -0.469474 0.542560 -0.463418 -0.465730
3 0.241962 -1.913280 -1.724918 -0.562288
4 -1.012831 0.314247 -0.908024 -1.412304
df.tail()
X1 X2 X3 X47 -0.600639 -0.291694 -0.601707 1.852278
8 -0.013497 -1.057711 0.822545 -1.220844
9 0.208864 -1.959670 -1.328186 0.196861
10 0.738467 0.171368 -0.115648 -0.301104
11 -1.478522 -0.719844 -0.460639 1.057122
df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 12 entries, 0 to 11
Data columns (total 4 columns):
#Column Non-Null Count Dtype
0 X1 12 non-null float64
1 X2 12 non-null float64
2 X3 12 non-null float64
3 X4 12 non-null float64
dtypes: float64(4)
memory usage: 480.0 bytes
df.describe()
X1 X2 X3 X4
count 12.000000 12.000000 12.000000 12.000000
mean -0.100154 -0.450107 -0.303047 0.032117
std 0.801497 0.817508 0.962175 1.118572
min -1.478522 -1.959670 -1.724918 -1.424748
25% -0.558447 -0.804311 -0.968766 -0.726927
50% -0.123825 -0.229957 -0.462028 -0.052121
75% 0.305650 0.126034 0.212568 0.839857
max 1.465649 0.542560 1.579213 1.852278
# X2 column을 기준으로 내림차순 정렬
df.sort_values(by='X2', ascending=False)
X1 X2 X3 X42 -0.469474 0.542560 -0.463418 -0.465730
4 -1.012831 0.314247 -0.908024 -1.412304
10 0.738467 0.171368 -0.115648 -0.301104
6 -0.544383 0.110923 -1.150994 0.375698
0 0.496714 -0.138264 0.647689 1.523030
5 1.465649 -0.225776 0.067528 -1.424748
1 -0.234153 -0.234137 1.579213 0.767435
7 -0.600639 -0.291694 -0.601707 1.852278
11 -1.478522 -0.719844 -0.460639 1.057122
8 -0.013497 -1.057711 0.822545 -1.220844
3 0.241962 -1.913280 -1.724918 -0.562288
9 0.208864 -1.959670 -1.328186 0.196861
df['X1']
0 0.496714
1 -0.234153
2 -0.469474
3 0.241962
4 -1.012831
5 1.465649
6 -0.544383
7 -0.600639
8 -0.013497
9 0.208864
10 0.738467
11 -1.478522
Name: X1, dtype: float64
df[0:3]
X1 X2 X3 X40 0.496714 -0.138264 0.647689 1.523030
1 -0.234153 -0.234137 1.579213 0.767435
2 -0.469474 0.542560 -0.463418 -0.465730
df.loc[3, 'X4']
-0.5622875292409727
df.loc[[0, 1, 4, 5], 'X3']
0 0.647689
1 1.579213
4 -0.908024
5 0.067528
Name: X3, dtype: float64
mask = df['X3'] > 0
df.loc[mask, 'X3']
0 0.647689
1 1.579213
5 0.067528
8 0.822545
Name: X3, dtype: float64
df1 = pd.DataFrame({'A': ['A0', 'A1', 'A2', 'A3'],
'B': ['B0', 'B1', 'B2', 'B3'],
'C': ['C0', 'C1', 'C2', 'C3'],
'D': ['D0', 'D1', 'D2', 'D3']})
df2 = pd.DataFrame({'A': ['A4', 'A5', 'A1', 'A2'],
'B': ['B4', 'B5', 'B6', 'B7'],
'C': ['C4', 'C5', 'C6', 'C7'],
'D': ['D4', 'D5', 'D6', 'D7']})
df3 = pd.DataFrame({'A': ['A8', 'A9', 'A10', 'A11'],
'B': ['B8', 'B9', 'B10', 'B11'],
'C': ['C6', 'C7', 'C10', 'C11'],
'D': ['D8', 'D9', 'D10', 'D11']})
pd.concat([df1, df2, df3], axis=0) # column을 기준으로 합침.
pd.concat([df1, df2.reset_index(drop=True), df3.reset_index(drop=True)], axis=1) # index를 기준으로 합침.
A B C D A B C D A B C D0 A0 B0 C0 D0 A4 B4 C4 D4 A8 B8 C6 D8
1 A1 B1 C1 D1 A5 B5 C5 D5 A9 B9 C7 D9
2 A2 B2 C2 D2 A1 B6 C6 D6 A10 B10 C10 D10
3 A3 B3 C3 D3 A2 B7 C7 D7 A11 B11 C11 D11
# A inner join B --> table A와 table B의 특정 column에서 겹치는 값들을 기준으로 두 테이블을 합치는 연산.
pd.merge(left=df1, right=df2, on='A', how='inner')
# df2 inner join df3 (column C)
pd.merge(df2, df3, on="C", how='inner')[['C', 'A_x', 'B_x', 'D_x', 'A_y', 'B_y', 'D_y']]
A B_x C_x D_x B_y C_y D_y0 A0 B0 C0 D0 NaN NaN NaN
1 A1 B1 C1 D1 B6 C6 D6
2 A2 B2 C2 D2 B7 C7 D7
3 A3 B3 C3 D3 NaN NaN NaN
titanic = pd.read_csv('./data/train.csv')
titanic
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S
... ... ... ... ... ... ... ... ... ... ... ... ...
886 887 0 2 Montvila, Rev. Juozas male 27.0 0 0 211536 13.0000 NaN S
887 888 1 1 Graham, Miss. Margaret Edith female 19.0 0 0 112053 30.0000 B42 S
888 889 0 3 Johnston, Miss. Catherine Helen "Carrie" female NaN 1 2 W./C. 6607 23.4500 NaN S
889 890 1 1 Behr, Mr. Karl Howell male 26.0 0 0 111369 30.0000 C148 C
890 891 0 3 Dooley, Mr. Patrick male 32.0 0 0 370376 7.7500 NaN Q
# Q1. 여성 승객들의 평균 나이를 계산해보세요.
#np.mean(titanic.loc[titanic.Sex == 'female', 'Age'])
titanic.loc[titanic.Sex == 'female', 'Age'].mean()
# Q2. 1등석에 탑승한 승객 중 최대 요금을 낸 사람의 이름을 찾아주세요.
temp = titanic.loc[titanic.Pclass == 1, ['Name', 'Fare']]
temp.loc[temp.Fare == temp.Fare.max(), "Name"]
# Q3. 1등석에 탑승한 승객 중 승선한 곳이 Queenstown인 사람들의 수는?
#temp2 = titanic[titanic.Pclass == 1]
#temp2[temp2.Embarked == 'Q']
## mask & mask : 두 조건을 둘 다 만족하는 mask. (AND)
## mask | mask : 두 조건중 하나 이상 만족하는 mask. (OR)
## ~mask : 조건을 반전. (NOT)
titanic[(titanic.Pclass == 1) & (titanic.Embarked == 'Q')].shape[0]
# Q4. 승선한 곳이 "S"인 사람들의 생존자 수는?
titanic.loc[titanic.Embarked.isin(['S']), 'Survived'].sum()
# Q5. 미혼 여성중에 나이를 모르는 사람의 수는?
titanic.loc[titanic.Name.str.contains('Miss.'), 'Age'].isnull()#.sum()
## boolean mask는 무조건 mask와 적용 대상의 index가 같아야합니다!