# Five rows of uniform random floats in [0, 1); the printed values below are
# from one sample run and will differ on every execution.
df = pd.DataFrame(np.random.rand(5, 2), columns=['A', 'B'])
#           A         B
# 0  0.842226  0.942528
# 1  0.741128  0.785701
# 2  0.836446  0.723379
# 3  0.974574  0.278584
# 4  0.921905  0.014840

# Comparing a column with a scalar yields a boolean Series (a row mask).
df['A'] < 0.5
# 0    False
# 1    False
# 2    False
# 3    False
# 4    False
# Name: A, dtype: bool

# Two equivalent ways to keep the rows where both conditions hold.
df[(df['A'] < 0.8) & (df['B'] > 0.3)]  # df[(condition) & (condition)]
df.query('A < 0.8 and B > 0.3')  # df.query('condition and condition')
#           A         B
# 1  0.741128  0.785701
animals = pd.DataFrame(
    {
        'Animal': ['Dog', 'Cat', 'Cat', 'Pig', 'Cat'],
        'Name': ['Happy', 'Sam', 'Toby', 'Mini', 'Rocky'],
    }
)
#   Animal   Name
# 0    Dog  Happy
# 1    Cat    Sam
# 2    Cat   Toby
# 3    Pig   Mini
# 4    Cat  Rocky

# Three ways to build the same row mask for this data:
#   str.contains -> pattern found anywhere in the string
#   str.match    -> pattern matched from the start of the string
#   ==           -> exact equality with the whole value
animals['Animal'].str.contains('Cat')
animals['Animal'].str.match('Cat')
animals['Animal'] == 'Cat'
# 0    False
# 1     True
# 2     True
# 3    False
# 4     True
# Name: Animal, dtype: bool
df = pd.DataFrame(np.arange(5), columns=['Num'])


def square(x):
    """Return ``x`` raised to the power of two."""
    return x ** 2


# Series.apply calls the function on each element and returns a new Series.
df['Num'].apply(square)
# 0     0
# 1     1
# 2     4
# 3     9
# 4    16
# Name: Num, dtype: int64

# Same transformation with an inline lambda, stored as a new column.
df['Square'] = df['Num'].apply(lambda x: x ** 2)
#    Num  Square
# 0    0       0
# 1    1       1
# 2    2       4
# 3    3       9
# 4    4      16
df = pd.DataFrame({
    'Sex': ['Male', 'Male', 'Female', 'Female', 'Male'],
})
#       Sex
# 0    Male
# 1    Male
# 2  Female
# 3  Female
# 4    Male

# replace() with a mapping returns a new Series; df itself is not modified.
df['Sex'].replace({'Male': 0, 'Female': 1})
# 0    0
# 1    0
# 2    1
# 3    1
# 4    0
# Name: Sex, dtype: int64

# Assign the result back to the column instead of calling
# df.Sex.replace(..., inplace=True): the in-place call on an extracted column
# is chained assignment, which is deprecated and does not update df once
# Copy-on-Write is active.
df['Sex'] = df['Sex'].replace({'Male': 0, 'Female': 1})
# Nothing is displayed; df now looks like this:
#    Sex
# 0    0
# 1    0
# 2    1
# 3    1
# 4    0
# NOTE: data2 is random, so the sample outputs in this section were captured
# from different runs and are illustrative only; they do not match each other.
df = pd.DataFrame({
    'key': ['A', 'B', 'C', 'A', 'B', 'C'],
    'data1': [1, 2, 3, 1, 2, 3],
    'data2': np.random.randint(0, 6, 6),
})
#   key  data1  data2
# 0   A      1      1
# 1   B      2      3
# 2   C      3      5
# 3   A      1      2
# 4   B      2      5
# 5   C      3      4

# groupby() on its own returns a DataFrameGroupBy object, not a table.
df.groupby('key')
# <pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000018FEEEB1A60>

# Sum every remaining column within each key.
df.groupby('key').sum()
#      data1  data2
# key
# A        2      2
# B        4      2
# C        6      6

# Grouping by several columns produces a MultiIndex on the result.
df.groupby(['key', 'data1']).sum()
#            data2
# key data1
# A   1          2
# B   2          2
# C   3          6

# Several aggregations at once. String names are used instead of the builtins
# min/max and np.median, whose use as aggregation callables is deprecated.
df.groupby('key').aggregate(['min', 'median', 'max'])
#     data1            data2
#       min median max   min median max
# key
# A       1    1.0   1     1    1.0   1
# B       2    2.0   2     1    1.0   1
# C       3    3.0   3     2    3.0   4

# A different aggregation per column.
df.groupby('key').aggregate({'data1': 'min', 'data2': 'sum'})
#      data1  data2
# key
# A        1      2
# B        2      2
# C        3      6


def filter_by_mean(x):
    """Return whether the mean of the ``data2`` column of ``x`` exceeds 1."""
    return x['data2'].mean() > 1


# filter() keeps the rows of every group for which the predicate is true.
df.groupby('key').filter(filter_by_mean)
#   key  data1  data2
# 2   C      3      2
# 5   C      3      4

# Per-group range (max - min). The numeric columns are selected explicitly so
# the lambda never has to subtract the string 'key' column.
df.groupby('key')[['data1', 'data2']].apply(lambda x: x.max() - x.min())
#      data1  data2
# key
# A        0      0
# B        0      0
# C        0      2
# Two-level row index: outer level 'A'/'B', inner level 1/2.
row_index = pd.MultiIndex.from_arrays([['A', 'A', 'B', 'B'], [1, 2, 1, 2]])
df = pd.DataFrame(
    np.random.randn(4, 2),
    index=row_index,
    columns=['data1', 'data2'],
)
#         data1     data2
# A 1 -0.335494  0.131620
#   2  0.916448  1.680326
# B 1  0.415776 -0.081459
#   2  0.286054  1.214381

# Selecting an outer-level label returns the sub-frame indexed by the inner level.
df.loc['A']
#       data1     data2
# 1 -0.335494  0.131620
# 2  0.916448  1.680326

# Selecting an inner-level label from that sub-frame returns one row as a Series.
df.loc['A'].loc[1]
# data1   -0.335494
# data2    0.131620
# Name: 1, dtype: float64
# Source: Elice AI Track (3rd cohort), week 11 class