# Creating columns (열생성)
# Print a summary of the frame.
df.info()
# Insert a new column '평균리뷰수' at column position `idx`, holding
# num_critic_for_reviews divided by num_voted_users, row by row.
# NOTE(review): `idx` is not defined in the visible code — confirm it is set
# before this line runs.
df.insert(
    loc=idx,
    column='평균리뷰수',
    value=df['num_critic_for_reviews'] / df['num_voted_users'],
)
def my_func(x):
    """Return 1 if *x* is greater than 180, otherwise 0.

    The comparison is strict, so exactly 180 maps to 0. Any value for which
    ``x > 180`` is false (including NaN) also maps to 0.
    """
    return 1 if x > 180 else 0
# Apply my_func to every 'duration' value. The result is not stored here;
# in a notebook this line just displays it.
df['duration'].apply(my_func)
# Same computation, stored as the new column '장편영화'.
df['장편영화'] = df['duration'].apply(my_func)
# Copy 'hompage_url' from htt into nice_url one row label at a time.
# The loop body must be indented under the `for`; without the indent the
# file does not parse.
for i in htt.index.tolist():
    nice_url.at[i, 'hompage_url'] = htt.at[i, 'hompage_url']
# Selecting columns (열선택)
# Column-selection examples; each line returns a new object and leaves df unchanged.
# Columns whose dtype is int64.
df.select_dtypes('int64')
# Columns whose dtype is int64 or float64, then print a summary of that subset.
df.select_dtypes(['int64', 'float64']).info()
# 'number' selects every numeric dtype.
df.select_dtypes('number').info()
# Select columns by an explicit list of labels; a missing label raises KeyError.
# NOTE(review): '단편영화' is not created in the visible code (only '장편영화'
# is) — confirm the column exists.
df[['duration', 'director_name', '단편영화']]
# Same selection via filter(items=...), which keeps only the labels that are
# present instead of raising on a missing one.
df.filter(items=['duration', 'director_name', '단편영화'])
# Columns whose label contains the substring 'facebook'.
df.filter(like='facebook')
# Indexing
# Indexing examples. NOTE(review): these assume df is a pandas DataFrame whose
# row index holds labels such as 'Avatar' and 'Skyfall' — confirm.
# Plain brackets look up a COLUMN labelled with the tuple (3, 4); this raises
# KeyError unless such a column exists. Positional access needs .iloc.
df[3,4]
# Positional access: the value at row position 3, column position 4.
df.iloc[3,4]
# Label slice on rows: everything up to and including 'Skyfall'.
df.loc[:'Skyfall']
# Scalar column label -> the result is a Series.
df.loc[['Avatar', 'Skyfall'], 'duration']
# List of column labels -> the result is a one-column DataFrame.
df.loc[['Avatar', 'Skyfall'], ['duration']]
# Position of the 'duration' column within df.columns.
df.columns.get_loc('duration')
# Raises TypeError: .iloc is position-based and rejects label slices.
df.iloc[:6, 'director_name':'duration']
# Working form: translate the labels to positions first. The +1 is needed
# because positional slices exclude their end point.
col_start=df.columns.get_loc('director_name')
col_end=df.columns.get_loc('duration')+1
df.iloc[:6, col_start:col_end]
# Filtering
# Boolean-mask filtering: rows whose color is 'Black and White' and whose
# duration is at most 60.
cr1=df['color']=='Black and White'
cr2=df['duration'] <=60
# Element-wise AND of the two masks.
cr=cr1&cr2
# Rows where the combined mask is True.
df[cr]
# Raises: .iloc does not accept a boolean Series as a mask.
df.iloc[cr, :3]
# Working form: pass the mask's underlying array; keeps the first three columns.
df.iloc[cr.values, :3]
# Rows whose language is one of the listed values.
my_lan = ['Korean', 'Dutch']
cr = df['language'].isin(my_lan)
# Label slice on columns: from 'language' through 'budget', both included.
df.loc[cr, 'language':'budget']
# Language mask combined with a duration range; between() includes both
# end points by default.
cr1 = df['language'].isin(my_lan)
cr2 = df['duration'].between(60, 120)
cr = cr1 & cr2
df[cr]
# The same filter written as a query string; @my_lan refers to the Python
# variable defined above.
qur = "language in @my_lan and 60<=duration<=120"
df.query(qur)
# Columns of data_imp_ohe whose label contains 'car'; show the first rows.
data_imp_ohe.filter(like='car').head()
# Index labels of the cardo_v3 rows selected by the mask cr1.
# NOTE(review): cr1 as last assigned in the visible code is a mask built from
# df, not cardo_v3 — confirm a mask aligned with cardo_v3 is in scope here.
idx_1 = cardo_v3[cr1].index
# New frame without those rows; cardo_v3 itself is not modified.
cardo_v4 = cardo_v3.drop(index=idx_1)
# Rows of df whose '이름' value appears in 특정값_리스트.
# NOTE(review): 특정값_리스트 is not defined in the visible code.
filtered_df = df[df['이름'].isin(특정값_리스트)]
# Miscellaneous (기타)
# Index label of the last non-NA entry (None if there is none).
df.last_valid_index()
# Index label of the first non-NA entry (None if there is none).
df.first_valid_index()
# For each row, the column label that holds the row's maximum.
# NOTE(review): `col` is not defined in the visible code; axis=1 only makes
# sense if df[col] is a DataFrame (i.e. col is a list of columns) — confirm.
df[col].idxmax(axis=1)
# NOTE(review): exact duplicate of the previous line — possibly idxmin was
# intended; confirm or remove.
df[col].idxmax(axis=1)
# Cast the 'GENDER' column to object dtype; returns a new frame, df is not
# modified. The dtype name must be spelled 'object' — a misspelled dtype
# string is rejected with TypeError.
df.astype({'GENDER': 'object'})
# Convert company_id to a numeric dtype; raises if a value cannot be parsed.
cardo['company_id'] = pd.to_numeric(cardo['company_id'])
# Try to convert every column to numeric, leaving a column untouched when it
# cannot be converted. The result is not stored.
# NOTE(review): errors='ignore' is deprecated in recent pandas releases —
# confirm against the installed version.
df.apply(lambda x: pd.to_numeric(x, errors='ignore'))
# Cast every non-missing cell to int; missing cells are passed through.
# NOTE(review): applymap is deprecated in recent pandas releases in favour of
# DataFrame.map — confirm against the installed version.
df = df.applymap(lambda x: int(x) if pd.notna(x) else x)
# Flag rows whose 'key1' value repeats an earlier row; the first occurrence
# is not flagged.
df.duplicated(subset=['key1'])
# keep=False flags every member of a duplicated group, first one included.
df.duplicated(subset=['key1'], keep=False)
# Keep only the first row for each company_id.
cardo = cardo.drop_duplicates(subset=['company_id'], keep='first')
# Column names split by dtype: object/category columns vs. everything else.
cat = features.select_dtypes(include=['object', 'category']).columns.to_list()
num = features.select_dtypes(exclude=['object', 'category']).columns.to_list()
# Reorder/select columns of data.
# NOTE(review): `con` is not defined in the visible code while `num` above is
# never used — confirm whether `num + cat` was intended.
data = data[con + cat]
# Explicit category order.
# NOTE(review): A, B and C are not defined in the visible code — presumably
# placeholders for the real ID values; replace them before running.
custom_order = [A, B, C]
# Make 'ID' an ordered categorical so that sorting and comparisons follow
# custom_order. The keyword is `categories`; pd.Categorical has no
# `categorical` parameter and rejects it with TypeError.
df['ID'] = pd.Categorical(df['ID'], categories=custom_order, ordered=True)
# One plain tuple per row, in row order, built from the frame's underlying array.
result_tuples = list(map(tuple, df.values))