import pydataset
import pandas as pd

# Load the 'Train' dataset through pydataset and show it.
train = pydataset.data('Train')
print(train)

# Column dtypes and memory usage before the conversion.
train.info()

# Preview 'choice' as a categorical Series; this does not modify `train`.
print(train['choice'].astype('category'))

# Store the categorical version back, then compare dtypes/memory again.
train['choice'] = train['choice'].astype('category')
train.info()
import numpy as np
from datetime import datetime

# 100 random standard-normal samples on a daily index starting 2018-01-01.
ts = pd.Series(np.random.randn(100),
               index=pd.date_range('1/1/2018', periods=100))
print(ts)

# Exercise: regroup ts into two-week bins and take the mean of each bin.
print(ts.resample('2W').mean())
# Load the 'tips' dataset through pydataset and preview the first rows.
tips = pydataset.data('tips')
print(tips.head())

# Mean of every numeric column for each (day, smoker) group.
# numeric_only=True is needed on pandas >= 2.0, where averaging any leftover
# non-numeric column raises TypeError instead of silently dropping it.
print(tips.groupby(['day', 'smoker']).mean(numeric_only=True))

# Same summary via pivot_table. The value columns are listed explicitly so
# that only numeric columns are handed to the 'mean' aggregation.
numeric_cols = tips.select_dtypes(include='number').columns.tolist()
print(tips.pivot_table(values=numeric_cols, index=['day', 'smoker'],
                       aggfunc='mean'))
# Why convert a DataFrame column to categorical?
# -> It uses far less memory, which can noticeably improve overall performance.
# pydataset's 'titanic' table, shown for reference only; `titanic` is rebound
# to seaborn's dataset below before any analysis is done.
titanic = pydataset.data('titanic')
print(titanic.head())

# Task: mean age per class and sex.
import seaborn as sns

titanic = sns.load_dataset('titanic')
print(titanic[:3])

# pivot_table's default aggfunc is 'mean', so each cell is the mean age.
# observed=False spells out the long-standing default for categorical
# groupers (presumably 'class' is categorical here -- TODO confirm), which
# avoids the FutureWarning newer pandas emits when it is left implicit.
print(titanic.pivot_table(values='age', index=['class', 'sex'],
                          observed=False))

# Row counts for every class/sex combination.
print(pd.crosstab(index=titanic['class'], columns=titanic['sex']))
# Load the 'mpg' dataset through pydataset and show it.
mpg = pydataset.data("mpg")
print(mpg)

# Pivot table of hwy and cty with manufacturer as the row index and year as
# the columns; cells hold the mean (pivot_table's default aggfunc).
print(mpg.pivot_table(values=['hwy', 'cty'], index='manufacturer',
                      columns='year'))
df = pd.DataFrame({
    'info': ['홍길동/1/46', '전우치/1/27', '김철수/1/33', '이민수/1/51',
             '홍길순/2/46', '전영희/2/22', '김철순/2/39', '이유리/2/31'],
    'visit_year': [2020, 2020, 2020, 2021, 2021, 2021, 2022, 2022],
    'visit_month': [1, 3, 4, 4, 8, 11, 7, 10],
    'visit_day': [29, 22, 16, 6, 2, 11, 26, 7],
})
print(df)

# Task: build name, gender and age columns from the 'name/gender/age' text
# stored in the info column.
print(df['info'].str.split('/'))          # each row becomes a 3-item list
print(df['info'].str.split('/').str[0])   # names
print(df['info'].str.split('/').str[1])   # gender codes

# Split once into separate columns (0 = name, 1 = gender, 2 = age) and reuse.
data_div = df['info'].str.split('/', expand=True)
print(data_div)

df['name'] = data_div[0]
df['gender'] = data_div[1]
df['age'] = data_div[2]
print(df)

# Alternative 1: index the split result directly, one field at a time.
# Run on a copy: overwriting df['info'] with the bare name (as this variant
# does) would leave nothing for the next alternative to parse.
alt1 = df[['info']].copy()
alt1['sex'] = alt1['info'].str.split('/').str[1]
alt1['age'] = alt1['info'].str.split('/').str[2]
alt1['info'] = alt1['info'].str.split('/').str[0]
print(alt1)

# Alternative 2: explicit row loop. Fields are located by splitting on '/'
# instead of fixed character offsets, so names of any length are handled.
# df has the default 0..n-1 index, so the enumerate counter is the row label.
for i, info in enumerate(df['info']):
    _, gender, age = info.split('/')
    df.loc[i, 'gender'] = gender
    df.loc[i, 'age'] = age
print(df)

# Task: merge the visit year/month/day columns into a single 'visit' column.
# Method 1: build the 'Y/M/D' strings row by row.
visit = [f'{a}/{b}/{c}'
         for a, b, c in zip(df['visit_year'], df['visit_month'], df['visit_day'])]
df['visit'] = visit
print(df)

# Method 2: vectorised string concatenation (same result as method 1).
df['visit'] = (df['visit_year'].astype(str) + '/'
               + df['visit_month'].astype(str) + '/'
               + df['visit_day'].astype(str))
print(df)

# Convert the visit strings to datetime values.
df['visit'] = pd.to_datetime(df['visit'])

# Keep only the derived columns. The source columns are dropped by name
# rather than by position, so the result does not depend on column order.
df = df.drop(columns=['info', 'visit_year', 'visit_month', 'visit_day'])
print(df)

df.info()

# Replace the gender codes: '1' -> '남' (male), anything else -> '여' (female).
df['gender'] = np.where(df['gender'] == '1', '남', '여')
print(df)

# Preview gender as a categorical Series; df itself is left unchanged.
print(df['gender'].astype('category'))
# MovieLens data files used below: https://github.com/wesm/pydata-book/tree/3rd-edition/datasets/movielens
import pandas as pd

# Make display smaller
pd.options.display.max_rows = 10

# The three .dat files have no header row and separate fields with '::'.
# A multi-character separator is only handled by the Python parser engine,
# so it is requested explicitly instead of relying on pandas' ParserWarning
# fallback from the C engine.
unames = ['user_id', 'gender', 'age', 'occupation', 'zip']
users = pd.read_table('https://raw.githubusercontent.com/wesm/pydata-book/3rd-edition/datasets/movielens/users.dat',
                      sep='::', header=None, names=unames, engine='python')

rnames = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_table('https://raw.githubusercontent.com/wesm/pydata-book/3rd-edition/datasets/movielens/ratings.dat',
                        sep='::', header=None, names=rnames, engine='python')

mnames = ['movie_id', 'title', 'genres']
movies = pd.read_table('https://raw.githubusercontent.com/wesm/pydata-book/3rd-edition/datasets/movielens/movies.dat',
                       sep='::', header=None, names=mnames, engine='python')

# Peek at the first five rows of each table.
print(users[:5])
print(ratings[:5])
print(movies[:5])
# Combine the three tables. pd.merge joins on the shared column names:
# user_id for ratings/users, then movie_id for the result and movies.
data = pd.merge(pd.merge(ratings, users), movies)
print(data)

# Mean rating of every title, one column per gender.
mean_ratings = data.pivot_table(values='rating', index='title',
                                columns='gender', aggfunc='mean')
print(mean_ratings)

# Lowest rating value present in the data.
print(ratings['rating'].min())

# Keep only titles that received at least 250 ratings.
ratings_by_title = data.groupby('title').size()
print(ratings_by_title[:10])

active_titles = ratings_by_title.index[ratings_by_title >= 250]
print(active_titles)

# Select the rows of mean_ratings whose title label is in active_titles.
mean_ratings = mean_ratings.loc[active_titles]
print(mean_ratings)

# Ten titles with the highest mean rating from female ('F') users.
top_female = mean_ratings.sort_values(by='F', ascending=False)[:10]
print(top_female)
print(top_female.index)

# diff = male mean - female mean: negative values mean women rated the title
# higher, positive values mean men did.
mean_ratings['diff'] = mean_ratings['M'] - mean_ratings['F']
print(mean_ratings[:10])

# Titles preferred by women (most negative diff first).
print(mean_ratings.sort_values(by='diff')[:10])

# Titles preferred by men (most positive diff first).
print(mean_ratings.sort_values(by='diff', ascending=False)[:10])

# Titles that divide opinion regardless of gender, measured by the standard
# deviation of their ratings.
s = data.groupby('title')['rating'].std()
print(s)

s = s.loc[active_titles]
rating_spread = s.sort_values(ascending=False)
print(rating_spread)
print(rating_spread[:10])