220930 Day26

유예지·2022년 9월 30일

[0929 복습]

1

>>> import numpy as np
>>> import pandas as pd

>>> arr = np.arange(12).reshape((3,4))
	arr
>>> arr1 = arr.copy()

>>> np.concatenate([arr, arr1], axis = 0)
>>> np.concatenate([arr, arr1], axis = 1)

>>> arr2 = pd.DataFrame(np.arange(12).reshape((3, 4)))
	arr2
>>> arr3 = arr2.copy()

>>> pd.concat([arr2, arr3], ignore_index = True)

2

>>> df = pd.DataFrame({'key1': ['a', 'a', 'b', 'b', 'a'],

                      'key2': ['one', 'two', 'one', 'two', 'one'],

                      'data1': np.random.randn(5),

                      'data2': np.random.randn(5)}) 

    df
    
>>> df.groupby('key1').mean()[['data1']]
>>> df.groupby('key1').apply('mean')[['data1']]
>>> df.groupby('key1').agg('mean')[['data1']]

3

>>> import pydataset
>>> titanic = pydataset.data('titanic')
	titanic
    
>>> titanic.groupby(['class', 'sex']).agg('count')[['age']]
>>> titanic.groupby(['class', 'sex']).apply('count')[['age']]
>>> titanic.groupby(['class', 'sex']).count()[['age']]

4

>>> import seaborn as sns
>>> df1 = titanic.groupby(['class', 'sex']).count()[['age']]

>>> df1.reset_index()
>>> df1 = df1.reset_index()

>>> sns.barplot(data = df1, x = 'class', y = 'age', hue = 'sex')

5

>>> titanic.melt(id_vars = 'class', var_name = 'info')

6

>>> mpg = pydataset.data('mpg')
	mpg.head()
    
>>> df = pd.crosstab(mpg['class'], mpg['manufacturer'])
	df
    
>>> df.reset_index()
>>> df1 = df.reset_index()

>>> df1.melt(id_vars = 'class')

>>> df.columns
>>> df.melt(value_vars = df.columns)

>>> df.stack()

7

>>> df2 = df1.melt(id_vars = 'class')
	df2[:3]
    
>>> bins = [0, 1, 10, 20]
	pd.cut(df2['value'], bins, labels = ['none', 'few', 'many'])
    
>>> pd.cut(df2['value'], bins, right = False)    #'right = False' : 이상 ~ 미만
>>> pd.cut(df2['value'], bins, labels = ['none', 'few', 'many'], right = False)

>>> df2['frequency'] = pd.cut(df2['value'], bins, labels = ['none', 'few', 'many'], right = False)
	df2

8

>>> mpg = pydataset.data('mpg')
	mpg.head()
    
>>> mpg.groupby(['manufacturer', 'year']).agg(['max', 'mean'])    
>>> mpg.groupby(['year', 'manufacturer']).agg(['max', 'mean'])

>>> mpg.groupby(['year', 'manufacturer']).agg({'displ' : 'max', 'cyl' : 'sum'})
>>> mpg.groupby(['year', 'manufacturer']).agg(displ_max = ('displ', 'max'),
                                          	  cyl_sum = ('cyl', 'sum'))
                                              
>>> mpg.groupby(['year', 'manufacturer'])[['displ']].quantile(0.85)

(4) 피벗테이블과 교차일람표

피벗테이블은 데이터를 하나 이상의 키로 수집해서 어떤 키는 로우에, 어떤 키는 칼럼에 나열해서 데이터를 정리한다
pandas 에서 피벗테이블은 groupby 기능을 사용한다
DataFrame 에는 pivot_table 메서드가 있으며, 이는 groupby를 위한 편리한 인터페이스를 제공할 뿐만 아니라 '마진(margins)' 이라고 하는 부분합(소계)을 추가할 수 있는 기능도 제공한다

① 피벗테이블 pivot_table

pivot_table 의 기본연산은 '평균' 이다

>>> tips = pydataset.data('tips')
>>> tips['tip_pct'] = tips['tip'] / tips['total_bill']
	tips.head()
    
>>> tips.groupby(['day','smoker']).agg('mean')

>>> tips.pivot_table(values = 'tip_pct', index = 'time', columns = 'smoker')
>>> tips.pivot_table(values = ['size','tip_pct'], index = 'time', columns = 'smoker')
>>> tips.pivot_table(values = ['size','tip_pct'], index = ['time','day'], columns = 'smoker')

pivot_table 은 margins = True 를 넘기면 '마진(margins)' 이라는 부분합(소계)을 'All' 이라는 로우와 칼럼으로 추가할 수 있다

>>> tips.pivot_table(values = 'tip_pct', index = 'time', columns = 'smoker')

>>> tips.pivot_table(values = 'tip_pct', index = 'time', columns = 'smoker', margins = True)
>>> tips.pivot_table(values = 'tip_pct', index = ['time','day'], columns = 'smoker', margins = True)
#결과물에서 'All' 칼럼은 흡연자와 비흡연자를 구분하지 않은 평균값 이다

평균값 이외에 다른 집계함수를 사용하려면 'aggfunc' 를 사용한다
'count'나 len 함수는 그룹크기의 교차일람표(총 개수나 빈도) 를 반환한다

>>> tips.pivot_table(values = 'tip_pct', index = ['time','day'],
                 	columns = 'smoker', margins = True, aggfunc = len)

>>> tips.pivot_table(values = 'tip_pct', index = ['time','day'],
                 	columns = 'smoker', margins = True, aggfunc = sum)

② 교차일람표 crosstab

그룹의 빈도를 계산하기 위한 피벗테이블의 특수한 경우다

>>> pd.crosstab(index = tips.time, columns = tips.smoker)

>>> pd.crosstab(index = [tips.time, tips.day], columns = tips.smoker, margins = True)

10. 시계열

대부분의 시계열은 고정 빈도(fixed frequency)로 표현된다
데이터가 존재하는 지점이 15초 마다, 5분 마다, 한달에 한번 등과 같은 특정 규칙에 따라 고정 간격을 갖는다
불규칙적인 모습으로도 표현될 수 있다

(1) 날짜, 시간 자료형, 도구 datetime

>>> from datetime import datetime
#'from datetime import datetime' 에서 앞의 datetime 은 모듈이고, 뒤의 datetime 은 (함수가 아니라) 그 모듈 안의 클래스 이다

>>> now = datetime.now()
	now
>>> now.year, now.month, now.day
>>> now.second, now.microsecond

>>> new_year = datetime(2023,1,1)
	new_year - now
    
#datetime.timedelta 는 두 datetime 객체 간의 시간적인 차이를 표현할 수 있다
>>> delta = new_year - now
	delta.days
>>> new_year - delta

* 문자열을 datetime 객체로 변환하기

strftime : datetime 을 문자열로
strptime : 문자열을 datetime 으로

>>> print(now.strftime('%Y - %m - %d'))
	now.strftime('%S - %M - %H')
    
>>> value = now.strftime('%Y - %m - %d - %H - %M - %S')
	value    #str 형태이다

datetime.strptime 사용 방법1 - 띄어쓰기까지 정확하게 맞추어야한다

>>> datetime.strptime(value, '%Y - %m-%d')
>>> datetime.strptime(value, '%Y - %m - %d - %H - %M - %S')

strptime - 방법2 : parse()
매번 포맷 규칙을 지켜서 써야하는 것이 번거롭다 -> dateutil에 포함된 parser.parse 메서드를 사용한다

>>> from dateutil.parser import parse

>>> value1 = now.strftime('%Y-%m-%d-%H-%M-%S')
	value1    
>>> parse(value1)

>>> value2 = now.strftime('%Y/%m/%d/%H/%M/%S')
	value2
>>> parse(value2)

strptime - 방법3 : to_datetime
많은 종류의 날짜 표현을 처리할 수 있다
결과물이 DatetimeIndex 이다

>>> datestrs = ['2011-07-06 12:00:00', '2011-08-06 00:00:00']
>>> pd.to_datetime(datestrs)

(2) 시계열 기초 : Time Series 만들기

pandas 에서 가장 기본적인 시계열 객체의 종류는 파이썬 문자열이나 datetime 객체로 표현되는, 타임스탬프로 색인된 Series 이다

>>> pd.date_range(start = '1/1/2000', periods = 10)
>>> pd.date_range(start = '1/1/2000', end = '2/2/2000')
>>> pd.date_range(start = '1/1/2000', end = '2/2/2000', freq = '5H')

>>> dates = pd.date_range(start = '1/2/2011', end = '1/12/2011', freq = '2D')
	dates
    
>>> ts = pd.Series(np.random.randint(6), index = dates)
	ts    
    
#dates를 index로 했기 때문에 인덱싱, 슬라이싱이 가능하다
>>> ts[2]
>>> ts[::2]

#연산이 가능하다
>>> ts + ts[::2]

① 색인, 선택, 부분 선택

시계열은 라벨에 기반해서 데이터를 선택하고, 인덱싱할 때 pandas.Series와 동일하게 동작한다

>>> longer_ts = pd.Series(np.random.randn(1000), index = pd.date_range('1/1/2000', periods = 1000))
	longer_ts

#검색
>>> longer_ts['2001']
>>> longer_ts['2001-01']

#datetime 객체로 데이터를 잘라내는 작업은 일반적인 Series와 동일한 방식이다
>>> longer_ts['2002-09': ]
>>> longer_ts['2002/09/15': ]     #'-' 대신 '/' 를 써도 검색이 된다

이렇게 슬라이싱한 결과는 NumPy 배열을 슬라이싱할 때처럼 원본 시계열에 대한 뷰(view)이다. 즉 데이터 복사가 발생하지 않고 슬라이스에 대한 변경이 원본 데이터에도 반영된다
두 날짜 사이의 데이터를 잘라내는 같은 작업은 truncate 메서드를 사용해서도 할 수 있다

>>> longer_ts.truncate(before = '2002/03')
>>> longer_ts.truncate(before = '2002/08/15', after = '2002/09/25')

#DataFrame 에도 동일하게 적용되며, 로우에 인덱싱 된다
>>> df = pd.DataFrame(longer_ts)
	df
    
>>> df[:3]
>>> df['2002-09']
>>> df.loc['2002-09']

② 중복된 색인을 갖는 시계열

여러 데이터가 특정 타임스탬프에 몰려 있는 것을 발견할 수 있다
is_unique 속성을 통해 확인해볼 수 있다

>>> dates = pd.DatetimeIndex(['1/1/2000', '1/2/2000', '1/2/2000',
                          	'1/2/2000', '1/3/2000'])
	dates
    
>>> ts = pd.Series(np.arange(5), index = dates)
	ts

>>> ts['1/2/2000']

#groupby(level = 0)
>>> ts.groupby(level = 0).mean()

>>> df = pd.DataFrame(ts)
	df.reset_index()
    
>>> df = df.reset_index()
	df.groupby('index').mean()

(3) 날짜 범위, 빈도, 이동

pandas 에서 일반적인 시계열은 불규칙적인 것으로 간주된다. 즉 고정된 빈도를 갖지 않는다
하지만 일별, 월별, 혹은 매 15분 같은 상대적인 고정 빈도에서의 작업이 요구되는 경우가 있다

resample : 고정된 일 빈도로 변환

>>> list(ts.resample('D'))    #'D' : 일 빈도 ('M': 월별, 'W': 주별)

날짜 범위 생성하기 : pd.date_range(start = , end = , periods = , freq = )
특정 빈도에 따라 지정한 길이만큼의 DatetimeIndex 를 생성한다

* 데이터 시프트 shift

시프트는 데이터를 시간 축에서 앞이나 뒤로 이동하는 것을 의미한다
Series와 DataFrame은 인덱스는 변경하지 않고 데이터를 앞이나 뒤로 옮기는 shift 메서드를 갖고 있다

shift는 일반적으로 한 시계열 내에서, 또는 DataFrame의 칼럼으로 표현할 수 있는 여러 시계열에서의 퍼센트 변화를 계산할 때 흔히 사용한다

>>> ts = pd.Series(np.random.randn(4),
	               index=pd.date_range('1/1/2000', periods=4, freq='M')) 
	ts
    
>>> ts.shift(1)    #데이터가 한칸씩 내려감
>>> ts.shift(-1)    #데이터가 한칸씩 올라감

>>> ts / ts.shift(1)     #한 시점 전(여기서는 freq='M' 이므로 한 달 전) 값에 대한 현재 값의 비율
>>> ts / ts.shift(1) - 1    #한 시점 전(한 달 전)에 대한 증가율
>>> ts / ts.shift(2) - 1    #두 시점 전(두 달 전)에 대한 증가율

(4) 시간대 다루기

UTC : 국제 표준시
tz_localize : 지역화 시간으로의 변환
시계열이 특정 시간대로 지역화되고 나면 tz_convert 를 이용해서 다른 시간대로 변환이 가능하다

>>> ts = pd.date_range('3/9/2012 9:30', periods = 10)
	ts
    
>>> ts.tz_localize('UTC')

>>> ts = ts.tz_localize('UTC')
	ts.tz_convert("America/New_York")

(5) 리샘플링과 빈도 변환

리샘플링(resample) 은 시계열의 빈도를 변환하는 과정을 말한다
resample 은 groupby 와 비슷하다 -> resample 을 호출해서 데이터를 그룹짓고 요약함수를 적용하는 식이다

>>> ts = pd.Series(np.random.randn(100),
               	   index = pd.date_range('2000-01-01', periods = 100, freq = 'D'))
	ts
    
>>> ts.resample('M').mean()    #'M' : '월' 별로
>>> ts.resample('M', kind = 'period').mean()

* 다운샘플링

각 간격의 양끝 중에서 어느 쪽을 닫아둘 것인가
집계하려는 구간의 라벨을 간격의 시작으로 할 것인가, 끝으로 할 것인가

>>> rng = pd.date_range('2000-01-01', periods=12, freq='T')
	rng    #freq='T' : 분 단위
    
>>> ts = pd.Series(np.arange(12), index = rng)
	ts

#5분 단위로 묶는다
>>> ts.resample('5min')
	list(ts.resample('5min'))
    
>>> ts.resample('5min').sum()
>>> list(ts.resample('5min', closed = 'right'))
>>> ts.resample('5min', closed = 'right').sum()

인자로 넘긴 빈도는 5분 단위로 증가하는 그룹의 경계를 정의한다
기본적으로 각 구간은 왼쪽 끝(시작값)을 포함하고 오른쪽 끝(끝값)은 포함하지 않으므로(closed = 'left'), 00:00 그룹은 00:00 이상 00:05 미만의 값을 집계한다
closed = 'right' : 구간의 오른쪽 끝(끝값)을 포함하고 왼쪽 끝(시작값)은 제외한다 (시작값 초과 ~ 끝값 이하)

OHLC 샘플링
open 시가, high 고가, low 저가, close 종가
resample 결과에 ohlc() 메서드를 호출하면 한번에 이 4가지 값을 칼럼으로 담고 있는 DataFrame 을 얻을 수 있다 (예전 pandas 의 how = 'ohlc' 인자는 더 이상 지원되지 않는다)

>>> list(ts.resample('5min'))
>>> ts.resample('5min').ohlc()

11. 고급 Pandas

(1) Categorical 데이터

pandas 의 categorical형(범주형)
pandas 는 배열 내에서 유일한 값을 추출하는 unique 메서드와 특정 값이 얼마나 많이 존재하는지 확인하는 value_counts 메서드를 제공한다

>>> import numpy as np
>>> import pandas as pd

>>> values = pd.Series(['apple', 'orange', 'apple', 'apple'] * 2)
	values

#유일한 값 추출
>>> values.unique()

#특정 값의 개수 확인
>>> values.value_counts()

>>> values = values.astype('category')
	values

>>> values = pd.Series([0,1,0,0]*2)
	values
    
>>> dim = pd.Series(['apple', 'orange'])
	dim    
    
#take 메서드 : Series 내에 저장된 원래 문자열을 구할 수 있다
>>> dim.take(values)

이렇게 값을 정수로 표현하는 방식을 '범주형' 또는 '사전형(dictionary-encoded) 표기법' 이라고 한다
별개의 값을 담고 있는 배열(위 예제의 dim)은 '범주(categories)', '사전', 또는 '레벨' 이라고 부른다
범주를 가리키는 정수값(위 예제의 values)은 '범주 코드' 또는 그냥 '코드' 라고 부른다
범주 코드를 변경하지 않은 채로 범주 자체를 변형하는 것이 가능하다

(2) Pandas 의 Categorical

pandas 에는 정수 기반의 범주형 데이터를 표현(또는 인코딩) 할 수 있는 Categorical형 이라고 하는 특수한 데이터형이 존재한다

>>> fruits = ['apple', 'orange', 'apple', 'apple'] * 2
>>>	N = len(fruits)
    
>>> df = pd.DataFrame({'fruit': fruits,
                   'basket_id': np.arange(N),
                   'count': np.random.randint(3, 15, size=N),
                   'weight': np.random.uniform(0, 4, size=N)})
	df

>>> df.info()
>>> df['fruit'] = df['fruit'].astype('category')
	df.info()
    
>>> df['fruit'].values
>>> c = df['fruit'].values

#Categorical 객체 'c'는 categories와 codes 속성을 갖는다
>>> c.codes
>>> c.categories

pd.Categorical 형을 직접 생성할 수도 있다
기존에 정의된 범주와 범주 코드가 있다면 from_codes 함수를 이용해서 범주형 데이터를 생성하는 것도 가능하다
순서가 없는 범주형 인스턴스는 'as_ordered' 메서드를 이용해 순서를 가지도록 할 수 있다

>>> codes = [0,1,2,0,0,1]
	categories = ['A', 'B', 'C']
    
>>> cats = pd.Categorical.from_codes(codes, categories)
	cats
    
#ordered = True : categories 의 순서 지정    
>>> cats_o = pd.Categorical.from_codes(codes, categories, ordered = True)
	cats_o
    
>>> cats.as_ordered()

(3) Categorical 연산

임의의 숫자 데이터를 pd.qcut() 함수로 구분해보자. 그러면 pd.Categorical 객체를 반환한다

#1000개의 randn
>>> draws = np.random.randn(1000)
	draws

#4분위로 나누자
>>> pd.qcut(draws, 4)

#4분위에 이름 지정
>>> pd.qcut(draws, 4, labels = ['A','B','C','D'])

#groupby 를 이용해서 요약 통계를 내보자
>>> bins = pd.qcut(draws, 4, labels = ['A','B','C','D'])
>>> bins = pd.Series(bins, name = 'quantile')

>>> results = pd.Series(draws).groupby(bins).agg(['count', 'min', 'max']).reset_index()
	results
    
>>> results['quantile']
#quantile 칼럼은 bins 의 순서를 포함한 원래 범주 정보를 유지하고 있다

유예지

이전 포스트

220929 Day25

다음 포스트