본 노트는 Aidemy의 강의, 라이브러리 pandas기초의 노트입니다.
[!abstract]+ Curriculum
1. pandas 란
2. pandas 조작방법
3. 복수의 DataFrame 이용
첨삭문제
테이블 데이터를 다루는 데 특화된 라이브러리.
#pd/read #pd/save
#pd/new
#pd/type
df["price"] = df["price"].astype(float) # 非破壊なメソッドなのでイコールを用いて上書き
#pd/access
series[1]
print(series[0:2])
series[['orange','peach']]
series["orange":]
series['orange']
print(df.head())
print(df.tail(10))
print(df.loc[1:2,"product":])
>>>出力結果
product price quantity
1 orange 1000 2
2 orange 800 1
#pd/access/bull
import pandas as pd
food = {"snack": 200, "ra-men": 1000, "rice": 800, "coffee": 100, "green-tea":250, "wine":900}
series = pd.Series(food)
conditions = [True, True, False, False, False, False]
print(series[conditions])
>>>出力結果
snack 200
ra-men 1000
dtype: int64
import pandas as pd
data = {"store": ["shibuya", "shinjuku", "yokohama", "shibuya", "shinjuku", "shibuya"],
"product": ["banana", "orange", "orange", "grape", "banana", "peach"],
"price": [200, 1000, 800, 100, 250, 900],
"quantity": [1, 2, 1, 2, 3, 2]}
df = pd.DataFrame(data)
print(df["price"]>500)
print()
print(df[df["price"]>500])
>>>出力結果
0 False
1 True
2 True
3 False
4 False
5 True
Name: price, dtype: bool
store product price quantity
1 shinjuku orange 1000 2
2 yokohama orange 800 1
5 shibuya peach 900 2
#pd/sort
print(series.sort_index())
series.sort_values(ascending=False)
print(df.sort_values(by=["quantity", "price"], ascending=True))
#pd/append
grape = {"grape": 3} # 既に作成したseriesにgrapeを追加
series = series.append(pd.Series(grape)) # 非破壊なメソッドなのでイコールを用いて上書き
import pandas as pd
data = {"store": ["shibuya", "shinjuku", "yokohama", "shibuya", "shinjuku"],
"product": ["banana", "orange", "orange", "grape", "banana"],
"price": [200, 1000, 800, 100, 250],
"quantity": [1, 2, 1, 2, 3]}
df = pd.DataFrame(data)
print(df) # 元のDataFrameの出力
print() # 改行
series = pd.Series(
{"store": "shibuya", "product": "peach", "price": 900, "quantity": 2, "month": "August"})
# Seriesの作成(DataFrameにはmonthカラムは存在しない)
df = df.append(series, ignore_index=True)
print(df) # 追加後のDataFrameの出力
>>>出力結果
store product price quantity
0 shibuya banana 200 1
1 shinjuku orange 1000 2
2 yokohama orange 800 1
3 shibuya grape 100 2
4 shinjuku banana 250 3
store product price quantity month
0 shibuya banana 200 1 NaN
1 shinjuku orange 1000 2 NaN
2 yokohama orange 800 1 NaN
3 shibuya grape 100 2 NaN
4 shinjuku banana 250 3 NaN
5 shibuya peach 900 2 August
import pandas as pd
data = {"store": ["shibuya", "shinjuku", "yokohama", "shibuya", "shinjuku"],
"product": ["banana", "orange", "orange", "grape", "banana"],
"price": [200, 1000, 800, 100, 250],
"quantity": [1, 2, 1, 2, 3]}
df = pd.DataFrame(data)
print(df)
print() # 改行
df["month"] = ["August", "September", "November",
"January", "October"] # カラムmonthを追加
print(df)
>>>出力結果
store product price quantity
0 shibuya banana 200 1
1 shinjuku orange 1000 2
2 yokohama orange 800 1
3 shibuya grape 100 2
4 shinjuku banana 250 3
store product price quantity month
0 shibuya banana 200 1 August
1 shinjuku orange 1000 2 September
2 yokohama orange 800 1 November
3 shibuya grape 100 2 January
4 shinjuku banana 250 3 October
#pd/del
# バナナのインデックスを削除
series = series.drop("banana") # 非破壊なメソッドなのでイコールを用いて上書き
# インデックス1,2を削除
df = df.drop(range(0, 2)) # 非破壊なメソッドなのでイコールを用いて上書き
df = df.drop("store", axis=1) # カラムstoreの削除
#pd/cal
import pandas as pd
import numpy as np
data = {"apple": [6, 1, 4, 4, 8],
"orange": [1, 4, 5, 6, 3],
"banana": [6, 10, 9, 10, 5],
"strawberry": [3, 4, 9, 2, 4],
"kiwifruit": [10, 10, 1, 5, 8]
}
df = pd.DataFrame(data)
print(df)
print() # 改行
print(df['apple']+df["strawberry"]) # カラム同士の足し算
>>>出力結果
apple orange banana strawberry kiwifruit
0 6 1 6 3 10
1 1 4 10 4 10
2 4 5 9 9 1
3 4 6 10 2 5
4 8 3 5 4 8
0 9
1 5
2 13
3 6
4 12
dtype: int64
import pandas as pd
data = {"store": ["shibuya", "shinjuku", "yokohama", "shibuya", "shinjuku", "shibuya"],
"product": ["banana", "orange", "orange", "grape", "banana", "peach"],
"price": [200, 1000, 800, 100, 250, 900],
"quantity": [1, 2, 1, 2, 3, 2]}
df = pd.DataFrame(data)
print(df+2) # DataFrameの要素全てに2を足す
>>>出力結果
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
〜中略〜
TypeError: can only concatenate str (not "int") to str
#pd/np #np/pd
import pandas as pd
import numpy as np
data = {"apple": [6, 1, 4, 4, 8],
"orange": [1, 4, 5, 6, 3],
"banana": [6, 10, 9, 10, 5],
"strawberry": [3, 4, 9, 2, 4],
"kiwifruit": [10, 10, 1, 5, 8]
}
df = pd.DataFrame(data)
print(df)
print()
print(np.log(df)) # Numpyの関数を用いて要素の値を対数変換
>>>出力結果
apple orange banana strawberry kiwifruit
0 6 1 6 3 10
1 1 4 10 4 10
2 4 5 9 9 1
3 4 6 10 2 5
4 8 3 5 4 8
apple orange banana strawberry kiwifruit
0 1.791759 0.000000 1.791759 1.098612 2.302585
1 0.000000 1.386294 2.302585 1.386294 2.302585
2 1.386294 1.609438 2.197225 2.197225 0.000000
3 1.386294 1.791759 2.302585 0.693147 1.609438
4 2.079442 1.098612 1.609438 1.386294 2.079442
#pd/stats
print(df.describe()) # 要約統計量
#pd/grouping
.groupby()
로 GroupBy 오브젝트가 반환되지만, print 로 표시는 불가..min()
, .max()
은 문자열에도 적용 : 문자코드의 최소, 최댓값import pandas as pd
data = {"store": ["shibuya", "shinjuku", "yokohama", "shibuya", "shinjuku", "shibuya"],
"product": ["banana", "orange", "orange", "grape", "banana", "peach"],
"price": [200, 1000, 800, 100, 250, 900],
"quantity": [1, 2, 1, 2, 3, 2]}
df = pd.DataFrame(data)
print(df)
print() # 改行
print(df.groupby('store').sum()) # 合計
print() # 改行
print(df.groupby('store').mean()) # 平均
print() # 改行
print(df.groupby('store').var()) # 分散
print() # 改行
print(df.groupby('store').min()) # 最小値
print() # 改行
print(df.groupby('store').max()) # 最大値
>>>出力結果
store product price quantity
0 shibuya banana 200 1
1 shinjuku orange 1000 2
2 yokohama orange 800 1
3 shibuya grape 100 2
4 shinjuku banana 250 3
5 shibuya peach 900 2
price quantity
store
shibuya 1200 5
shinjuku 1250 5
yokohama 800 1
price quantity
store
shibuya 400.0 1.666667
shinjuku 625.0 2.500000
yokohama 800.0 1.000000
price quantity
store
shibuya 190000.0 0.333333
shinjuku 281250.0 0.500000
yokohama NaN NaN
product price quantity
store
shibuya banana 100 1
shinjuku banana 250 2
yokohama orange 800 1
product price quantity
store
shibuya peach 900 2
shinjuku orange 1000 3
yokohama orange 800 1
#pd/concat
import pandas as pd
data1 = {"store": ["shibuya", "shinjuku", "yokohama", "shibuya"], # data2よりインデックスの数が1つ多い
"product": ["banana", "orange", "orange", "grape"],
"price": [200, 1000, 800, 1000],
"quantity": [1, 2, 1, 1]}
data2 = {"store": ["shibuya", "shinjuku", "shibuya"], # data1にはないカラム
"product": ["grape", "banana", "peach"],
"価格": [100, 250, 900],
"quantity": [2, 3, 2]}
df1 = pd.DataFrame(data1)
df2 = pd.DataFrame(data2)
print(pd.concat([df1, df2], axis=0))
print() # 改行
print(pd.concat([df1, df2], axis=1))
print() # 改行
print(pd.concat([df1, df2], axis=0).dtypes)
>>>出力結果
store product price quantity 価格
0 shibuya banana 200.0 1 NaN
1 shinjuku orange 1000.0 2 NaN
2 yokohama orange 800.0 1 NaN
3 shibuya grape 1000.0 1 NaN
0 shibuya grape NaN 2 100.0
1 shinjuku banana NaN 3 250.0
2 shibuya peach NaN 2 900.0
store product price quantity store product 価格 quantity
0 shibuya banana 200 1 shibuya grape 100.0 2.0
1 shinjuku orange 1000 2 shinjuku banana 250.0 3.0
2 yokohama orange 800 1 shibuya peach 900.0 2.0
3 shibuya grape 1000 1 NaN NaN NaN NaN
store object
product object
price float64
quantity int64
価格 float64
dtype: object
#pd/merge
두 데이터 프레임의 key 집합을 집합 A, B 라고 가정하면 쉬움.
import pandas as pd
pd.merge(左のDataFrame, 右のDataFrame, on = "keyにするカラム名", how="inner")
import pandas as pd
pd.merge(左のDataFrame, 右のDataFrame, on = "keyにするカラム名", how="left")
import pandas as pd
pd.merge(左のDataFrame, 右のDataFrame, on = "keyにするカラム名", how="right")
import pandas as pd
pd.merge(左のDataFrame, 右のDataFrame, on = "keyにするカラム名", how="outer")
import pandas as pd
store_data = {
"store": ["shibuya", "shinjuku", "yokohama", "meguro", "ikebukuro"],
"ID": [1, 2, 3, 4, 5]
}
store_df = pd.DataFrame(store_data) # store_dfを作成
data = {"ID": [1, 2, 3, 3, 2, 1],
"product": ["banana", "orange", "orange", "grape", "banana", "peach"],
"price": [200, 1000, 800, 100, 250, 900],
"quantity": [1, 2, 1, 2, 3, 2]}
df = pd.DataFrame(data) # dfを作成
print(df) # dfを出力
print()
print(store_df) # store_dfを出力
print()
# 問題1
# dfのインデックスが0から4までの要素、カラム名を出力してください。
df_1 = df.head()
print(df_1)
print()
# 問題2
# df とstore_dfをkeyをIDとして外部結合してください。
df_2 = pd.merge(df, store_df, on='ID', how = 'outer')
print(df_2)
print()
# 問題3
# df とstore_dfをkeyをIDとして内部結合してください。
df_3 = pd.merge(df, store_df, on='ID', how = 'inner')
print(df_3)
print()
# 問題4
# 問題3の回答にて作成したdf_3とgroupbyメソッドを用いてstore毎のID、price、quantityの平均値を出力してください。
df_4 = df_3.groupby('store').mean()
print(df_4)
print()
# 問題5
# 問題3の回答にて作成したdf_3とdiscribeメソッドを用いてID、price、quantityの要約統計量を出力してください。
df_5 = df_3.describe()
print(df_5)