import pandas as pd
# Relative paths of the Titanic train / test CSV files
titanic_train, titanic_test = './titanic/train.csv', './titanic/test.csv'
# Load both files (train first, then test); each read_csv call yields its own DataFrame
df_train, df_test = (
    pd.read_csv(csv_path) for csv_path in (titanic_train, titanic_test)
)
# The Jeju CSV is read with encoding='euc-kr' (passed to read_csv below).
DataUrl = 'https://raw.githubusercontent.com/Datamanim/pandas/main/Jeju.csv'
df = pd.read_csv(DataUrl, encoding='euc-kr')
# type(): check what kind of object each loaded dataset is
print(type(df_train), type(df_test))
<class 'pandas.core.frame.DataFrame'> <class 'pandas.core.frame.DataFrame'>
# .head(): print the first rows of each frame (5 rows when no count is given)
print(df_train.head())
print(df_test.head())
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S
PassengerId Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 892 3 Kelly, Mr. James male 34.5 0 0 330911 7.8292 NaN Q
1 893 3 Wilkes, Mrs. James (Ellen Needs) female 47.0 1 0 363272 7.0000 NaN S
2 894 2 Myles, Mr. Thomas Francis male 62.0 0 0 240276 9.6875 NaN Q
3 895 3 Wirz, Mr. Albert male 27.0 0 0 315154 8.6625 NaN S
4 896 3 Hirvonen, Mrs. Alexander (Helga E Lindqvist) female 22.0 1 1 3101298 12.2875 NaN S
# .tail(): print the last rows of each frame (5 by default; tail(2) limits it to 2)
print(df_train.tail())
print(df_test.tail(2))
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
886 887 0 2 Montvila, Rev. Juozas male 27.0 0 0 211536 13.00 NaN S
887 888 1 1 Graham, Miss. Margaret Edith female 19.0 0 0 112053 30.00 B42 S
888 889 0 3 Johnston, Miss. Catherine Helen "Carrie" female NaN 1 2 W./C. 6607 23.45 NaN S
889 890 1 1 Behr, Mr. Karl Howell male 26.0 0 0 111369 30.00 C148 C
890 891 0 3 Dooley, Mr. Patrick male 32.0 0 0 370376 7.75 NaN Q
PassengerId Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
416 1308 3 Ware, Mr. Frederick male NaN 0 0 359309 8.0500 NaN S
417 1309 3 Peter, Master. Michael J male NaN 1 1 2668 22.3583 NaN C
# .shape: (row count, column count) of a DataFrame; [0] is rows, [1] is columns
print(f'๐ณ๏ธ train(ํ,์ด): {df_train.shape}')
print(f'โ๏ธ test ํ: {df_test.shape[0]}, test ์ด: {df_test.shape[1]}')
๐ณ๏ธ train(ํ,์ด): (891, 12)
โ๏ธ test ํ: 418, test ์ด: 11
# Show the column labels of each dataset
train_cols = df_train.columns
print(f'๐ณ๏ธ Train columns:\n{train_cols}')
test_cols = df_test.columns
print(f'โ๏ธ Test columns:\n{test_cols}')
๐ณ๏ธ Train columns:
Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
dtype='object')
โ๏ธ Test columns:
Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
'Ticket', 'Fare', 'Cabin', 'Embarked'],
dtype='object')
# Show the row index of each dataset
train_index = df_train.index
print(f'๐ณ๏ธ trian: {train_index}')
test_index = df_test.index
print(f'โ๏ธ test: {test_index}')
๐ณ๏ธ trian: RangeIndex(start=0, stop=891, step=1)
โ๏ธ test: RangeIndex(start=0, stop=418, step=1)
# info() writes its summary itself; its return value is printed afterwards,
# which is why a "None" line follows each summary in the output.
print('๐ณ๏ธ trian:')
train_info = df_train.info()
print(train_info)
print('โ๏ธ test:')
test_info = df_test.info()
print(test_info)
๐ณ๏ธ trian:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 PassengerId 891 non-null int64
1 Survived 891 non-null int64
2 Pclass 891 non-null int64
3 Name 891 non-null object
4 Sex 891 non-null object
5 Age 714 non-null float64
6 SibSp 891 non-null int64
7 Parch 891 non-null int64
8 Ticket 891 non-null object
9 Fare 891 non-null float64
10 Cabin 204 non-null object
11 Embarked 889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
None
โ๏ธ test:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 PassengerId 418 non-null int64
1 Pclass 418 non-null int64
2 Name 418 non-null object
3 Sex 418 non-null object
4 Age 332 non-null float64
5 SibSp 418 non-null int64
6 Parch 418 non-null int64
7 Ticket 418 non-null object
8 Fare 417 non-null float64
9 Cabin 91 non-null object
10 Embarked 418 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 36.1+ KB
None
# Column position 3 corresponds to 'Name' in df_train.columns:
# Index(['PassengerId', 'Survived', 'Pclass', 'Name', ...])
fourth_col_dtype = df_train.iloc[:, 3].dtype
print(f'๐ณ๏ธ train 4๋ฒ์จฐ ํ์ data type: {fourth_col_dtype}')
๐ณ๏ธ train 4๋ฒ์จฐ ํ์ data type: object
- df_train.iloc[:,3]: Name 열을 선택
- .iloc: 인덱스로 행/열을 선택하는 함수
- [:,3]: `:` → 모든 행을 의미, `3` → 열 인덱스 3번 (네 번째 열)을 선택
- dtype('O'/'object') → 문자열
- dtype('int64') → 정수
- dtype('float64') → 실수

| 항목 | iloc(= integer location) | loc |
|---|---|---|
| 의미 | index location (정수 기반 인덱싱) | label location (레이블 기반 인덱싱) |
| 기준 | 숫자 위치 (0, 1, 2...) | 실제 인덱스 이름, 열 이름 |
| 사용 예시 | df.iloc[0, 1]<br>.iloc[row_idx, col_idx] | df.loc[0, 'Name'] |
| 유용한 상황 | 위치로 접근할 때 | 이름 기준으로 접근할 때 |
import pandas as pd

# Small demo frame whose row labels (100, 101, 102) differ from the row
# positions (0, 1, 2), so position-based and label-based lookups are easy to tell apart.
data = dict(
    Name=['Alice', 'Bob', 'Charlie'],
    Age=[25, 30, 35],
)
df = pd.DataFrame(data, index=[100, 101, 102])
print(df)
출력:
Name Age
100 Alice 25
101 Bob 30
102 Charlie 35
df.iloc[0, 0] # row position 0, column position 0 -> 'Alice'
df.iloc[2, 1] # row position 2, column position 1 -> 35
df.iloc[:, 1] # all rows, column position 1 (the Age column) -> [25, 30, 35]
df.loc[100, 'Name'] # row labelled 100, column 'Name' -> 'Alice'
df.loc[102, 'Age'] # row labelled 102, column 'Age' -> 35
df.loc[:, 'Name'] # all rows, column 'Name' -> ['Alice', 'Bob', 'Charlie']
# .select_dtypes(exclude=object / include=object).columns
# exclude=object keeps the non-object (numeric) columns; include=object keeps the object columns.
print('๐ณ๏ธ train์ ์์นํ ๋ณ์๋ฅผ ๊ฐ์ง ํ:')
print(df_train.select_dtypes(exclude=object).columns)
print('โ๏ธ test์ ๋ฒ์ฃผํ ๋ณ์๋ฅผ ๊ฐ์ง ํ:')
print(df_test.select_dtypes(include=object).columns)
๐ณ๏ธ train์ ์์นํ ๋ณ์๋ฅผ ๊ฐ์ง ํ:
Index(['PassengerId', 'Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare'], dtype='object')
โ๏ธ test์ ๋ฒ์ฃผํ ๋ณ์๋ฅผ ๊ฐ์ง ํ:
Index(['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked'], dtype='object')
# Per-column count of missing values in each dataset
train_missing = df_train.isnull().sum()
print(f'๐ณ๏ธ train ๊ฒฐ์ธก์น ๊ฐฏ์: \n{train_missing}')
test_missing = df_test.isnull().sum()
print(f'โ๏ธ test ๊ฒฐ์ธก์น ๊ฐฏ์: \n{test_missing}')
๐ณ๏ธ train ๊ฒฐ์ธก์น ๊ฐฏ์:
PassengerId 0
Survived 0
Pclass 0
Name 0
Sex 0
Age 177
SibSp 0
Parch 0
Ticket 0
Fare 0
Cabin 687
Embarked 2
dtype: int64
โ๏ธ test ๊ฒฐ์ธก์น ๊ฐฏ์:
PassengerId 0
Pclass 0
Name 0
Sex 0
Age 86
SibSp 0
Parch 0
Ticket 0
Fare 1
Cabin 327
Embarked 0
dtype: int64
# tabulate is a third-party package that nothing earlier in this file imports,
# so bring it into scope here; without it the calls below raise NameError.
from tabulate import tabulate

# describe(): summary statistics of the numeric columns, rendered as an ASCII table
print('๐ณ๏ธ train describe:')
print(tabulate(df_train.describe(), headers='keys', tablefmt='pretty'))
print('โ๏ธ test describe:')
print(tabulate(df_test.describe(), headers='keys', tablefmt='pretty'))
๐ณ๏ธ train describe:
+-------+-------------------+--------------------+--------------------+--------------------+--------------------+---------------------+--------------------+
| | PassengerId | Survived | Pclass | Age | SibSp | Parch | Fare |
+-------+-------------------+--------------------+--------------------+--------------------+--------------------+---------------------+--------------------+
| count | 891.0 | 891.0 | 891.0 | 714.0 | 891.0 | 891.0 | 891.0 |
| mean | 446.0 | 0.3838383838383838 | 2.308641975308642 | 29.69911764705882 | 0.5230078563411896 | 0.38159371492704824 | 32.204207968574636 |
| std | 257.3538420152301 | 0.4865924542648585 | 0.8360712409770513 | 14.526497332334044 | 1.1027434322934275 | 0.8060572211299559 | 49.6934285971809 |
| min | 1.0 | 0.0 | 1.0 | 0.42 | 0.0 | 0.0 | 0.0 |
| 25% | 223.5 | 0.0 | 2.0 | 20.125 | 0.0 | 0.0 | 7.9104 |
| 50% | 446.0 | 0.0 | 3.0 | 28.0 | 0.0 | 0.0 | 14.4542 |
| 75% | 668.5 | 1.0 | 3.0 | 38.0 | 1.0 | 0.0 | 31.0 |
| max | 891.0 | 1.0 | 3.0 | 80.0 | 8.0 | 6.0 | 512.3292 |
+-------+-------------------+--------------------+--------------------+--------------------+--------------------+---------------------+--------------------+
โ๏ธ test describe:
+-------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
| | PassengerId | Pclass | Age | SibSp | Parch | Fare |
+-------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
| count | 418.0 | 418.0 | 332.0 | 418.0 | 418.0 | 417.0 |
| mean | 1100.5 | 2.2655502392344498 | 30.272590361445783 | 0.4473684210526316 | 0.3923444976076555 | 35.627188489208635 |
| std | 120.81045760473994 | 0.8418375519640519 | 14.18120923562442 | 0.8967595611217125 | 0.9814288785371684 | 55.90757617997383 |
| min | 892.0 | 1.0 | 0.17 | 0.0 | 0.0 | 0.0 |
| 25% | 996.25 | 1.0 | 21.0 | 0.0 | 0.0 | 7.8958 |
| 50% | 1100.5 | 3.0 | 27.0 | 0.0 | 0.0 | 14.4542 |
| 75% | 1204.75 | 3.0 | 39.0 | 1.0 | 0.0 | 31.5 |
| max | 1309.0 | 3.0 | 76.0 | 8.0 | 9.0 | 512.3292 |
+-------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
# Print the Name column as a Series
print('๐ณ๏ธ Names:')
passenger_names = df_train['Name']
print(passenger_names)
๐ณ๏ธ Names:
0 Braund, Mr. Owen Harris
1 Cumings, Mrs. John Bradley (Florence Briggs Th...
2 Heikkinen, Miss. Laina
3 Futrelle, Mrs. Jacques Heath (Lily May Peel)
4 Allen, Mr. William Henry
...
886 Montvila, Rev. Juozas
887 Graham, Miss. Margaret Edith
888 Johnston, Miss. Catherine Helen "Carrie"
889 Behr, Mr. Karl Howell
890 Dooley, Mr. Patrick
Name: Name, Length: 891, dtype: object
# Interquartile range of Age: 75th percentile minus 25th percentile.
# The quantiles are computed outside the f-string because reusing the enclosing
# quote character inside a replacement field is a SyntaxError before Python 3.12.
age_q3 = df_train['Age'].quantile(0.75)
age_q1 = df_train['Age'].quantile(0.25)
print(f'๐ณ๏ธ Age IQR: {age_q3 - age_q1}')
๐ณ๏ธ Age IQR: 17.875
# .nunique(): count of distinct values in the column
# .unique(): array of the distinct values themselves
print(f'๐ณ๏ธ Cabin ์ ์ผ๊ฐ ๊ฐฏ์: {df_train.Cabin.nunique()}')
print(f'๐ณ๏ธ Cabin ์ ์ผ๊ฐ: {df_train.Cabin.unique()}')
๐ณ๏ธ Cabin ์ ์ผ๊ฐ ๊ฐฏ์: 147
๐ณ๏ธ Cabin ์ ์ผ๊ฐ: [nan 'C85' 'C123' 'E46' 'G6' 'C103' 'D56' 'A6' 'C23 C25 C27' 'B78' 'D33'
'B30' 'C52' 'B28' 'C83' 'F33' 'F G73' 'E31' 'A5' 'D10 D12' 'D26' 'C110'
'B58 B60' 'E101' 'F E69' 'D47' 'B86' 'F2' 'C2' 'E33' 'B19' 'A7' 'C49'
'F4' 'A32' 'B4' 'B80' 'A31' 'D36' 'D15' 'C93' 'C78' 'D35' 'C87' 'B77'
'E67' 'B94' 'C125' 'C99' 'C118' 'D7' 'A19' 'B49' 'D' 'C22 C26' 'C106'
'C65' 'E36' 'C54' 'B57 B59 B63 B66' 'C7' 'E34' 'C32' 'B18' 'C124' 'C91'
'E40' 'T' 'C128' 'D37' 'B35' 'E50' 'C82' 'B96 B98' 'E10' 'E44' 'A34'
'C104' 'C111' 'C92' 'E38' 'D21' 'E12' 'E63' 'A14' 'B37' 'C30' 'D20' 'B79'
'E25' 'D46' 'B73' 'C95' 'B38' 'B39' 'B22' 'C86' 'C70' 'A16' 'C101' 'C68'
'A10' 'E68' 'B41' 'A20' 'D19' 'D50' 'D9' 'A23' 'B50' 'A26' 'D48' 'E58'
'C126' 'B71' 'B51 B53 B55' 'D49' 'B5' 'B20' 'F G63' 'C62 C64' 'E24' 'C90'
'C45' 'E8' 'B101' 'D45' 'C46' 'D30' 'E121' 'D11' 'E77' 'F38' 'B3' 'D6'
'B82 B84' 'D17' 'A36' 'B102' 'B69' 'E49' 'C47' 'D28' 'E17' 'A24' 'C50'
'B42' 'C148']
참고: https://www.datamanim.com/dataset/99_pandas/pandasMain.html#getting-knowing-data