DataFrame 1

김지윤·2023년 5월 26일

Pandas

목록 보기

2/5

pandas.DataFrame Documentation : pandas.DataFrame documentation

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

데이터프레임 만들기

# Two integer Series whose index labels are completely disjoint:
# s1 is labelled 0..4 and s2 is labelled 100..104.
s1 = pd.Series(data=np.arange(10, 15), index=np.arange(5))
s2 = pd.Series(data=np.arange(20, 25), index=np.arange(100, 105))

# Building a frame from a dict of Series aligns them on the union of their
# index labels; a label missing from one Series shows up as NaN, and the
# affected column is printed as float (see the output below).
df1 = pd.DataFrame(data={'a': s1, 'b': s2})
print(df1)
#         a     b
# 0    10.0   NaN
# 1    11.0   NaN
# 2    12.0   NaN
# 3    13.0   NaN
# 4    14.0   NaN
# 100   NaN  20.0
# 101   NaN  21.0
# 102   NaN  22.0
# 103   NaN  23.0
# 104   NaN  24.0

index, column

# Row labels of the frame.
print(df1.index)
# Int64Index([0, 1, 2, 3, 4, 100, 101, 102, 103, 104], dtype='int64')
# (pandas 2.x prints this as Index([...], dtype='int64').)

# Column labels. The attribute is spelled `columns`.
print(df1.columns)
# Index(['a', 'b'], dtype='object')

# Rename all columns at once by assigning a list with one name per column.
df1.columns = ['col_A', 'col_B']
print(df1)
#      col_A  col_B
# 0     10.0    NaN
# 1     11.0    NaN
# 2     12.0    NaN
# 3     13.0    NaN
# 4     14.0    NaN
# 100    NaN   20.0
# 101    NaN   21.0
# 102    NaN   22.0
# 103    NaN   23.0
# 104    NaN   24.0

특정 행, 열 추출

# Iterating over a DataFrame yields its column labels, not its rows.
for column_name in df1:
    print(column_name)
# col_A
# col_B

# Indexing with a single column label returns that column as a Series.
print(df1['col_A'])
# 0      10.0
# 1      11.0
# 2      12.0
# 3      13.0
# 4      14.0
# 100     NaN
# 101     NaN
# 102     NaN
# 103     NaN
# 104     NaN
# Name: col_A, dtype: float64

iloc

# iloc is purely positional: rows at positions 2 through 6 (the stop of the
# slice is excluded) and the column at position 0.
rows_by_position = df1.iloc[2:7, 0]
print(rows_by_position)
# 2      12.0
# 3      13.0
# 4      14.0
# 100     NaN
# 101     NaN
# Name: col_A, dtype: float64

# Deliberate failure: iloc does not accept labels, so passing the column
# name 'col_A' raises a ValueError (location-based indexing only takes
# integers, integer slices, integer lists or boolean arrays).
print(df1.iloc[2:7, 'col_A'])

loc : label(행/열 이름) 사용 — 문자열뿐 아니라 정수 label도 가능하며, 슬라이스는 끝 label을 포함한다.

# loc is label-based, and a label slice includes BOTH endpoints: this selects
# every row and the columns from 'col_A' through 'col_B'.
print(df1.loc[:, 'col_A':'col_B'])
#      col_A  col_B
# 0     10.0    NaN
# 1     11.0    NaN
# 2     12.0    NaN
# 3     13.0    NaN
# 4     14.0    NaN
# 100    NaN   20.0
# 101    NaN   21.0
# 102    NaN   22.0
# 103    NaN   23.0
# 104    NaN   24.0

# With loc the row slice 2:7 is read as labels, not positions. Only the
# labels 2, 3 and 4 lie between 2 and 7; labels 100 and 101 do not, so this
# returns three rows (unlike df1.iloc[2:7, 0], which returns five).
print(df1.loc[2:7, 'col_A'])
# 2    12.0
# 3    13.0
# 4    14.0
# Name: col_A, dtype: float64

.
.

전국건강증진센터 표준데이터.CSV

# Load the health-promotion-centre dataset, decoding the file as CP949.
pd_data = pd.read_csv('전국건강증진센터표준데이터.csv', encoding='cp949')
# Preview the first five rows.
print(pd_data.head(5))

# List every column label together with its integer position; the positions
# are what iloc expects.
for position, column_name in enumerate(pd_data.columns):
    print(position, column_name)
# 0 건강증진센터명
# 1 건강증진센터구분
# 2 소재지도로명주소
# 3 소재지지번주소
# 4 위도
# 5 경도
# 6 건강증진업무내용
# 7 운영시작시각
# 8 운영종료시각
# 9 휴무일정보
# 10 건물면적
# 11 의사수
# 12 간호사수
# 13 사회복지사수
# 14 영양사수
# 15 기타인력현황
# 16 기타이용안내
# 17 운영기관전화번호
# 18 운영기관명
# 19 관리기관전화번호
# 20 관리기관명
# 21 데이터기준일자
# 22 제공기관코드
# 23 제공기관명

# Keep only the centre name and the doctor / nurse head-count columns.
df1 = pd_data[['건강증진센터명', '의사수', '간호사수']]
print(df1)
#                 건강증진센터명  의사수  간호사수
# 0              맞춤형건강클리닉    0     5
# 1              정신건강복지센터    0     4
# 2            중구정신건강복지센터    1     5
# 3           의령군정신건강복지센터    0     6
# 4           고성군정신건강복지센터    0     4
# ..                  ...  ...   ...
# 290    양평군 건강관리동부센터(추가)    0     0
# 291  울산광역시 동구보건소 건강증진센터    1     1
# 292     울산광역시동구정신건강증진센터    1     0
# 293              건강증진센터    0     3
# 294            정신건강복지센터    1    13
#
# [295 rows x 3 columns]

# Label-based equivalent: every row, three columns chosen by name.
pd_data.loc[:, ['건강증진센터명', '의사수', '간호사수']]
# Position-based equivalent: every row, columns at positions 0, 11 and 12.
pd_data.iloc[:, [0, 11, 12]]

위의 방법들도 모두 동일하게 '건강증진센터명', '의사수', '간호사수' column을 가져온다.

의료인(의사 + 간호사) 의 수

# Element-wise sum of the doctor and nurse head-counts, one value per centre.
s_sum = df1['의사수'] + df1['간호사수']
print(s_sum)
# 0       5
# 1       4
# 2       6
# 3       6
# 4       4
#        ..
# 290     0
# 291     2
# 292     1
# 293     3
# 294    14
# Length: 295, dtype: int64

# df1 was obtained by selecting columns from pd_data, so adding a column in
# place (df1['s_sum'] = s_sum) can trigger pandas' SettingWithCopyWarning.
# assign() returns a new frame with the extra column and avoids the warning.
df1 = df1.assign(s_sum=s_sum)
print(df1)
#                 건강증진센터명  의사수  간호사수  s_sum
# 0              맞춤형건강클리닉    0     5      5
# 1              정신건강복지센터    0     4      4
# 2            중구정신건강복지센터    1     5      6
# 3           의령군정신건강복지센터    0     6      6
# 4           고성군정신건강복지센터    0     4      4
# ..                  ...  ...   ...    ...
# 290    양평군 건강관리동부센터(추가)    0     0      0
# 291  울산광역시 동구보건소 건강증진센터    1     1      2
# 292     울산광역시동구정신건강증진센터    1     0      1
# 293              건강증진센터    0     3      3
# 294            정신건강복지센터    1    13     14
#
# [295 rows x 4 columns]

# How many centres have each staff total. As the output shows, the result is
# ordered by frequency (largest first), not by the staff total itself.
cnt = s_sum.value_counts()
print(cnt)
# 2     54
# 1     53
# 0     43
# 3     36
# 4     31
# 5     19
# 6     19
# 7     10
# 8      7
# 10     6
# 16     4
# 13     3
# 9      3
# 14     3
# 12     2
# 11     1
# 18     1
# dtype: int64

# The same counts, re-ordered by staff total (the index) in ascending order.
print(cnt.sort_index(ascending=True))
# 0     43
# 1     53
# 2     54
# 3     36
# 4     31
# 5     19
# 6     19
# 7     10
# 8      7
# 9      3
# 10     6
# 11     1
# 12     2
# 13     3
# 14     3
# 16     4
# 18     1
# dtype: int64

# Draw the staff-total distribution as a line on a single Axes; the Figure
# object is not needed afterwards, hence the throwaway name.
_, axe = plt.subplots(nrows=1, ncols=1)
counts_by_total = cnt.sort_index()
axe.plot(counts_by_total)

전체 295개 센터 중 약 절반(150개)은 의료인(의사 + 간호사) 수가 0명, 1명 또는 2명인 것으로 나타난다.

가장 많은 의료인을 보유한 건강증진센터는 어디일까?

# Boolean mask selecting the row(s) with the largest staff total. Taking the
# maximum from the data avoids hard-coding 18, and the mask name no longer
# shadows the builtin filter().
is_largest = df1['s_sum'] == df1['s_sum'].max()
print(df1[is_largest])

#          건강증진센터명  의사수  간호사수  s_sum
# 235  부산진구 건강증진센터    0    18     18

부산진구 건강증진센터가 가장 많은 의료인(간호사 18명)을 보유한 센터이다.

김지윤

데이터 분석 / 데이터 사이언티스트 / AI 딥러닝

이전 포스트

Series

다음 포스트

DataFrame 1

Pandas

pandas.DataFrame Documentation : pandas.DataFrame documentation

데이터프레임 만들기

index, column

특정 행, 열 추출

전국건강증진센터 표준데이터.CSV

의료인(의사 + 간호사) 의 수

Series

DataFrame2

0개의 댓글

관련 채용 정보