https://raw.githubusercontent.com/wesm/pydata-book/3rd-edition/datasets/fec/P00000001-ALL.csv
# Load the FEC contributions CSV from the pydata-book repository.
fec_csv_url = 'https://raw.githubusercontent.com/wesm/pydata-book/3rd-edition/datasets/fec/P00000001-ALL.csv'
fec = pd.read_csv(fec_csv_url)

# Column names, dtypes and non-null counts.
fec.info()

# One sample record: first as a Series, then as a one-row DataFrame.
fec.iloc[123456]
fec.iloc[123456:123457]
# The data has no party column, so list the distinct candidate names first
# and attach a party to each of them.
fec['cand_nm'].unique()

# Party affiliation per candidate.
republican_candidates = [
    'Bachmann, Michelle',
    'Cain, Herman',
    'Gingrich, Newt',
    'Huntsman, Jon',
    'Johnson, Gary Earl',
    'McCotter, Thaddeus G',
    'Paul, Ron',
    'Pawlenty, Timothy',
    'Perry, Rick',
    "Roemer, Charles E. 'Buddy' III",
    'Romney, Mitt',
    'Santorum, Rick',
]
parties = {name: 'Republican' for name in republican_candidates}
parties['Obama, Barack'] = 'Democrat'

# Preview the party derived from each row's candidate name (cand_nm).
fec['cand_nm'].map(parties)

# Store it as a new 'party' column.
fec['party'] = fec['cand_nm'].map(parties)
fec.head(3)
fec['party'].value_counts()
# Clean the data before analysing it.
fec['contb_receipt_amt'].min()
(fec['contb_receipt_amt'] > 0).value_counts()

# Keep only the rows whose contribution amount is positive.
is_positive = fec['contb_receipt_amt'] > 0
fec = fec[is_positive]
fec

# Barack Obama and Mitt Romney are the two main candidates, so extract only
# their contributions. Four equivalent ways follow.
# Option 1: query with an explicit "or".
fec.query('cand_nm == "Obama, Barack" or cand_nm == "Romney, Mitt"')
# Option 2: query with "in".
fec.query('cand_nm in ["Obama, Barack", "Romney, Mitt"]')
# Option 3 (the one used in the book): isin.
main_candidates = ['Obama, Barack', 'Romney, Mitt']
fec_mrbo = fec[fec['cand_nm'].isin(main_candidates)]
# Option 4: combine two boolean masks.
cond1 = fec['cand_nm'] == 'Obama, Barack'
cond2 = fec['cand_nm'] == 'Romney, Mitt'
fec_mrbo = fec[cond1 | cond2]
fec['contbr_occupation'].value_counts()
fec['contbr_occupation'].value_counts().head(10)

# The counts show generic job types, and the same job recorded under
# several different names, so map some occupations onto a canonical one.
occ_mapping = {
    'INFORMATION REQUESTED PER BEST EFFORTS': 'NOT PROVIDED',
    'INFORMATION REQUESTED': 'NOT PROVIDED',
    'INFORMATION REQUESTED (BEST EFFORTS)': 'NOT PROVIDED',
    'C.E.O.': 'CEO',
}


def _map_occupation(value):
    # dict.get with the value itself as default: unmapped occupations pass
    # through unchanged.
    return occ_mapping.get(value, value)


fec['contbr_occupation'].map(_map_occupation)
fec['contbr_occupation'] = fec['contbr_occupation'].map(_map_occupation)

# Treat employers the same way.
emp_mapping = {
    'INFORMATION REQUESTED PER BEST EFFORTS': 'NOT PROVIDED',
    'INFORMATION REQUESTED': 'NOT PROVIDED',
    'SELF': 'SELF-EMPLOYED',
    'SELF EMPLOYED': 'SELF-EMPLOYED',
}


def _map_employer(value):
    # Unmapped employers pass through unchanged.
    return emp_mapping.get(value, value)


fec['contbr_employer'] = fec['contbr_employer'].map(_map_employer)
# Aggregate contributions by party and occupation with a pivot table, then
# keep only the occupations that donated more than 2 million dollars overall.
by_occupation = pd.pivot_table(
    fec,
    values='contb_receipt_amt',
    index='contbr_occupation',
    columns='party',
    aggfunc='sum',
)
by_occupation

is_over_2mm = by_occupation.sum(axis=1) > 2000000
by_occupation[is_over_2mm]

over_2mm = by_occupation[is_over_2mm]
over_2mm.plot.barh()
# Find the groups (e.g. occupations) that donated the most to Obama and Romney.
def get_top_amounts(df, key, n=5):
    """Return the ``n`` largest contribution totals grouped by ``key``.

    Args:
        df: DataFrame with a ``contb_receipt_amt`` column.
        key: Column name (or other groupby key) to group the rows by.
        n: Number of groups to return.

    Returns:
        A Series indexed by group, holding the summed ``contb_receipt_amt``
        of the ``n`` groups with the largest totals, in descending order.
    """
    totals = df.groupby(key)['contb_receipt_amt'].sum()
    return totals.nlargest(n)
# Top occupations and top employers per candidate.
grouped = fec_mrbo.groupby(by='cand_nm')
grouped.apply(get_top_amounts, key='contbr_occupation', n=7)
grouped.apply(get_top_amounts, key='contbr_employer', n=10)
# Split the contributions into buckets by donation size.
# Bin edges: 0, then the powers of ten from 1 up to 10,000,000.
bins = np.array([0] + [10 ** exponent for exponent in range(8)])
labels = pd.cut(fec_mrbo['contb_receipt_amt'], bins)
labels

# Number of contributions per candidate and bucket.
grouped = fec_mrbo.groupby(['cand_nm', labels])
bucket_counts = grouped.size()
bucket_counts
bucket_counts.reset_index()
bucket_counts.unstack(level=0)
contb_num = bucket_counts.unstack(level=0)

# Each candidate's share of the money within every bucket.
bucket_sums = grouped['contb_receipt_amt'].sum().unstack(level=0)
bucket_totals = bucket_sums.sum(axis=1)
normed_sums = bucket_sums.div(bucket_totals, axis=0)
normed_sums

# The last two buckets are left out of the chart.
normed_sums[:-2].plot.barh()
# Contribution totals per state (rows) and candidate (columns).
grouped = fec_mrbo.groupby(['cand_nm', 'contbr_st'])
state_sums = grouped['contb_receipt_amt'].sum()
totals = state_sums.unstack(level=0).fillna(0)

# Keep the states whose combined total exceeds 100,000.
totals = totals[totals.sum(axis=1) > 100000]
totals.head(10)

# Each candidate's share of a state's total.
percent = totals.div(totals.sum(axis=1), axis=0)
percent.head(10)
https://parkgihyeon.github.io/project/geocoding-api/
import pandas as pd
import openpyxl
# Load the four spreadsheets.
GwanGwang = pd.read_excel('jejudoGwanGwang.xlsx')
jejudoMatJip = pd.read_excel('jejudoMatJip.xlsx')
jejuMatJip = pd.read_excel('jejuMatJip.xlsx')
jejuYeoHang = pd.read_excel('jejuYeoHang.xlsx')

# Stack all rows and drop exact duplicates.
# combine_first is not suitable here: it aligns the frames on their index and
# only fills cells that are missing in the first frame, so rows of the later
# frames that share an index label with an earlier one would be discarded.
jejuGwanGwang1 = (
    pd.concat([GwanGwang, jejudoMatJip, jejuMatJip, jejuYeoHang],
              ignore_index=True)
    .drop_duplicates()
    .reset_index(drop=True)
)
jejuGwanGwang1[:2]

# How often each place appears.
jejuGwanGwang1['place'].value_counts()
import requests
def place_find(s):
    """Look up a place by keyword with the Kakao Local keyword-search API.

    Args:
        s: Search keyword, e.g. a place name.

    Returns:
        ``[place_name, latitude, longitude]`` taken from the first search
        hit (its ``place_name``, ``y`` and ``x`` fields), or ``None`` when
        the search returns no usable hit.
    """
    import os

    url = 'https://dapi.kakao.com/v2/local/search/keyword.json'
    # NOTE(review): the fallback key is committed in the source; supply
    # KAKAO_REST_API_KEY from the environment and rotate the committed key.
    api_key = os.environ.get('KAKAO_REST_API_KEY',
                             'f5310e1d1d5f93581064788ae5a11f81')
    headers = {"Authorization": "KakaoAK " + api_key}

    # Pass the keyword through `params` so it gets URL-encoded; names that
    # contain '&' or spaces would otherwise corrupt the query string.
    response = requests.get(url, params={'query': s}, headers=headers,
                            timeout=10)
    documents = response.json()['documents']
    if not documents:
        return None

    first_hit = documents[0]
    try:
        return [first_hit['place_name'], first_hit['y'], first_hit['x']]
    except (KeyError, TypeError):
        # Hit without the expected fields: treat it as "not found".
        return None
# Try the lookup on a single place name.
place_find('청초수물회&섭국 [신관]')

# Remove any missing place names before looking them up.
a = jejuGwanGwang1['place'].dropna()
a.value_counts()

# Fetch latitude/longitude for every place name.
jeju_palce_list = []
for place_name in a:
    try:
        data = place_find(place_name)
    except Exception:
        # Best effort: a failed request for one name must not stop the run.
        continue
    jeju_palce_list.append(data)

# Drop the entries for which nothing was found (None).
jeju_palce_list_1 = [x for x in jeju_palce_list if x is not None]
jeju_palce_list_1

# Naming the columns at construction keeps this working for an empty result.
jeju_palce = pd.DataFrame(jeju_palce_list_1,
                          columns=['place_name', '위도', '경도'])

# A hit named just '제주도' carries no useful location, so remove it.
jeju_palce = jeju_palce[jeju_palce['place_name'] != '제주도']
jeju_palce
from folium.plugins import MarkerCluster
import folium
# Map with one clustered marker per looked-up place.
jejuMap = folium.Map(location=[33.36, 126.52], zoom_start=10)
marker_cluster = MarkerCluster().add_to(jejuMap)

place_rows = zip(jeju_palce['place_name'], jeju_palce['위도'], jeju_palce['경도'])
for place_name, latitude, longitude in place_rows:
    marker = folium.Marker(
        location=[latitude, longitude],
        popup=place_name,
        icon=folium.Icon(color='blue', icon='ok'),
    )
    marker.add_to(marker_cluster)

jejuMap
#folium
#pip install folium
import pandas as pd
import folium
df_1 = pd.read_excel('jejuMatJip.xlsx')
df_2 = pd.read_excel('jejudoMatJip.xlsx')
df_3 = pd.read_excel('jejuYeoHang.xlsx')
df_4 = pd.read_excel('jejudoGwanGwang.xlsx')

# Distinct, non-missing place names across all four files, in first-seen
# order. De-duplicating per file only would leave names that occur in several
# files repeated, so they would be geocoded and plotted more than once.
all_places = pd.concat(
    [df_1['place'], df_2['place'], df_3['place'], df_4['place']],
    ignore_index=True,
)
df_place = list(all_places.dropna().unique())
len(df_place)
# 카카오API를 사용하여 주소 -> 좌표 변환
import requests, json
def get_location(address):
    """Convert an address to coordinates with the Kakao Local address API.

    Args:
        address: Address text to search for.

    Returns:
        ``{"lat": <str>, "lng": <str>}`` built from the ``y`` and ``x``
        fields of the first result's ``address`` entry.

    Raises:
        IndexError: The search returned no documents.
    """
    import os

    url = 'https://dapi.kakao.com/v2/local/search/address.json'
    # NOTE(review): the fallback key is committed in the source; supply
    # KAKAO_REST_API_KEY from the environment and rotate the committed key.
    api_key = os.environ.get('KAKAO_REST_API_KEY',
                             'f5310e1d1d5f93581064788ae5a11f81')
    headers = {"Authorization": "KakaoAK " + api_key}

    # `params` URL-encodes the address instead of splicing raw text into
    # the query string.
    response = requests.get(url, params={'query': address}, headers=headers,
                            timeout=10)
    api_json = response.json()
    matched = api_json['documents'][0]['address']
    return {"lat": str(matched['y']), "lng": str(matched['x'])}
# pip install geopy
import geopy
from geopy.geocoders import Nominatim
def geocoding(address):
    """Geocode an address or place name with geopy's Nominatim geocoder.

    Args:
        address: Text to geocode.

    Returns:
        ``{"lat": <str>, "lng": <str>}`` with the latitude and longitude of
        the geocoder's result.

    Raises:
        ValueError: The geocoder returned no result for ``address``.
    """
    geolocoder = Nominatim(user_agent='South Korea', timeout=None)
    geo = geolocoder.geocode(address)
    if geo is None:
        # Fail with a clear message instead of an AttributeError on None.
        raise ValueError('no geocoding result for {!r}'.format(address))
    return {"lat": str(geo.latitude), "lng": str(geo.longitude)}
import numpy as np
# Geocode every place name. A failed lookup is recorded as NaN so that `crd`
# stays aligned one-to-one with `df_place`.
crd = []
for place in df_place:
    try:
        crd.append(geocoding(place))
    except Exception:
        # `except Exception` (rather than a bare except) keeps Ctrl-C working
        # during this long, network-bound loop.
        crd.append(np.nan)
len(crd)
crd
# Pair each place name with its geocoding result.
df = pd.DataFrame({'지역명': df_place, '위치': crd})

# Keep only the places that were geocoded successfully.
df2 = df[df['위치'].notna()].reset_index(drop=True)
df2

# Written before the coordinate columns exist, so this file holds only the
# name and the raw location dict.
df2.to_csv('jejumap.csv')

# Split the {'lat': ..., 'lng': ...} dicts into numeric columns. Whole-column
# assignment replaces the chained `df2[col].iloc[i] = ...` writes, which may
# update a temporary copy instead of df2.
df2['위도'] = [float(location['lat']) for location in df2['위치']]
df2['경도'] = [float(location['lng']) for location in df2['위치']]
# One marker per geocoded place.
jejumap = folium.Map(location=[33.26, 126.5], zoom_start=12)
for name, lat, lng in zip(df2['지역명'], df2['위도'], df2['경도']):
    folium.Marker([lat, lng], popup=name).add_to(jejumap)

# Save the map as an HTML file. The call must be on one line: with the
# argument on its own line the method is only referenced, never invoked.
jejumap.save('./top5.html')
# Keep only the points inside a latitude/longitude box (bounds inclusive).
# Author's reference extent: 북위 331127~333350, 동경 1260843″~1265820
# (presumably degrees/minutes/seconds north and east — TODO confirm).
in_box = df2['위도'].between(33.11, 34.0) & df2['경도'].between(126.0, 127.0)
df3 = df2[in_box].reset_index(drop=True)
df3

# Tag every place with the first source file that contains it (1-4) and a
# marker colour; places found in none keep 0 / 'black'.
# Sets are built once so each membership test is O(1).
# NOTE(review): 'yellow' may not be among folium.Icon's supported colours —
# confirm before relying on it.
place_sources = [
    (1, 'blue', set(df_1['place'])),
    (2, 'yellow', set(df_2['place'])),
    (3, 'red', set(df_3['place'])),
    (4, 'green', set(df_4['place'])),
]
marker_colors = []
source_ids = []
for place in df3['지역명']:
    for source_id, color, known_places in place_sources:
        if place in known_places:
            break
    else:
        source_id, color = 0, 'black'
    marker_colors.append(color)
    source_ids.append(source_id)

# Whole-column assignment instead of chained `df3[col].iloc[i] = ...` writes.
df3['color'] = marker_colors
df3['dataframe'] = source_ids
# For every tagged place, collect the matching rows from its source frame.
# Places tagged 0 (found in no file) contribute nothing.
source_frames = {1: df_1, 2: df_2, 3: df_3, 4: df_4}
df_list = []
for place, source_id in zip(df3['지역명'], df3['dataframe']):
    source = source_frames.get(source_id)
    if source is None:
        continue
    df_list.append(source[source['place'] == place])
# Map with one colour-coded marker per place.
jejumap = folium.Map(location=[33.26, 126.5], zoom_start=12)
for name, lat, lng, color in zip(df3['지역명'], df3['위도'], df3['경도'], df3['color']):
    popup = folium.Popup(f'지역명 : {name}<br>', min_width=200, max_width=200)
    folium.Marker(
        [lat, lng],
        popup=popup,
        icon=folium.Icon(color=color),
    ).add_to(jejumap)

# Save the map as an HTML file (the call and its argument must stay on one
# line, otherwise nothing is written).
jejumap.save('./jejumap.html')
df3.to_csv('jeju.csv')
import pandas as pd
# Read the four crawl result files and stack them in a single concat call.
# (Concatenating inside the loop onto an initially empty frame re-copies the
# accumulated rows on every iteration.)
folder = './files/'
f_list = ['jejudoGwanGwang.xlsx', 'jejudoMatJip.xlsx', 'jejuMatJip.xlsx', 'jejuYeoHang.xlsx']
frames = [pd.read_excel(folder + fname) for fname in f_list]
jeju_insta_df = pd.concat(frames)
jeju_insta_df.columns = ['content', 'data', 'like', 'place', 'tags']
jeju_insta_df.info()

# Posts with identical content are treated as duplicates.
jeju_insta_df.drop_duplicates(subset=["content"], inplace=True)
jeju_insta_df.tags

# The original note here says "take the 500 most frequent places"; the code
# below counts every place and applies no top-500 cut.
jeju_insta_df['place']
location_counts = jeju_insta_df['place'].value_counts()
location_counts

# Frequency of each registered location, as a DataFrame.
location_counts_df = pd.DataFrame(location_counts)
location_counts_df.head()
list(location_counts_df.index)
locations = list(location_counts.index)
locations
import requests
# One-off keyword search against the Kakao Local API.
searching = '합정 스타벅스'
# The URL must be a single-line literal; split across lines it is a
# SyntaxError.
url = 'https://dapi.kakao.com/v2/local/search/keyword.json?query={}'.format(searching)
headers = {
    # There must be exactly one space between "KakaoAK" and the API key.
    # NOTE(review): this key is committed in the source; consider rotating it.
    "Authorization": "KakaoAK f5310e1d1d5f93581064788ae5a11f81"
}
places = requests.get(url, headers=headers).json()['documents']
places
# 카카오 로컬 API를 활용한 장소 검색 함수 만들기
def find_places(searching):
    """Search a place by keyword with the Kakao Local API.

    Args:
        searching: Keyword to search for.

    Returns:
        ``[place_name, x, y, searching]`` built from the first search hit;
        the original keyword is returned alongside so callers can relate the
        hit back to what they searched for.

    Raises:
        IndexError: The search returned no documents.
    """
    import os

    # (1) Endpoint; the keyword goes through `params` so it is URL-encoded.
    url = 'https://dapi.kakao.com/v2/local/search/keyword.json'
    # (2) Headers.
    # NOTE(review): the fallback key is committed in the source; supply
    # KAKAO_REST_API_KEY from the environment and rotate the committed key.
    api_key = os.environ.get('KAKAO_REST_API_KEY',
                             '3004e67678252b2b6c227e7c9d258681')
    headers = {"Authorization": "KakaoAK " + api_key}
    # (3) Request and parse.
    response = requests.get(url, params={'query': searching},
                            headers=headers, timeout=10)
    places = response.json()['documents']
    # (4) Pick the fields of the first hit.
    place = places[0]
    return [place['place_name'], place['x'], place['y'], searching]
data = find_places('제주공항')
data

# tqdm shows a progress bar for the long-running loop below.
# ! pip install tqdm
from tqdm.notebook import tqdm
import time

# Look up every location; names that cannot be resolved are skipped.
locations_inform = []
for location in tqdm(locations):
    try:
        data = find_places(location)
    except Exception:
        # Best effort; `except Exception` still lets Ctrl-C stop the run.
        continue
    locations_inform.append(data)
    # Pause between successful requests.
    time.sleep(0.5)
locations_inform
# Store the lookup results. The columns follow the order find_places returns:
# [place_name, x, y, searching]. Without names the merge below fails with a
# KeyError.
locations_inform_df = pd.DataFrame(
    locations_inform,
    columns=['name_official', '경도', '위도', 'searched_name'],
)
location_counts_df

# Merge the coordinates with the post counts.
# The counts are indexed by the raw place string that was searched for, so
# the join key is the searched name, not the official name the API returned.
# The single count column is named 'place' explicitly because its default
# name depends on the pandas version.
counts = location_counts_df.copy()
counts.columns = ['place']
counts.index.name = None
location_data = pd.merge(locations_inform_df, counts,
                         how='inner', left_on='searched_name', right_index=True)
location_data.head()

# Check for official names that occur more than once.
location_data['name_official'].value_counts()

# Collapse rows that resolved to the same official place, summing their
# counts. reset_index turns the grouping keys back into ordinary columns.
location_data = location_data.pivot_table(
    index=['name_official', '경도', '위도'], values='place', aggfunc='sum'
).reset_index()
location_data.head()
### Map visualisation with folium (1) - individual markers
# Inspect the data.
location_data.info()

# Draw the map.
import folium

Mt_Hanla = [33.362500, 126.533694]
map_jeju = folium.Map(location=Mt_Hanla, zoom_start=11)

# Work on ordinary columns even if the keys are still in the index.
if 'name_official' in location_data.columns:
    points = location_data
else:
    points = location_data.reset_index()

rows = zip(points['name_official'], points['place'], points['위도'], points['경도'])
for name, count, latitude, longitude in rows:
    # name: official place name; count: number of posts.
    size = int(count) * 2
    # folium expects (latitude, longitude).
    folium.CircleMarker(
        (float(latitude), float(longitude)),
        radius=size,
        color='red',
        popup=name,
    ).add_to(map_jeju)
map_jeju
### Map visualisation with folium (2) - clustered markers
# Draw the map (marker cluster).
from folium.plugins import MarkerCluster

# Work on ordinary columns even if the keys are still in the index.
if 'name_official' in location_data.columns:
    points = location_data
else:
    points = location_data.reset_index()

# (latitude, longitude) pairs, in that order, plus the matching popup names.
locations = [
    (float(latitude), float(longitude))
    for latitude, longitude in zip(points['위도'], points['경도'])
]
names = list(points['name_official'])

Mt_Hanla = [33.362500, 126.533694]
map_jeju2 = folium.Map(location=Mt_Hanla, zoom_start=11)
marker_cluster = MarkerCluster(
    locations=locations,
    popups=names,
    name='Jeju',
    overlay=True,
    control=True,
)
marker_cluster.add_to(map_jeju2)
folium.LayerControl().add_to(map_jeju2)
map_jeju2