Lotte missed the postseason again this year
Somehow, Deep Learning
No fall baseball... sob
If Choi Dong-won had pitched in the 21st century, what would his salary be?
Choi Dong-won is one of the few 1980s players whose salary is actually on record, but a ballplayer's salary can't be estimated from inflation alone.
Back then the entire league had fewer than 100 pitchers; today it is close to 300. The market has grown that much, and many of today's stats simply weren't measured in that era, so the surviving statistics are really not enough to price a player from the past. But... just for fun... we can still try, right?
Deciding what to collect + DB column names
CG, SHO, and TBF are now shown only after one more button click on the recent-records page
The recent records also add many new columns
For recent seasons, collect only what appears on the first page
Apart from WHIP, collect only the columns the two eras have in common
create table baseball (
    id int not null auto_increment,
    kbo_year int,
    in_team_rank int,
    name varchar(8),
    team varchar(8),
    ERA decimal(5,2),
    G int,
    W int,
    L int,
    SV int,
    HLD int,
    WPCT decimal(4,3),
    IP decimal(5,2),
    H int,
    HR int,
    BB int,
    HBP int,
    SO int,
    R int,
    ER int,
    WHIP decimal(4,2),
    money int,
    WAR decimal(5,2),
    WAR_mean decimal(5,2),
    WAR_mean_diff decimal(5,2),
    constraint pk_baseball primary key(id)
);
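The crawling code below writes into this table through a cursor named mycursor and a connection named remote; a minimal sketch of that setup with mysql-connector, where the host, credentials, and database name are placeholders:

import mysql.connector

# assumed connection setup; host / user / password / database are placeholders
remote = mysql.connector.connect(
    host="localhost",
    user="user",
    password="password",
    database="kbo",
)
mycursor = remote.cursor()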
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import time

options = Options()
options.add_argument("start-maximized")
# service = Service("../../EDA/driver/chromedriver")
# use chromedriver for windows
service = Service("../../EDA/driver/windows/chromedriver.exe")
driver = webdriver.Chrome(service=service, options=options)
driver.get("https://www.koreabaseball.com/Record/Player/PitcherBasic/BasicOld.aspx?sort=ERA_RT")

year_select = driver.find_element(By.ID, "cphContents_cphContents_cphContents_ddlSeason_ddlSeason")
for year in eightys:  # eightys: list of 1980s seasons, assumed defined earlier
    # re-find the dropdown to avoid "no such element" after the page reloads
    year_select = driver.find_element(By.ID, "cphContents_cphContents_cphContents_ddlSeason_ddlSeason")
    year_select.send_keys(year)
    time.sleep(5)
    team_select = driver.find_element(By.ID, "cphContents_cphContents_cphContents_ddlTeam_ddlTeam")
    team_option_list = team_select.find_elements(By.TAG_NAME, 'option')
    team_list = [team_option.text for team_option in team_option_list]
    team_list = team_list[1:]  # drop the placeholder option
    for team in team_list:
        # re-find the dropdown to avoid a stale element reference
        team_select = driver.find_element(By.ID, "cphContents_cphContents_cphContents_ddlTeam_ddlTeam")
        team_select.send_keys(team)
        time.sleep(5)
        page = driver.page_source
        soup = BeautifulSoup(page, "html.parser")
        tbody = soup.find("tbody")
        player_list = tbody.find_all("tr")
        for player in player_list:
            player_info = player.find_all("td")
            player_rank = player_info[0].text
            player_name = player_info[1].find("a").text
            player_team = player_info[2].text.strip()
            player_era = player_info[3].text
            player_game = player_info[4].text
            # the old-records table includes extra columns (CG, SHO, TBF), so some td indices are skipped
            player_win = player_info[7].text
            player_lose = player_info[8].text
            player_save = player_info[9].text
            player_hold = player_info[10].text
            player_wpct = player_info[11].text
            if player_wpct == '-':
                player_wpct = 0.5
            player_ip = player_info[13].text
            if " " in player_ip:
                player_ip = player_ip.replace(" 1/3", ".33")
                player_ip = player_ip.replace(" 2/3", ".66")
            player_hit = player_info[14].text
            player_hr = player_info[15].text
            player_bb = player_info[16].text
            player_hbp = player_info[17].text
            player_so = player_info[18].text
            player_r = player_info[19].text
            player_er = player_info[20].text
            # 19 columns
            sql = "insert into baseball(kbo_year, in_team_rank, name, team, ERA, G, W, L, SV, HLD, WPCT, IP, H, HR, BB, HBP, SO, R, ER) \
                values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
            sql_tuple = (year, player_rank, player_name, player_team, player_era, player_game, player_win, player_lose, player_save,
                         player_hold, player_wpct, player_ip, player_hit, player_hr, player_bb, player_hbp, player_so, player_r, player_er)
            mycursor.execute(sql, sql_tuple)
            remote.commit()
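Driving the dropdowns with send_keys works, but Selenium's Select wrapper is a more explicit alternative; a sketch for the season dropdown, assuming the option labels are the bare year strings:

from selenium.webdriver.support.ui import Select

# e.g. inside the year loop, instead of year_select.send_keys(year)
year_select = driver.find_element(By.ID, "cphContents_cphContents_cphContents_ddlSeason_ddlSeason")
Select(year_select).select_by_visible_text(str(year))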
# same crawl for recent seasons; the recent table omits CG/SHO, so the td indices shift and WHIP is available
options = Options()
options.add_argument("start-maximized")
# service = Service("../../EDA/driver/chromedriver")
# use chromedriver for windows
service = Service("../../EDA/driver/windows/chromedriver.exe")
driver = webdriver.Chrome(service=service, options=options)
driver.get("https://www.koreabaseball.com/Record/Player/PitcherBasic/BasicOld.aspx?sort=ERA_RT")

year_select = driver.find_element(By.ID, "cphContents_cphContents_cphContents_ddlSeason_ddlSeason")
for year in nowadays:  # nowadays: list of recent seasons, assumed defined earlier
    # re-find the dropdown to avoid "no such element" after the page reloads
    year_select = driver.find_element(By.ID, "cphContents_cphContents_cphContents_ddlSeason_ddlSeason")
    year_select.send_keys(year)
    time.sleep(5)
    team_select = driver.find_element(By.ID, "cphContents_cphContents_cphContents_ddlTeam_ddlTeam")
    team_option_list = team_select.find_elements(By.TAG_NAME, 'option')
    team_list = [team_option.text for team_option in team_option_list]
    team_list = team_list[1:]  # drop the placeholder option
    for team in team_list:
        # re-find the dropdown to avoid a stale element reference
        team_select = driver.find_element(By.ID, "cphContents_cphContents_cphContents_ddlTeam_ddlTeam")
        team_select.send_keys(team)
        time.sleep(5)
        page = driver.page_source
        soup = BeautifulSoup(page, "html.parser")
        tbody = soup.find("tbody")
        player_list = tbody.find_all("tr")
        for player in player_list:
            player_info = player.find_all("td")
            player_rank = player_info[0].text
            player_name = player_info[1].find("a").text
            player_team = player_info[2].text.strip()
            player_era = player_info[3].text
            player_game = player_info[4].text
            player_win = player_info[5].text
            player_lose = player_info[6].text
            player_save = player_info[7].text
            player_hold = player_info[8].text
            player_wpct = player_info[9].text
            if player_wpct == '-':
                player_wpct = 0.5
            player_ip = player_info[10].text
            if " " in player_ip:
                player_ip = player_ip.replace(" 1/3", ".33")
                player_ip = player_ip.replace(" 2/3", ".66")
            player_hit = player_info[11].text
            player_hr = player_info[12].text
            player_bb = player_info[13].text
            player_hbp = player_info[14].text
            player_so = player_info[15].text
            player_r = player_info[16].text
            player_er = player_info[17].text
            player_whip = player_info[18].text
            # 20 columns (WHIP included)
            sql = "insert into baseball(kbo_year, in_team_rank, name, team, ERA, G, W, L, SV, HLD, WPCT, IP, H, HR, BB, HBP, SO, R, ER, WHIP) \
                values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
            sql_tuple = (year, player_rank, player_name, player_team, player_era, player_game, player_win, player_lose, player_save,
                         player_hold, player_wpct, player_ip, player_hit, player_hr, player_bb, player_hbp, player_so, player_r, player_er, player_whip)
            mycursor.execute(sql, sql_tuple)
            remote.commit()
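A quick sanity check after both crawls, to confirm every season actually landed in the table (using the same cursor as above):

# count scraped rows per season
mycursor.execute("select kbo_year, count(*) from baseball group by kbo_year order by kbo_year")
for kbo_year, row_count in mycursor.fetchall():
    print(kbo_year, row_count)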
for year in all_years:  # all_years: every season to update, assumed defined earlier (presumably eightys + nowadays)
    year_str = str(year)
    # the sn query-string parameter is the number of rows to show;
    # 1980s seasons list only a double-digit number of pitchers, but 2010s seasons exceed 200 every year -> use 300
    url = "http://www.statiz.co.kr/stat_at.php?opt=0&sopt=0&re=1&ys=" + year_str + "&ye=" + year_str \
        + "&se=0&te=&tm=&ty=2017&qu=auto&po=0&as=&ae=&hi=&un=&pl=&da=1&o1=WAR&o2=OutCount&de=1&lr=0&tr=&cv=&ml=1&sn=300&si=&cn="
    options = Options()
    options.add_argument("start-maximized")
    service = Service("../../EDA/driver/windows/chromedriver.exe")
    driver = webdriver.Chrome(service=service, options=options)
    driver.get(url)
    time.sleep(2)
    page = driver.page_source
    soup = BeautifulSoup(page, 'html.parser')
    # WAR sits in the fixed-column table on Statiz
    war_table_cols = soup.find("div", id="fixcol")
    war_table = war_table_cols.find("table").find("tbody")
    war_tr = war_table.find_all("tr")
    for element in war_tr:
        data_row = element.find_all("td")
        if data_row:
            # print(data_row)
            name = data_row[1].find("a").text
            WAR = data_row[3].find("font").find("span").text
            sql = "update baseball set WAR = %s where name = %s and kbo_year = %s"
            mycursor.execute(sql, (WAR, name, year))
            remote.commit()
    driver.close()
# A few recent players have no WAR (no hit on Statiz) => the KBO site lists their pre-name-change name -> since they can't be looked up on Statiz, their salary info will be missing too? -> decided not to use those rows
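One way to drop those rows, assuming unmatched players simply keep a NULL WAR after the update loop (filtering them out at query time works just as well):

# discard rows whose WAR could not be matched on Statiz
mycursor.execute("delete from baseball where WAR is null")
remote.commit()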
for year in nowadays:
    year_str = str(year)
    url = "http://www.statiz.co.kr/salary.php?opt=0&sopt=" + year_str + "&te="
    options = Options()
    options.add_argument("start-maximized")
    service = Service("../../EDA/driver/windows/chromedriver.exe")
    driver = webdriver.Chrome(service=service, options=options)
    driver.get(url)
    time.sleep(2)
    page = driver.page_source
    soup = BeautifulSoup(page, 'html.parser')
    war_table = soup.find("table", class_="table table-striped").find("tbody")
    war_tr = war_table.find_all("tr")
    for element in war_tr:
        data_row = element.find_all("td")
        if data_row:
            # print(data_row)
            name = data_row[0].find("a").text
            money = data_row[3].text.replace(",", "")  # strip thousands separators
            sql = "update baseball set money = %s where name = %s and kbo_year = %s"
            mycursor.execute(sql, (money, name, year))
            remote.commit()
    driver.close()
# Not many players have a listed salary... over 200 pitchers per season, but only around 40 with salary info each year -> the comparison has to be made with these players
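The baseball DataFrame used for the first model below is assumed to be built from exactly those rows, i.e. the ones that have both a salary and a WAR; the column handling here is a guess that makes the later .astype(float) and input_dim=20 line up:

import pandas as pd

# keep only rows with both salary and WAR, and drop the non-numeric id/name/team columns
mycursor.execute("select * from baseball where money is not null and WAR is not null")
rows = mycursor.fetchall()
baseball = pd.DataFrame(rows, columns=mycursor.column_names).drop(['id', 'name', 'team'], axis=1)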
from keras.models import Sequential
from keras.layers import Dense
from sklearn.model_selection import train_test_split
import numpy as np

# features: everything except the target (money) and WHIP
X = baseball.drop(['money', 'WHIP'], axis=1).astype(float)
y = baseball['money']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=13)
first_model = Sequential()
first_model.add(Dense(30, input_dim=20, activation="relu")) # rectified linear unit
first_model.add(Dense(6, activation="relu"))
first_model.add(Dense(1))
first_model.compile(loss="mean_squared_error", optimizer="adam")
first_model_hist = first_model.fit(X_train, y_train, epochs=1000, batch_size=10)
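To see whether the first model learned anything useful, the training curve and the held-out error can be checked along these lines (just a sketch):

import matplotlib.pyplot as plt

# training curve: the MSE loss recorded by fit()
plt.plot(first_model_hist.history["loss"])
plt.xlabel("epoch")
plt.ylabel("MSE loss")
plt.show()

# error on the held-out 20%
test_mse = first_model.evaluate(X_test, y_test, verbose=0)
print("test MSE:", test_mse)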
# pull the "star" players only (salary above 100000), highest salary first
sql = "select * from baseball where money > 100000 order by money desc"
mycursor.execute(sql)
star_desc_result = mycursor.fetchall()
star_desc_df = pd.DataFrame(star_desc_result, columns=mycursor.column_names)
# keep only the stats that plausibly affect salary: wins (W), strikeouts (SO), innings pitched (IP), and WAR
star_x_mini = star_desc_df[['W', 'SO', 'IP', 'WAR']].astype(float)
star_y = star_desc_df['money']
# training
X_train, X_test, y_train, y_test = train_test_split(star_x_mini, star_y, test_size=0.2, random_state=13)
star_model_min = Sequential()
star_model_min.add(Dense(30, input_dim=4, activation="relu")) # rectified linear unit
star_model_min.add(Dense(6, activation="relu"))
star_model_min.add(Dense(1))
star_model_min.compile(loss="mean_squared_error", optimizer="adam")
star_model_min_hist = star_model_min.fit(X_train, y_train, epochs=1000, batch_size=10)
y_test_arr = np.array(y_test)
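The array above is presumably for comparing predictions against actual salaries; a sketch of that comparison with the reduced model:

# predicted vs. actual salary for the held-out star pitchers
star_pred = star_model_min.predict(X_test).flatten()
for predicted, actual in zip(star_pred, y_test_arr):
    print(f"predicted: {predicted:,.0f}   actual: {actual}")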
Lotte to first place, let's go!