개선하기
- google colaboratory에서 작업한 파일을 자동으로 google drive에 올리기
- 기존 코드로 접근하던 방식에서 종목 이름으로 접근하도록 바꾸기
colaboratory link
crawler.ipynb
source code
# Install the Selenium bindings and a Chromium driver into the notebook runtime.
# The leading "!" makes the notebook run each line as a shell command.
!pip install selenium
!apt-get update
!apt install chromium-chromedriver
from selenium import webdriver
from bs4 import BeautifulSoup
import re
from pandas import DataFrame, Series
import os
import time
from google.colab import drive
from os.path import join
# Directory where Google Drive is mounted.
ROOT = "/content/drive"
drive.mount(ROOT)
# Project folder, expressed relative to the Drive mount point.
MY_GOOGLE_DRIVE_PATH = 'My Drive/Colab Notebooks/new_project'
PROJECT_PATH = join(ROOT, MY_GOOGLE_DRIVE_PATH) # project path (mount point + project folder)
def set_driver():
    """Create a headless Chrome WebDriver.

    Returns:
        A ``webdriver.Chrome`` instance started with the ``--headless`` and
        ``--no-sandbox`` switches.
    """
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--no-sandbox')
    # ``options=`` is the supported keyword (``chrome_options=`` is deprecated
    # and later removed).  The positional 'chromedriver' path is omitted
    # because it was only restating the default executable name, and recent
    # Selenium releases no longer accept a path in that position.
    return webdriver.Chrome(options=chrome_options)
def get_code(name, driver):
    """Search Naver Finance for ``name`` and extract a code from the results.

    Args:
        name: Text typed into the site's search field.
        driver: Selenium WebDriver used to load the page and submit the search.

    Returns:
        The last six characters of the second link found inside the result
        page's ``<td>`` cells.
        NOTE(review): presumably that link belongs to the top search hit and
        ends with its six-digit stock code -- confirm against the live page.

    Raises:
        IndexError: If the result page does not contain a usable second link
            (for example, when the search returned nothing).
    """
    driver.get('https://finance.naver.com/')
    time.sleep(0.5)  # give the page time to load

    # find_element(by, value) is available in both Selenium 3 and 4, whereas
    # find_element_by_name was removed in Selenium 4.3.
    search_box = driver.find_element('name', 'query')
    search_box.send_keys(name)
    search_box.submit()
    time.sleep(0.5)  # wait 0.5 s for the result page

    soup = BeautifulSoup(driver.page_source, "html.parser")
    hrefs = [
        anchor.get('href')
        for cell in soup.find_all('td')
        for anchor in cell.find_all('a')
    ]
    if len(hrefs) < 2 or not hrefs[1]:
        raise IndexError(
            "no search result link found for {!r}".format(name))
    return hrefs[1][-6:]
def get_url(code, page):
    """Build the Naver Finance daily-price URL for one page of results.

    Args:
        code: Value substituted for the ``code`` query parameter.
        page: Value substituted for the ``page`` query parameter.

    Returns:
        The formatted URL string.
    """
    # Use https, the same scheme get_code uses for this host, so the request
    # does not start with a plaintext hop.
    template = ('https://finance.naver.com/item/sise_day.nhn'
                '?code={code}&page={page}')
    return template.format(code=code, page=page)
def data_save(name, data):
    """Write ``data`` to ``<PROJECT_PATH>/StockDataSet/<name>_data_save.csv``.

    Args:
        name: Used as the file-name prefix.
        data: Object exposing ``to_csv`` (a pandas DataFrame in this script).

    Returns:
        None.
    """
    # Save inside PROJECT_PATH rather than the current working directory; a
    # bare os.path.join(PROJECT_PATH) call has no effect on where files go.
    output_dir = os.path.join(PROJECT_PATH, "StockDataSet")
    # exist_ok avoids a separate existence check and also creates any missing
    # parent folders of the project path.
    os.makedirs(output_dir, exist_ok=True)

    filename = "{}_data_save.csv".format(name)
    data.to_csv(os.path.join(output_dir, filename), index=True)
def get_data(code, day, page, driver):
    """Scrape up to ``day`` daily-price rows, starting at result page ``page``.

    Each page is fetched with ``driver``, the text of every ``<span>`` found
    inside ``<td>`` cells is collected with whitespace removed, and the texts
    are grouped seven at a time into rows.  At most ten rows are taken from a
    page before moving on to the next one.

    Args:
        code: Stock code passed to ``get_url``.
        day: Number of rows wanted.
        page: Page number to start from.
        driver: Selenium WebDriver used to load each page.

    Returns:
        ``None`` when ``day <= 0``; otherwise a DataFrame indexed by '날짜'
        whose remaining columns hold the scraped strings.  Fewer than ``day``
        rows are returned if a page yields fewer complete rows than requested.
    """
    if day <= 0:
        return None

    columns = ['날짜', '종가', '전일비', '시가', '고가', '저가', '거래량']
    width = len(columns)
    rows_per_page = 10

    rows = []
    remaining = day
    # Iterate over pages instead of recursing so that large ``day`` values
    # cannot exhaust the recursion limit.
    while remaining > 0:
        driver.get(get_url(code, page))
        soup = BeautifulSoup(driver.page_source, "html.parser")
        texts = [
            re.sub(r'\s', '', span.get_text())  # strip all whitespace
            for cell in soup.find_all('td')
            for span in cell.find_all('span')
        ]

        wanted = min(remaining, rows_per_page)
        chunks = (texts[width * i:width * (i + 1)] for i in range(wanted))
        # Keep only complete rows; a short chunk means the page ran out.
        page_rows = [chunk for chunk in chunks if len(chunk) == width]
        rows.extend(page_rows)

        if len(page_rows) < wanted:
            break  # incomplete page: nothing more to read
        remaining -= rows_per_page
        page += 1

    # Build the frame once; DataFrame.append no longer exists in pandas 2.x.
    return DataFrame(rows, columns=columns).set_index("날짜")
# Interactive entry point: ask for a stock name and a row count, scrape the
# data, and save it as CSV.
driver = set_driver()
try:
    name = input('종목을 입력하세요: ')
    print("*"*10, "접근 중..", "*"*10)
    code = get_code(name, driver)
    print("*"*10, "접근 완료!", "*"*10)
    day = int(input('최근 N 일의 data를 불러옵니다: '))
    data = get_data(code, day, 1, driver)
    print("최근 {}일의 data를 불러왔습니다!".format(day))
    data_save(name, data)
finally:
    # Always shut the browser down, even if a step above fails, so repeated
    # runs do not leave headless Chrome processes behind.
    driver.quit()