HRD넷 크롤링

IngCoding·2022년 2월 28일
1

파이썬 #2 크롤링

목록 보기
5/7

1. 라이브러리 임포트

import time   
import warnings  
warnings.filterwarnings('ignore')  # 경고무시 

import pandas as pd  
import numpy as np  
import chromedriver_autoinstaller  
from selenium import webdriver  
# Open the HRD-Net site and move to the K-Digital Training listing.
# chromedriver_autoinstaller.install() fetches a matching chromedriver when
# needed and adds its directory to PATH, so Chrome() can locate the binary by
# itself. The positional driver-path argument of webdriver.Chrome() is no
# longer accepted by recent Selenium 4 releases, so nothing is passed here.
path = chromedriver_autoinstaller.install()
driver = webdriver.Chrome()
driver.get("https://www.hrd.go.kr/")
time.sleep(2)  # fixed pause after navigation (presumably to let the page load)
# find_element_by_link_text() was removed in Selenium 4.3; the generic
# find_element(by, value) form works on both Selenium 3 and Selenium 4.
driver.find_element("link text", "#K-Digital Training").click()  # open K-Digital Training

# One list per scraped field; the crawl loop appends one entry per course.
name_list = []
local_list = []
title_list = []
period_list = []

2. 크롤링 for 문

# Scrape the course listing page by page.
#
# Each pass of the outer loop reads the 10 list items of the page currently
# shown and then clicks the pagination link labelled j. With range(2, 11) this
# scrapes pages 1..9; the page reached by the final click (10) is not read.
#
# All four fields of a row are fetched first and appended together, so a
# failed lookup can never leave the four lists with different lengths (the
# DataFrame built from them requires equal lengths).
#
# NOTE(review): 'name' is read from p.zone and 'local' from p.school -
# confirm against the page that these two are not swapped.
try:
    for j in range(2, 11):
        for i in range(1, 11):
            # Common prefix of every selector for the i-th item of the list.
            item = f"#contentArea > div.detailListWrap > ul > li:nth-child({i})"

            # find_element_by_css_selector() was removed in Selenium 4.3;
            # find_element(by, value) works on Selenium 3 and 4 alike.
            name = driver.find_element(
                "css selector", f"{item} > div.title > a > p.zone"
            ).text
            local = driver.find_element(
                "css selector", f"{item} > div.title > a > p.school"
            ).text
            # The link's title attribute carries a trailing " 새창"
            # ("new window") marker that is stripped from the course title.
            title = driver.find_element(
                "css selector", f"{item} > div.content > p > a"
            ).get_attribute('title').replace(" 새창", "")
            period = driver.find_element(
                "css selector",
                f"{item} > div.content > div > dl:nth-child(2) > dd",
            ).text

            # Reached only when every lookup above succeeded.
            name_list.append(name)
            local_list.append(local)
            title_list.append(title)
            period_list.append(period)

        driver.find_element("link text", str(j)).click()
        time.sleep(2)  # fixed pause after switching pages
except Exception as exc:
    # Best effort: keep the rows collected so far, but report why crawling
    # stopped instead of hiding the failure. Unlike a bare except, this lets
    # KeyboardInterrupt / SystemExit propagate.
    print(f"Crawling stopped early: {exc!r}")

3. CSV 파일로 변환

# The conversion below only works when all four lists have the same length,
# so print the lengths side by side for a quick check.
print(*map(len, (name_list, local_list, title_list, period_list)))
90 90 90 90
# Assemble the scraped columns into a table (column order: name, local,
# title, period) and write it to K-digital.csv without the index column.
df = pd.DataFrame(
    dict(
        name=name_list,
        local=local_list,
        title=title_list,
        period=period_list,
    )
)
df.to_csv("K-digital.csv", index=False, encoding='utf-8-sig')
profile
Data & PM

0개의 댓글