<Scraping Saramin Backend Job Listings>
import requests
from bs4 import BeautifulSoup
saramin_result = requests.get("https://www.saramin.co.kr/zf_user/search/recruit?searchType=search&company_cd=0%2C1%2C2%2C3%2C4%2C5%2C6%2C7%2C9%2C10&searchword=%EB%B0%B1%EC%97%94%EB%93%9C&panel_type=&search_optional_item=y&search_done=y&panel_count=y&recruitPage=1&recruitSort=relation&recruitPageCount=40&inner_com_type=&quick_apply=&except_read=")
saramin_soup = BeautifulSoup(saramin_result.text, "html.parser")
pagination = saramin_soup.find("div", {"class": "pagination"})
pages = pagination.find_all("a")
spans = []
# collect the page-number spans; the last link is the "next" button, so drop it
for page in pages:
    spans.append(page.find("span"))
spans = spans[:-1]
As before, this crawls up to page 10, which is everything before the "next" button. Up to this point the process was almost identical to scraping the Indeed site; even the tag names are the same.
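To make that slice concrete, here is a minimal, self-contained sketch of the pagination structure these selectors assume. The HTML sample is a reconstruction for illustration, not markup copied from the live Saramin page:

from bs4 import BeautifulSoup

# Reconstructed sample: numbered page links followed by a "next" (다음)
# button, which is the shape the selectors above rely on.
sample = """
<div class="pagination">
  <a href="#"><span>1</span></a>
  <a href="#"><span>2</span></a>
  <a href="#"><span>10</span></a>
  <a href="#"><span>다음</span></a>
</div>
"""
soup = BeautifulSoup(sample, "html.parser")
links = soup.find("div", {"class": "pagination"}).find_all("a")
print([a.get_text(strip=True) for a in links[:-1]])  # ['1', '2', '10']

The [:-1] slice is what drops the trailing "next" button, leaving only the numbered page links.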
import requests
from bs4 import BeautifulSoup
saramin_result = requests.get("https://www.saramin.co.kr/zf_user/search/recruit?searchType=search&company_cd=0%2C1%2C2%2C3%2C4%2C5%2C6%2C7%2C9%2C10&searchword=%EB%B0%B1%EC%97%94%EB%93%9C&panel_type=&search_optional_item=y&search_done=y&panel_count=y&recruitPage=1&recruitSort=relation&recruitPageCount=40&inner_com_type=&quick_apply=&except_read=")
saramin_soup = BeautifulSoup(saramin_result.text, "html.parser")
pagination = saramin_soup.find("div", {"class": "pagination"})
links = pagination.find_all("a")
pages = []
for link in links[:-1]:
    pages.append(int(link.string))
max_page = pages[-1]
saramin.py
import requests
from bs4 import BeautifulSoup
URL = "https://www.saramin.co.kr/zf_user/search/recruit?searchType=search&company_cd=0%2C1%2C2%2C3%2C4%2C5%2C6%2C7%2C9%2C10&searchword=%EB%B0%B1%EC%97%94%EB%93%9C&panel_type=&search_optional_item=y&search_done=y&panel_count=y&recruitPage=1&recruitSort=relation&recruitPageCount=40&inner_com_type=&quick_apply=&except_read="
def extract_saramin_pages():
    saramin_result = requests.get(URL)
    saramin_soup = BeautifulSoup(saramin_result.text, "html.parser")
    pagination = saramin_soup.find("div", {"class": "pagination"})
    links = pagination.find_all("a")
    pages = []
    for link in links[:-1]:
        pages.append(int(link.string))
    max_page = pages[-1]
    return max_page

def extract_saramin_jobs(last_page):
    for page in range(last_page):
        result = requests.get(URL)
        print(result.status_code)  # sanity check: every page should print 200
main.py
from saramin import extract_saramin_pages, extract_saramin_jobs
last_saramin_page = extract_saramin_pages()
extract_saramin_jobs(last_saramin_page)
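At this stage main.py only prints one HTTP status code per page; a column of 200s confirms the requests are going through before any parsing is layered on top.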
import requests
from bs4 import BeautifulSoup
URL = "https://www.saramin.co.kr/zf_user/search/recruit?searchType=search&company_cd=0%2C1%2C2%2C3%2C4%2C5%2C6%2C7%2C9%2C10&searchword=%EB%B0%B1%EC%97%94%EB%93%9C&panel_type=&search_optional_item=y&search_done=y&panel_count=y&recruitPage=1&recruitSort=relation&recruitPageCount=40&inner_com_type=&quick_apply=&except_read="
def extract_saramin_pages():
    result = requests.get(URL)
    soup = BeautifulSoup(result.text, "html.parser")
    pagination = soup.find("div", {"class": "pagination"})
    links = pagination.find_all("a")
    pages = []
    for link in links[:-1]:
        pages.append(int(link.string))
    max_page = pages[-1]
    return max_page

def extract_saramin_jobs(last_page):
    # for page in range(last_page):
    result = requests.get(URL)
    soup = BeautifulSoup(result.text, "html.parser")
    results = soup.find_all("div", {"class": "area_job"})
    for result in results:
        title = result.find("h2", {"class": "job_tit"}).find("a")["title"]
        print(title)
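One caveat worth noting before going further: the chained result.find(...).find("a")["title"] access raises an AttributeError (or KeyError) the moment a single listing lacks the expected markup. A more defensive variant, offered as a sketch of my own rather than code from the original script:

def extract_title(result):
    # Return None instead of crashing when a listing's markup differs,
    # e.g. a promoted block without the usual h2.job_tit > a structure.
    heading = result.find("h2", {"class": "job_tit"})
    if heading is None:
        return None
    anchor = heading.find("a")
    if anchor is None:
        return None
    return anchor.get("title")

Titles collected this way can then be filtered with a simple "is not None" check.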
saramin.py (the last_page section)
def extract_saramin_jobs(last_page):
    jobs = []
    # for page in range(last_page):
    result = requests.get(URL)
    soup = BeautifulSoup(result.text, "html.parser")
    results = soup.find_all("div", {"class": "area_job"})
    for result in results:
        title = result.find("h2", {"class": "job_tit"}).find("a")["title"]
    results2 = soup.find_all("div", {"class": "area_corp"})
    for result2 in results2:
        company = result2.find("strong", {"class": "corp_name"}).find("a")["title"]
        print(company)
    return jobs
def extract_saramin_jobs(last_page):
    jobs = []
    # for page in range(last_page):
    result = requests.get(URL)
    soup = BeautifulSoup(result.text, "html.parser")
    results = soup.find_all("div", {"class": "area_job"})
    for result in results:
        title = result.find("h2", {"class": "job_tit"}).find("a")["title"]
        location = result.find("div", {"class": "job_condition"}).find("span").get_text()
    results2 = soup.find_all("div", {"class": "area_corp"})
    for result2 in results2:
        company = result2.find("strong", {"class": "corp_name"}).find("a")["title"]
    return jobs
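The structural weakness of this version: titles and locations come from one find_all pass over area_job, companies from a second pass over area_corp, so the only way to pair them back up is by position, and that pairing silently breaks if either selector matches an extra or missing element. A sketch of what the re-pairing would look like (my illustration, assuming soup is the parsed page from above):

# Pair the two independent result lists by position. If one selector
# skips or adds an element, every pair after that point is wrong.
job_divs = soup.find_all("div", {"class": "area_job"})
corp_divs = soup.find_all("div", {"class": "area_corp"})
for job_div, corp_div in zip(job_divs, corp_divs):
    title = job_div.find("h2", {"class": "job_tit"}).find("a")["title"]
    company = corp_div.find("strong", {"class": "corp_name"}).find("a")["title"]
    print(company, "-", title)

This index-pairing fragility is exactly what switching to a single shared container class removes in the next version: each item_recruit container holds both the job and the company markup, so one loop reads both.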
import requests
from bs4 import BeautifulSoup
URL = "https://www.saramin.co.kr/zf_user/search/recruit?searchType=search&company_cd=0%2C1%2C2%2C3%2C4%2C5%2C6%2C7%2C9%2C10&searchword=%EB%B0%B1%EC%97%94%EB%93%9C&panel_type=&search_optional_item=y&search_done=y&panel_count=y&recruitPage=1&recruitSort=relation&recruitPageCount=40&inner_com_type=&quick_apply=&except_read="
def extract_saramin_pages():
    result = requests.get(URL)
    soup = BeautifulSoup(result.text, "html.parser")
    pagination = soup.find("div", {"class": "pagination"})
    links = pagination.find_all("a")
    pages = []
    for link in links[:-1]:
        pages.append(int(link.string))
    max_page = pages[-1]
    return max_page

def extract_jobs(html):
    company = html.find("strong", {"class": "corp_name"}).find("a")["title"]
    title = html.find("h2", {"class": "job_tit"}).find("a")["title"]
    location = html.find("div", {"class": "job_condition"}).find("span").get_text()
    return {"company": company, "title": title, "location": location}

def extract_saramin_jobs(last_page):
    jobs = []
    for page in range(last_page):
        # swap the recruitPage value in the URL so each iteration actually
        # fetches a different page instead of requesting page 1 every time
        result = requests.get(URL.replace("recruitPage=1", f"recruitPage={page + 1}"))
        soup = BeautifulSoup(result.text, "html.parser")
        results = soup.find_all("div", {"class": "item_recruit"})
        for result in results:
            job = extract_jobs(result)
            jobs.append(job)
    return jobs
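One more consideration with the final version: extract_saramin_jobs now fires one request per page back to back. Adding a short pause between pages is an easy courtesy to the server; a minimal sketch (my addition, not part of the original script, reusing the URL and last_page names from above):

import time

# Same page loop as above, with a one-second pause between requests
# so the scraper does not hammer the site.
for page in range(last_page):
    page_url = URL.replace("recruitPage=1", f"recruitPage={page + 1}")
    result = requests.get(page_url)
    time.sleep(1)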
main.py
from saramin import extract_saramin_pages, extract_saramin_jobs
last_saramin_page = extract_saramin_pages()
saramin_jobs = extract_saramin_jobs(last_saramin_page)
print(saramin_jobs)
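If everything works, saramin_jobs prints as a list of dictionaries, one per listing, each carrying the company, title, and location keys that extract_jobs builds.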
At the end, without much expectation, I changed the shared class value. Switching the class to item_recruit removed the earlier approach of running two separate for loops and produced exactly the result I wanted. Next time I should look into whether class values can be kept consistent from the start. Applying Flask will probably still be difficult, since I'm not yet comfortable with it.