Process
- Save each column's values with csv.writer.writerow (a minimal sketch follows right after this list).
- Put all the pieces together.
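Before the full save.py below, here is a minimal sketch of csv.writer.writerow on its own; the file name demo.csv and the sample values are placeholders, not part of the project.

import csv

# Open with newline="" so the csv writer controls line endings itself
# (otherwise Windows can insert blank rows between records).
with open("demo.csv", mode="w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerow(["title", "company"])            # one call writes one row
    writer.writerow(["Python Developer", "ACME"])    # a sample data row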
save.py
import csv

def save_to_file(jobs):
    # newline="" lets the csv writer manage line endings (no blank rows on Windows),
    # and the with-block closes the file once every row has been written.
    with open("jobs.csv", mode="w", newline="", encoding="utf-8") as file:
        writer = csv.writer(file)
        writer.writerow(["title", "company", "location", "link"])  # header row
        for job in jobs:
            writer.writerow(list(job.values()))
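A quick, hypothetical way to try save_to_file in isolation; the sample job dict below is made up. Note that list(job.values()) only lines up with the header because every job dict is built with its keys in the same order: title, company, location, link.

sample_jobs = [
    {"title": "Python Developer", "company": "ACME",
     "location": "Seoul", "link": "https://example.com/job/1"},
]
save_to_file(sample_jobs)  # writes jobs.csv: a header row plus one data row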
indeed.py
import requests
from bs4 import BeautifulSoup

LIMIT = 50
URL = f"https://kr.indeed.com/%EC%B7%A8%EC%97%85?q=python&limit={LIMIT}"

def extract_pages():
    # Read the pagination block and return the highest page number.
    result = requests.get(URL)
    soup = BeautifulSoup(result.text, "html.parser")
    pagination = soup.find("div", class_="pagination")
    links = pagination.find_all("a")
    pages = []
    for link in links[:-1]:  # the last anchor is the "next" button
        pages.append(int(link.string))
    max_page = pages[-1]
    return max_page

def extract_job(html):
    # Pull title, company, location, and link out of a single job card.
    title = html.find("h2", {"class": "title"}).find("a")["title"]
    comp = html.find("span", {"class": "company"})
    comp_anchor = comp.find("a")
    if comp_anchor is not None:
        comp_name = str(comp_anchor.string).strip("\n ")
    else:
        comp_name = str(comp.string).strip("\n ")
    location = html.find("div", {"class": "recJobLoc"})["data-rc-loc"]
    job_id = html.find("h2", {"class": "title"}).find("a")["href"]
    return {"title": title, "company": comp_name, "location": location, "link": f"https://kr.indeed.com{job_id}"}

def extract_jobs(max_page):
    # Visit every result page and collect the parsed job cards.
    jobs = []
    for page in range(max_page):
        print(f"Scraping Indeed page {page}")
        result = requests.get(f"{URL}&start={page * LIMIT}")
        soup = BeautifulSoup(result.text, "html.parser")
        job_cards = soup.find_all("div", {"class": "jobsearch-SerpJobCard"})
        for job_card in job_cards:
            job = extract_job(job_card)
            jobs.append(job)
    return jobs

def get_jobs():
    last_page = extract_pages()
    jobs = extract_jobs(last_page)
    return jobs
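To try indeed.py on its own, a small test block like the sketch below could be appended to the file; it fetches only the first result page to keep the run short. The page layout and class names on kr.indeed.com may have changed since this was written, so treat this as an assumption.

if __name__ == "__main__":
    last = extract_pages()
    print(f"Found {last} result pages")
    for job in extract_jobs(1)[:3]:  # scrape only page 0 and show a few rows
        print(job)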
so.py
import requests
from bs4 import BeautifulSoup

URL = "https://stackoverflow.com/jobs?q=python"

def get_last_page():
    # The final pagination anchor is the "next" button,
    # so the second-to-last one holds the last page number.
    result = requests.get(URL)
    soup = BeautifulSoup(result.text, "html.parser")
    pagination = soup.find("div", {"class": "s-pagination"})
    links = pagination.find_all("a")
    last_page = links[-2].get_text(strip=True)
    return int(last_page)

def extract_job(html):
    # Pull title, company, location, and link out of a single job card.
    title = html.find("h2", {"class": "mb4"}).find("a")["title"]
    company, location = html.find("h3", {"class": "mb4"}).find_all("span", recursive=False)
    company = company.get_text(strip=True)
    location = location.get_text(strip=True)
    job_id = html["data-jobid"]
    link = f"https://stackoverflow.com/jobs/{job_id}"
    return {"title": title, "company": company, "location": location, "link": link}

def extract_jobs(last_page):
    jobs = []
    for page in range(last_page):
        print(f"Scraping StackOverflow page {page + 1}")
        res = requests.get(f"{URL}&pg={page + 1}")
        html = BeautifulSoup(res.text, "html.parser")
        job_cards = html.find_all("div", {"class": "-job"})
        for job_card in job_cards:
            job = extract_job(job_card)
            jobs.append(job)
    return jobs

def get_jobs():
    last_page = get_last_page()
    jobs = extract_jobs(last_page)
    return jobs
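Why get_last_page reads links[-2]: the final anchor in the pagination block is the "next" button, so the one before it holds the highest page number. The toy HTML below is made up purely to illustrate that slice.

from bs4 import BeautifulSoup

snippet = """
<div class="s-pagination">
  <a>1</a><a>2</a><a>3</a><a>next</a>
</div>
"""
pagination = BeautifulSoup(snippet, "html.parser").find("div", {"class": "s-pagination"})
links = pagination.find_all("a")
print(links[-2].get_text(strip=True))  # -> 3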
main.py
from indeed import get_jobs as get_id_jobs
from so import get_jobs as get_so_jobs
from save import save_to_file
jobs = get_id_jobs() + get_so_jobs()
save_to_file(jobs)
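As an optional sanity check (not part of the original main.py), the finished jobs.csv can be read back with csv.DictReader:

import csv

with open("jobs.csv", newline="", encoding="utf-8") as file:
    for i, row in enumerate(csv.DictReader(file)):
        print(row["title"], "|", row["company"], "|", row["location"])
        if i == 2:  # show only the first three rows
            break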
Result