Scraping in Python (3) - save data to CSV format

dk99521·2021년 2월 28일
0

scrapping-python

목록 보기
4/5

Process

  1. csv.writer 객체의 writerow 메서드를 사용해서 헤더와 각 행(row)의 값을 저장합니다.
  2. 총정리 합니다.

save.py

import csv

def save_to_file(jobs):
  """Write the scraped jobs to ``jobs.csv`` in the current directory.

  The file is overwritten on every call. The first row is the header
  ``title, company, location, link``; each following row holds one job.

  Args:
    jobs: Iterable of dicts keyed by "title", "company", "location" and
      "link". A missing key is written as an empty cell.

  Returns:
    None.
  """
  columns = ["title", "company", "location", "link"]

  # newline="" lets the csv module control line endings (otherwise blank
  # lines appear between rows on Windows). An explicit UTF-8 encoding keeps
  # non-ASCII job titles writable regardless of the OS locale. The
  # with-block guarantees the file is flushed and closed, even on error.
  with open("jobs.csv", mode="w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerow(columns)

    for job in jobs:
      # Pick the values by column name so every cell lines up with the
      # header regardless of the key order inside the dict.
      writer.writerow([job.get(column, "") for column in columns])

indeed.py

import requests
from bs4 import BeautifulSoup

# Number of results requested per page; also used as the step for the
# "start" offset when paging through the results.
LIMIT = 50
# Indeed Korea search URL for the query "python". The path segment is a
# percent-encoded (UTF-8) Korean word.
URL = f"https://kr.indeed.com/%EC%B7%A8%EC%97%85?q=python&limit={LIMIT}"

def extract_pages():
  """Return the highest page number linked from the first Indeed results page.

  Fetches ``URL``, parses the HTML and reads the anchors inside the
  ``div`` with class ``pagination``. Only anchors whose text is a decimal
  number are considered, so "next"-style links are skipped automatically.

  Returns:
    int: The largest page number found, or 1 when the page has no
    pagination element or no numbered links (i.e. a single page of results).
  """
  result = requests.get(URL)
  soup = BeautifulSoup(result.text, "html.parser")

  # find() returns None when nothing matches; without this guard a search
  # that fits on one page would crash with AttributeError.
  pagination = soup.find("div", class_="pagination")
  if pagination is None:
    return 1

  pages = []
  for link in pagination.find_all("a"):
    # get_text() always returns a string, unlike .string which can be None.
    text = link.get_text(strip=True)
    if text.isdecimal():
      pages.append(int(text))

  return max(pages, default=1)


def extract_job(html):
  """Build a job record from a single Indeed result card.

  Args:
    html: BeautifulSoup tag for one job card. The code looks up an ``a``
      inside ``h2.title`` (reading its ``title`` and ``href`` attributes),
      a ``span.company`` and a ``div.recJobLoc`` with a ``data-rc-loc``
      attribute.

  Returns:
    dict: ``{"title", "company", "location", "link"}``. "company" is an
    empty string when the card has no company element.
  """
  # Look the title anchor up once; it carries both the title and the link.
  title_anchor = html.find("h2", {"class": "title"}).find("a")
  title = title_anchor["title"]

  comp_name = ""
  comp = html.find("span", {"class": "company"})
  if comp is not None:
    # Prefer the anchor when the company name is a link, otherwise use the
    # span itself.
    comp_anchor = comp.find("a")
    name_tag = comp if comp_anchor is None else comp_anchor
    # get_text() never returns None, so a tag with nested children no longer
    # produces the literal string "None" as str(tag.string) did.
    comp_name = name_tag.get_text(" ", strip=True)

  location = html.find("div", {"class": "recJobLoc"})["data-rc-loc"]

  # The href is used as a path appended directly to the site root.
  job_id = title_anchor["href"]

  return {
    "title": title,
    "company": comp_name,
    "location": location,
    "link": f"https://kr.indeed.com{job_id}",
  }
  

# Request each results page in turn (5 pages when this note was written)
def extract_jobs(max_page):
  """Scrape ``max_page`` Indeed result pages and collect every job found.

  Args:
    max_page: Number of pages to request; page ``n`` is fetched with a
      ``start`` offset of ``n * LIMIT``.

  Returns:
    list: One dict per job card, as produced by ``extract_job``.
  """
  jobs = []
  for page in range(max_page):
    print(f"scrapping ID page {page}")
    response = requests.get(f"{URL}&start={page*LIMIT}")
    document = BeautifulSoup(response.text, "html.parser")
    cards = document.find_all("div", {"class": "jobsearch-SerpJobCard"})
    jobs.extend(extract_job(card) for card in cards)
  return jobs




def get_jobs():
  """Return every Indeed job: find the page count, then scrape each page."""
  return extract_jobs(extract_pages())

so.py

import requests
from bs4 import BeautifulSoup


# Stack Overflow job search URL with the query fixed to "python"; a "pg"
# parameter is appended to it when requesting individual result pages.
URL = "https://stackoverflow.com/jobs?q=python"

def get_last_page():
  """Return the highest page number shown in the Stack Overflow pagination.

  Fetches ``URL``, parses the HTML and reads the anchors inside the ``div``
  with class ``s-pagination``. Only anchors whose text is a decimal number
  are considered, so "next"/"prev"-style links are ignored.

  Returns:
    int: The largest page number found, or 1 when the page has no
    pagination element or no numbered links (a single page of results).
  """
  result = requests.get(URL)
  soup = BeautifulSoup(result.text, "html.parser")

  # find() returns None when nothing matches; guard against it instead of
  # crashing with AttributeError on a search that fits on one page.
  pagination = soup.find("div", {"class": "s-pagination"})
  if pagination is None:
    return 1

  page_numbers = []
  for link in pagination.find_all("a"):
    text = link.get_text(strip=True)
    if text.isdecimal():
      page_numbers.append(int(text))

  # Taking the maximum avoids depending on the exact position of the last
  # numbered link (previously links[-2]).
  return max(page_numbers, default=1)
  
def extract_job(html):
  """Turn one Stack Overflow job card into a plain dict.

  Args:
    html: BeautifulSoup tag for a job card. The code reads its
      ``data-jobid`` attribute, the ``title`` attribute of the ``a`` inside
      ``h2.mb4``, and the direct ``span`` children of ``h3.mb4``.

  Returns:
    dict: ``{"title", "company", "location", "link"}``.

  Raises:
    ValueError: If ``h3.mb4`` does not have exactly two direct ``span``
      children (the unpacking below requires exactly two).
  """
  heading = html.find("h2", {"class": "mb4"})
  title = heading.find("a")["title"]

  subheading = html.find("h3", {"class": "mb4"})
  # Direct children only: the first span is unpacked as the company and the
  # second as the location.
  company_tag, location_tag = subheading.find_all("span", recursive=False)
  company = company_tag.get_text(strip=True)
  location = location_tag.get_text(strip=True)

  job_id = html["data-jobid"]
  return {
    "title": title,
    "company": company,
    "location": location,
    "link": f"https://stackoverflow.com/jobs/{job_id}",
  }

def extract_jobs(last_page):
  """Scrape Stack Overflow result pages 1..``last_page`` and collect the jobs.

  Args:
    last_page: Number of pages to request; pages are numbered from 1 in the
      ``pg`` query parameter.

  Returns:
    list: One dict per job card, as produced by ``extract_job``.
  """
  jobs = []
  for page in range(last_page):
    print(f"scrapping SO {page+1}")
    response = requests.get(f"{URL}&pg={page+1}")
    document = BeautifulSoup(response.text, "html.parser")
    cards = document.find_all("div", {"class": "-job"})
    jobs.extend(extract_job(card) for card in cards)
  return jobs


def get_jobs():
  """Return every Stack Overflow job: find the last page, then scrape all pages."""
  return extract_jobs(get_last_page())

main.py

from indeed import get_jobs as get_id_jobs
from so import get_jobs as get_so_jobs
from save import save_to_file

# Scrape both sites (Indeed first, then Stack Overflow), merge the results
# into one list and write them to the CSV file.
indeed_jobs = get_id_jobs()
stack_overflow_jobs = get_so_jobs()
jobs = indeed_jobs + stack_overflow_jobs
save_to_file(jobs)

결과

profile
쫓다.

0개의 댓글