<노마드 코더 웹 스크래핑 수업 통해 웹 크롤링 배워보기>
import requests

# First experiment: fetch one page of Indeed search results for "python"
# (50 postings per page) and dump the raw HTML.
response = requests.get("https://kr.indeed.com/jobs?q=python&limit=50")
print(response.text)
여기서 사용한 모듈은 requests
import requests
from bs4 import BeautifulSoup

# Fetch the search-results page and collect the <span> inside every
# pagination anchor (each span holds a visible page number).
indeed_result = requests.get("https://kr.indeed.com/%EC%B7%A8%EC%97%85?q=python&limit=50")
indeed_soup = BeautifulSoup(indeed_result.text, "html.parser")

pagination = indeed_soup.find("div", {"class": "pagination"})
pages = pagination.find_all('a')

spans = []
for page in pages:
    # Loop body must be indented — the pasted original had it at column 0,
    # which is an IndentationError.
    spans.append(page.find("span"))
spans = spans[:-1]  # drop the last entry (the "Next" arrow, not a number)
페이지를 HTML 태그 순서대로 뽑아온다. 근데 페이지가 많이 개편되었나보다. 전에 했던 것과 태그가 좀 달라졌고 문제는 16페이지까지 있는 페이지 넘버가 2~5페이지밖에 안 긁힌다.(홈페이지 태그 변경이 확실한 듯)
import requests
from bs4 import BeautifulSoup

# Fetch the results page and derive the highest page number visible in the
# pagination bar.
indeed_result = requests.get("https://kr.indeed.com/%EC%B7%A8%EC%97%85?q=python&limit=50")
indeed_soup = BeautifulSoup(indeed_result.text, "html.parser")

pagination = indeed_soup.find("div", {"class": "pagination"})
links = pagination.find_all('a')

pages = []
for link in links[:-1]:  # the last anchor is the "Next" arrow, not a number
    pages.append(int(link.string))
# NOTE(review): this slice drops a real page number on top of the "Next"
# anchor already excluded above — per the author's note the site markup
# changed, so only pages 2-5 were scraped; verify against the live page.
pages = pages[:-1]
max_page = pages[-1]
원래는 16페이지까지인데 보니까 보이는 페이지별로 태그를 잘랐다. 나는 인덱스 4페이지까지밖에 안 간다. 아마 5에서 짤렸으니 인덱스 번호가 4인게 맞지 싶다.
(indeed.py)
import requests
from bs4 import BeautifulSoup
LIMIT = 50  # number of job postings requested per results page
# Must be an f-string: without the f-prefix the literal text "{LIMIT}" is
# sent as the query parameter instead of the number 50.
URL = f"https://kr.indeed.com/%EC%B7%A8%EC%97%85?q=python&limit={LIMIT}"
def extract_indeed_pages():
    """Return the highest page number shown in the pagination bar.

    Fetches URL, parses the "pagination" div, converts every anchor's text
    to an int (skipping the trailing "Next" anchor) and returns the last
    number collected.
    """
    result = requests.get(URL)
    soup = BeautifulSoup(result.text, "html.parser")
    pagination = soup.find("div", {"class": "pagination"})
    links = pagination.find_all('a')
    pages = []
    for link in links[:-1]:  # last anchor is the "Next" arrow, not a number
        pages.append(int(link.string))
    max_page = pages[-1]  # last collected number is treated as the max
    return max_page
def extract_indeed_jobs(last_page):
    """Request every results page and print its HTTP status code.

    200 on each line means the paging requests work. Indeed paginates with
    a &start= offset that advances LIMIT (50) postings at a time.
    """
    for page in range(last_page):
        result = requests.get(f"{URL}&start={page*LIMIT}")
        print(result.status_code)
먼저 전에 했던 내용 함수처리 하여 코드 정리 후 페이지가 50 단위로 이동하는 것을 착안하여 정리.
status_code : 200이 나와야 정상작동 한다는 것.
(main.py)
# Entry point: ask indeed.py for the page count, then request every page.
from indeed import extract_indeed_pages, extract_indeed_jobs

max_indeed_page = extract_indeed_pages()
extract_indeed_jobs(max_indeed_page)
모든 파일은 indeed.py에서 작동. main.py에 import해줌.
(indeed.py)
html 태그 하나씩 보면서 찾은 뒤 마지막에 체이닝
def extract_indeed_jobs(last_page):
    """Scrape one results page and print each job title.

    The per-page loop is disabled while the extraction logic is being
    developed; only the first page (start=0) is fetched.
    """
    jobs = []
    # for page in range(last_page):
    result = requests.get(f"{URL}&start={0*LIMIT}")
    soup = BeautifulSoup(result.text, "html.parser")
    results = soup.find_all("div", {"class": "jobsearch-SerpJobCard"})
    for result in results:
        # The anchor inside the title <h2> carries the full title attribute.
        title = result.find("h2", {"class": "title"}).find("a")["title"]
        print(title)
    return jobs  # still empty at this stage; titles are only printed
(indeed.py)
def extract_indeed_jobs(last_page):
    """Scrape the first results page and print each job's title and company.

    The per-page loop stays commented out while the card-parsing logic is
    being developed.
    """
    jobs = []
    # for page in range(last_page):
    result = requests.get(f"{URL}&start={0*LIMIT}")
    soup = BeautifulSoup(result.text, "html.parser")
    results = soup.find_all("div", {"class": "jobsearch-SerpJobCard"})
    for result in results:
        title = result.find("h2", {"class": "title"}).find("a")["title"]
        company = result.find("span", {"class": "company"})
        company_anchor = company.find('a')
        # Company names are sometimes wrapped in an <a>, sometimes plain text.
        if company_anchor is not None:
            company = str(company_anchor.string)
        else:
            company = str(company.string)
        company = company.strip()
        print(title, company)  # show both fields together
    return jobs  # still empty; data is only printed at this stage
(indeed.py)
import requests
from bs4 import BeautifulSoup
LIMIT = 50  # number of job postings requested per results page
# Must be an f-string: without the f-prefix the literal text "{LIMIT}" is
# sent as the query parameter instead of the number 50.
URL = f"https://kr.indeed.com/%EC%B7%A8%EC%97%85?q=python&limit={LIMIT}"
def extract_indeed_pages():
    """Return the highest page number shown in the pagination bar.

    Fetches URL, parses the "pagination" div, converts every anchor's text
    to an int (skipping the trailing "Next" anchor) and returns the last
    number collected.
    """
    result = requests.get(URL)
    soup = BeautifulSoup(result.text, "html.parser")
    pagination = soup.find("div", {"class": "pagination"})
    links = pagination.find_all('a')
    pages = []
    for link in links[:-1]:  # last anchor is the "Next" arrow, not a number
        pages.append(int(link.string))
    max_page = pages[-1]  # last collected number is treated as the max
    return max_page
def extract_jobs(html):  # card-parsing pulled out into its own function
    """Extract the title and company from one job-card element.

    html: a BeautifulSoup tag for a single "jobsearch-SerpJobCard" div.
    Returns a dict with 'title' and 'company' keys.
    """
    title = html.find("h2", {"class": "title"}).find("a")["title"]
    company = html.find("span", {"class": "company"})
    company_anchor = company.find('a')
    # Company names are sometimes wrapped in an <a>, sometimes plain text.
    if company_anchor is not None:
        company = str(company_anchor.string)
    else:
        company = str(company.string)
    company = company.strip()
    return {'title': title, 'company': company}
def extract_indeed_jobs(last_page):
    """Scrape the first results page and return a list of job dicts.

    The per-page loop stays commented out while extraction is developed;
    only start=0 is fetched.
    """
    jobs = []
    # for page in range(last_page):
    result = requests.get(f"{URL}&start={0*LIMIT}")
    soup = BeautifulSoup(result.text, "html.parser")
    results = soup.find_all("div", {"class": "jobsearch-SerpJobCard"})
    for result in results:
        job = extract_jobs(result)  # parse one card into a dict
        jobs.append(job)  # collect dicts instead of printing
    return jobs
(main.py)
# Driver script: find the page count, scrape the jobs, and dump the result.
from indeed import extract_indeed_pages, extract_indeed_jobs

total_pages = extract_indeed_pages()
scraped_jobs = extract_indeed_jobs(total_pages)
print(scraped_jobs)
(indeed.py) #홈페이지 한국 버전으로는 이 코드도 에러가 나지 않았음.
def extract_jobs(html):
    """Extract title, company and location from one job-card element.

    html: a BeautifulSoup tag for a single "jobsearch-SerpJobCard" div.
    Returns a dict with 'title', 'company' and 'location' keys.
    """
    title = html.find("h2", {"class": "title"}).find("a")["title"]
    company = html.find("span", {"class": "company"})
    company_anchor = company.find('a')
    # Company names are sometimes wrapped in an <a>, sometimes plain text.
    if company_anchor is not None:
        company = str(company_anchor.string)
    else:
        company = str(company.string)
    company = company.strip()
    location = html.find("span", {"class": "location"}).string
    return {'title': title, 'company': company, "location": location}
(indeed.py)
import requests
from bs4 import BeautifulSoup
LIMIT = 50  # number of job postings requested per results page
# Must be an f-string: without the f-prefix the literal text "{LIMIT}" is
# sent as the query parameter instead of the number 50.
URL = f"https://kr.indeed.com/%EC%B7%A8%EC%97%85?q=python&limit={LIMIT}"
def extract_indeed_pages():
    """Return the highest page number shown in the pagination bar.

    Fetches URL, parses the "pagination" div, converts every anchor's text
    to an int (skipping the trailing "Next" anchor) and returns the last
    number collected.
    """
    result = requests.get(URL)
    soup = BeautifulSoup(result.text, "html.parser")
    pagination = soup.find("div", {"class": "pagination"})
    links = pagination.find_all('a')
    pages = []
    for link in links[:-1]:  # last anchor is the "Next" arrow, not a number
        pages.append(int(link.string))
    max_page = pages[-1]  # last collected number is treated as the max
    return max_page
def extract_jobs(html):
    """Extract title, company, location and a detail link from one job card.

    html: a BeautifulSoup tag for one "jobsearch-SerpJobCard" div; its
    "data-jk" attribute is Indeed's job id, used to build the detail link.
    Returns a dict with 'title', 'company', 'location' and 'link' keys.
    """
    title = html.find("h2", {"class": "title"}).find("a")["title"]
    company = html.find("span", {"class": "company"})
    company_anchor = company.find('a')
    # Company names are sometimes wrapped in an <a>, sometimes plain text.
    if company_anchor is not None:
        company = str(company_anchor.string)
    else:
        company = str(company.string)
    company = company.strip()
    location = html.find("span", {"class": "location"}).string
    job_id = html["data-jk"]
    return {
        'title': title,
        'company': company,
        "location": location,
        "link": f"https://kr.indeed.com/%EC%B7%A8%EC%97%85?q=python&limit=50&vjk={job_id}",
    }
def extract_indeed_jobs(last_page):
    """Scrape every results page and return one flat list of job dicts.

    last_page: total page count from extract_indeed_pages(). The &start=
    offset advances in steps of LIMIT (50) postings per page.
    """
    jobs = []
    for page in range(last_page):
        result = requests.get(f"{URL}&start={page*LIMIT}")
        soup = BeautifulSoup(result.text, "html.parser")
        results = soup.find_all("div", {"class": "jobsearch-SerpJobCard"})
        for result in results:
            jobs.append(extract_jobs(result))  # one dict per job card
    return jobs
(main.py)
# Driver: find how many result pages exist, scrape them all, print everything.
from indeed import extract_indeed_pages, extract_indeed_jobs

page_count = extract_indeed_pages()
all_jobs = extract_indeed_jobs(page_count)
print(all_jobs)
일단 indeed 출력해보는 것까지 끝. 혼자 해보기도 도전하기.