Project Goal
When a user searches for a programming language such as python, go, or java, scrape that language's job listings from Indeed and Stack Overflow and display them on the web.
Process Summary
1. Build an input and a button in HTML so the user can run a search.
2. Read the parameter the client sends to the server (react/go/python) and run the scraper with it. Just in case, lowercase any uppercase input, and if no value comes in at all, send the user back to Home ("/"). A self-contained sketch of this request flow follows this list.
3. Display the job list the scraper returns on the web. Before scraping, check whether the keyword is already in the dictionary: if it is, serve the saved list; if not, run the scraper and cache the result.
4. Add an HTML anchor tag that points to "/export?word=keyword" so the results can be downloaded as CSV. If word is missing, raise an error and redirect to "/"; otherwise save the results as jobs.csv and let the user download the file locally.
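As a quick illustration of step 2, here is a self-contained sketch of the parameter handling, using Flask's built-in test client so it runs without a browser. The names match the real code below, but this snippet is my own illustration, not part of the project code:

# sketch: how /report reads the ?word= query parameter
from flask import Flask, request, redirect

app = Flask(__name__)

@app.route("/report")
def report():
    word = request.args.get("word")     # GET /report?word=Python -> "Python"
    if not word:
        return redirect("/")            # no keyword: back to Home
    return f"searching for {word.lower()}"

with app.test_client() as client:
    print(client.get("/report?word=Python").data)   # b'searching for python'
    print(client.get("/report").status_code)        # 302 (redirect to "/")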
Code
main.py

from flask import Flask, render_template, request, redirect, send_file
from scrapper import get_jobs
from exporter import save_to_file

app = Flask("flasktutorial")

# In-memory cache: search keyword -> scraped job list
db = {}

@app.route("/")
def home():
    return render_template("potato.html")

@app.route("/report")
def report():
    word = request.args.get("word")
    if word:
        word = word.lower()  # normalize case so "Python" and "python" share a cache entry
        from_db = db.get(word)
        if not from_db:
            jobs = get_jobs(word)
            db[word] = jobs
    else:
        return redirect("/")
    return render_template(
        "report.html", searchinBy=word, number=len(db[word]), jobs=db[word])

@app.route("/export")
def export():
    try:
        word = request.args.get("word")
        if not word:
            raise Exception()
        word = word.lower()
        jobs = db.get(word)
        if not jobs:
            raise Exception()
        save_to_file(jobs)
        return send_file("jobs.csv")
    except Exception:
        return redirect("/")

# host="0.0.0.0" is required on repl.it; a plain local run does not need it.
app.run(host="0.0.0.0")
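The host="0.0.0.0" binding is what repl.it needs; when running on a local machine instead, a conventional entry point (my suggested variant, not from the original notes) would be:

# local alternative to the last line of main.py
if __name__ == "__main__":
    app.run(debug=True)  # serves on http://127.0.0.1:5000 with auto-reload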
scrapper.py

import requests
from bs4 import BeautifulSoup

def get_last_page(url):
    # Read the pagination bar and return the number of the last result page.
    result = requests.get(url)
    soup = BeautifulSoup(result.text, "html.parser")
    pagination = soup.find("div", {"class": "s-pagination"})
    links = pagination.find_all("a")
    # The final link is "next", so the second-to-last holds the last page number.
    last_page = links[-2].get_text(strip=True)
    return int(last_page)

def extract_job(html):
    # Pull title, company, location, and link out of a single job card.
    title = html.find("h2", {"class": "mb4"}).find("a")["title"]
    company, location = html.find("h3", {"class": "mb4"}).find_all("span", recursive=False)
    company = company.get_text(strip=True)
    location = location.get_text(strip=True)
    job_id = html["data-jobid"]
    link = f"https://stackoverflow.com/jobs/{job_id}"
    # link = "https://stackoverflow.com/" + html.find("h2", {"class": "mb4"}).find("a")["href"]
    return {"title": title, "company": company, "location": location, "link": link}

def extract_jobs(last_page, url):
    jobs = []
    for page in range(last_page):
        print(f"Scraping SO page {page + 1}")
        res = requests.get(f"{url}&pg={page + 1}")
        html = BeautifulSoup(res.text, "html.parser")
        job_cards = html.find_all("div", {"class": "-job"})
        for job_card in job_cards:
            job = extract_job(job_card)
            jobs.append(job)
    return jobs

def get_jobs(word):
    url = f"https://stackoverflow.com/jobs?q={word}"
    last_page = get_last_page(url)
    jobs = extract_jobs(last_page, url)
    return jobs
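Before wiring the scraper into Flask, it can be smoke-tested on its own. A minimal check (my addition; it depends on the Stack Overflow jobs page still serving the markup the selectors above expect):

# smoke test: call scrapper.py's get_jobs directly
from scrapper import get_jobs

jobs = get_jobs("python")
print(f"found {len(jobs)} jobs")
for job in jobs[:3]:               # peek at the first few results
    print(job["title"], "|", job["company"], "|", job["link"])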
templates/potato.html

<!DOCTYPE html>
<html lang="en">
  <head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Job search</title>
  </head>
  <body>
    <h1>Job search</h1>
    <form action="/report" method="get">
      <input type="text" placeholder="Search for a job" required name="word">
      <button>Search</button>
    </form>
  </body>
</html>
templates/report.html

<!DOCTYPE html>
<html lang="en">
  <head>
    <style>
      section {
        display: grid;
        grid-template-columns: repeat(4, 1fr);
        gap: 20px;
      }
    </style>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Job search</title>
  </head>
  <body>
    <h1>Search Results</h1>
    <h2>Found {{number}} results for: {{searchinBy}}</h2>
    <a href="/export?word={{searchinBy}}">Export to CSV</a>
    <section>
      <h4>title</h4>
      <h4>company</h4>
      <h4>location</h4>
      <h4>link</h4>
      {% for job in jobs %}
        <span>{{job.title}}</span>
        <span>{{job.company}}</span>
        <span>{{job.location}}</span>
        <a href="{{job.link}}" target="_blank">Apply</a>
      {% endfor %}
    </section>
  </body>
</html>
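One caveat I would add: searchinBy is interpolated straight into the export URL. For single-word keywords like python this is fine, but a multi-word search would break the query string, so Jinja's built-in urlencode filter ({{ searchinBy | urlencode }}) would be the safer choice there.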
exporter.py

import csv

def save_to_file(jobs):
    # newline="" keeps the csv module from inserting blank rows on Windows,
    # and the with-block closes the file before main.py's send_file reads it.
    with open("jobs.csv", mode="w", newline="") as file:
        writer = csv.writer(file)
        writer.writerow(["title", "company", "location", "link"])
        for job in jobs:
            writer.writerow(list(job.values()))
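To verify the CSV layout without scraping anything, save_to_file can be fed fake data (illustrative values of my own, matching the dictionary shape extract_job returns):

# exercise exporter.py with made-up jobs
from exporter import save_to_file

save_to_file([
    {"title": "Backend Dev", "company": "Acme", "location": "Seoul",
     "link": "https://stackoverflow.com/jobs/1"},
    {"title": "Data Engineer", "company": "Globex", "location": "Remote",
     "link": "https://stackoverflow.com/jobs/2"},
])
print(open("jobs.csv").read())     # header row plus the two jobs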