Day 053

AWESOMee·2022년 3월 28일
0

Udemy Python Bootcamp

목록 보기
53/64
post-thumbnail

Udemy Python Bootcamp Day 052

Web Scraping Capstone - Data Entry Job Automation

What I wrote

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
import time

# Selenium Service wrapping the path to the local chromedriver binary.
SERVICE = Service("/Users/****/Development/chromedriver")
# URL of the Google Form that each scraped listing is entered into.
GOOGLE_FORM = "https://docs.google.com/forms/d/e/1FAIpQLSdlfzrV7SHNXONkQjE7Mj6cvJhq6MWFotKCqq8UlCP9iPtu3g/viewform?usp=sf_link"
# Zillow rental search URL; the percent-encoded searchQueryState parameter
# carries the map bounds and the filter settings of the search.
ZILLOW_URL = "https://www.zillow.com/homes/for_rent/1-_beds/?searchQueryState=%7B%22pagination%22%3A%7B%7D%2C%22mapBounds%22%3A%7B%22west%22%3A-122.92290724707031%2C%22east%22%3A-121.94375075292969%2C%22south%22%3A37.4478420315079%2C%22north%22%3A38.10129696297871%7D%2C%22isMapVisible%22%3Afalse%2C%22filterState%22%3A%7B%22price%22%3A%7B%22max%22%3A872627%7D%2C%22beds%22%3A%7B%22min%22%3A1%7D%2C%22fore%22%3A%7B%22value%22%3Afalse%7D%2C%22mp%22%3A%7B%22max%22%3A3000%7D%2C%22nc%22%3A%7B%22value%22%3Afalse%7D%2C%22fr%22%3A%7B%22value%22%3Atrue%7D%2C%22cmsn%22%3A%7B%22value%22%3Afalse%7D%2C%22fsba%22%3A%7B%22value%22%3Afalse%7D%7D%2C%22isListVisible%22%3Atrue%7D"

# HTTP headers sent with the Zillow request: a desktop-Chrome User-Agent
# string and an Accept-Language preference list.
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.83 Safari/537.36",
    "Accept-Language": "ko-KR,ko;q=0.9,en-US;q=0.8,en;q=0.7"
}

# Download the Zillow search results page and parse the returned HTML.
response = requests.get(ZILLOW_URL, headers=headers)
soup = BeautifulSoup(response.content, "html.parser")

# Listing URLs, one per matching anchor. An href that does not start with
# "http" is treated as a site-relative path and prefixed with the Zillow
# origin so the value stored in the form is a usable absolute link.
# A missing href is kept as None so the list stays aligned with the cards.
links = []
for anchor in soup.find_all(name="a", class_="list-card-link list-card-link-top-margin"):
    href = anchor.get("href")
    if href and not href.startswith("http"):
        href = f"https://www.zillow.com{href}"
    links.append(href)

# Price text of each card, cut at the first "/" (presumably dropping a
# trailing "/mo" suffix — verify against the live page).
prices = [price.getText().split("/")[0] for price in soup.find_all(name="div", class_="list-card-price")]
# Full text of each card's <address> tag.
addresses = [address.getText() for address in soup.find_all(name="address", class_="list-card-addr")]


class GoogleForm:
    """Enter the scraped listings into the Google Form through Chrome.

    The class reads the module-level ``GOOGLE_FORM`` URL and the
    ``addresses``, ``prices`` and ``links`` lists.
    """

    # Absolute XPaths of the three text inputs, the submit button and the
    # "submit another response" link on the confirmation page.
    ADDRESS_XPATH = "/html/body/div/div[2]/form/div[2]/div/div[2]/div[1]/div/div/div[2]/div/div[1]/div/div[1]/input"
    PRICE_XPATH = "/html/body/div/div[2]/form/div[2]/div/div[2]/div[2]/div/div/div[2]/div/div[1]/div/div[1]/input"
    LINK_XPATH = "/html/body/div/div[2]/form/div[2]/div/div[2]/div[3]/div/div/div[2]/div/div[1]/div/div[1]/input"
    SUBMIT_XPATH = "/html/body/div/div[2]/form/div[2]/div/div[3]/div[1]/div[1]/div"
    ANOTHER_RESPONSE_XPATH = "/html/body/div[1]/div[2]/div[1]/div/div[4]/a"

    def __init__(self, service):
        """Start a Chrome session via *service* and open the form page.

        Args:
            service: Selenium ``Service`` pointing at the chromedriver binary.
        """
        self.driver = webdriver.Chrome(service=service)
        self.driver.get(GOOGLE_FORM)

    def fill_the_form(self):
        """Submit one form response per scraped listing.

        The three lists are walked in lockstep with ``zip``, so if they have
        different lengths only the common prefix is submitted instead of
        failing with ``IndexError`` part-way through.
        """
        for address, price, link in zip(addresses, prices, links):
            # Give the (re)loaded form a moment to render before typing.
            time.sleep(1)

            # Locate the inputs on every pass. Submitting and then clicking
            # "submit another response" loads a new page, so element
            # references from a previous pass are stale and cannot be reused.
            address_input = self.driver.find_element(By.XPATH, self.ADDRESS_XPATH)
            price_input = self.driver.find_element(By.XPATH, self.PRICE_XPATH)
            link_input = self.driver.find_element(By.XPATH, self.LINK_XPATH)

            address_input.send_keys(address)
            price_input.send_keys(price)
            link_input.send_keys(link)

            submit_button = self.driver.find_element(By.XPATH, self.SUBMIT_XPATH)
            submit_button.click()

            # Wait for the confirmation page, then go back to a blank form.
            time.sleep(1)
            another_form = self.driver.find_element(By.XPATH, self.ANOTHER_RESPONSE_XPATH)
            another_form.click()


# Open the Google Form in Chrome, then enter and submit the scraped listings.
form = GoogleForm(SERVICE)
form.fill_the_form()

첫 번째 form을 제출하는 데는 성공했는데
두 번째 form 제출은 안 되고,,,
prices는 run할 때마다 값을 제대로 물고 올 때도 있고 못 물고 올 때도 있고,,,

Solution

from bs4 import BeautifulSoup
import requests
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
import time

# HTTP headers sent with the Zillow request: a desktop-Chrome User-Agent
# string and an Accept-Language preference list.
header = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.83 Safari/537.36",
    "Accept-Language": "ko-KR,ko;q=0.9,en-US;q=0.8,en;q=0.7"
}

# Fetch the Zillow rental search page; the percent-encoded searchQueryState
# parameter carries the map bounds and the filter settings of the search.
response = requests.get(
    "https://www.zillow.com/homes/for_rent/1-_beds/?searchQueryState=%7B%22pagination%22%3A%7B%7D%2C%22mapBounds%22%3A%7B%22west%22%3A-122.92290724707031%2C%22east%22%3A-121.94375075292969%2C%22south%22%3A37.4478420315079%2C%22north%22%3A38.10129696297871%7D%2C%22isMapVisible%22%3Afalse%2C%22filterState%22%3A%7B%22price%22%3A%7B%22max%22%3A872627%7D%2C%22beds%22%3A%7B%22min%22%3A1%7D%2C%22fore%22%3A%7B%22value%22%3Afalse%7D%2C%22mp%22%3A%7B%22max%22%3A3000%7D%2C%22nc%22%3A%7B%22value%22%3Afalse%7D%2C%22fr%22%3A%7B%22value%22%3Atrue%7D%2C%22cmsn%22%3A%7B%22value%22%3Afalse%7D%2C%22fsba%22%3A%7B%22value%22%3Afalse%7D%7D%2C%22isListVisible%22%3Atrue%7D",
    headers=header)

# Parse the decoded response body with Python's built-in HTML parser.
data = response.text
soup = BeautifulSoup(data, "html.parser")

# Anchor tags found under each card's ".list-card-top" element.
all_link_elements = soup.select(".list-card-top a")

all_links = []
for anchor in all_link_elements:
    href = anchor["href"]
    print(href)
    # An href without "http" in it gets the Zillow origin prepended.
    absolute = href if "http" in href else f"https://www.zillow.com{href}"
    all_links.append(absolute)

# <address> tags found under each card's ".list-card-info" element.
all_address_elements = soup.select(".list-card-info address")
all_addresses = []
for tag in all_address_elements:
    # Keep only the text after the last " | " separator (the whole text
    # when there is no separator).
    all_addresses.append(tag.get_text().rpartition(" | ")[-1])

# One ".list-card-heading" element per card; each yields exactly one entry
# in all_prices so the list stays aligned with the cards.
all_price_elements = soup.select(".list-card-heading")
all_prices = []
for element in all_price_elements:
    # Get the prices. Single and multiple listings have different tag & class structures
    try:
        # Price with only one listing
        price = element.select(".list-card-price")[0].contents[0]
    except IndexError:
        print('Multiple listings for the card')
        try:
            # Price with multiple listings
            price = element.select(".list-card-details li")[0].contents[0]
        except IndexError:
            # Neither structure matched: record a placeholder instead of
            # crashing or re-using the previous card's price.
            print("No price found for the card")
            price = "N/A"
    # Append after the try block rather than in `finally`, so a value is
    # only stored once it has actually been assigned for this card.
    all_prices.append(price)


# Create Spreadsheet using Google Form
# Substitute your own path here 👇
SERVICE = Service("/Users/****/Development/chromedriver")
driver = webdriver.Chrome(service=SERVICE)

try:
    # Walk the three lists in lockstep. zip() stops at the shortest list, so
    # a length mismatch between them no longer raises IndexError part-way
    # through the run; only complete (address, price, link) rows are sent.
    for listing_address, listing_price, listing_link in zip(all_addresses, all_prices, all_links):
        # Substitute your own Google Form URL here 👇
        driver.get("https://docs.google.com/forms/d/e/1FAIpQLSdlfzrV7SHNXONkQjE7Mj6cvJhq6MWFotKCqq8UlCP9iPtu3g/viewform?usp=sf_link")

        # Give the form time to render before looking up its widgets.
        time.sleep(2)
        address = driver.find_element(By.XPATH,
            '//*[@id="mG61Hd"]/div[2]/div/div[2]/div[1]/div/div/div[2]/div/div[1]/div/div[1]/input')
        price = driver.find_element(By.XPATH,
            '//*[@id="mG61Hd"]/div[2]/div/div[2]/div[2]/div/div/div[2]/div/div[1]/div/div[1]/input')
        link = driver.find_element(By.XPATH,
            '//*[@id="mG61Hd"]/div[2]/div/div[2]/div[3]/div/div/div[2]/div/div[1]/div/div[1]/input')
        submit_button = driver.find_element(By.XPATH, '//*[@id="mG61Hd"]/div[2]/div/div[3]/div[1]/div/div')

        address.send_keys(listing_address)
        price.send_keys(listing_price)
        link.send_keys(listing_link)
        submit_button.click()

        # Let the submission go through before the next page load (or the
        # browser shutdown after the last listing) interrupts it.
        time.sleep(1)
finally:
    # Always end the browser session, even if a lookup or submit fails.
    driver.quit()

solution은 이런데, 이건 아예 작동을 안 하는데,,, 이유를 모르겠네..?

profile
개발을 배우는 듯 하면서도

0개의 댓글