Data Analysis Day 81

Crawling

The script below uses Selenium to crawl eBay listing pages in parallel and extract each item's available and sold quantities.
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
import re
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
def extract_availability_and_sold(url):
    options = Options()
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    # options.add_argument("--headless")  # enable to save resources when running many browsers
    driver = webdriver.Chrome(options=options)
    available, sold = None, None
    try:
        driver.get(url)
        time.sleep(1.5)  # adjust to 1-2 seconds depending on network and machine
        try:
            availability_box = driver.find_element(By.ID, "qtyAvailability")
            spans = availability_box.find_elements(By.TAG_NAME, "span")
            for span in spans:
                text = span.text.strip().lower()
                if "available" in text:
                    match = re.search(r"(\d+)", text)
                    if match:
                        available = int(match.group(1))
                if "sold" in text:
                    match = re.search(r"(\d+)", text)
                    if match:
                        sold = int(match.group(1))
        except Exception:
            # some listings have no qtyAvailability box; leave both values as None
            pass
    except Exception as e:
        print(f"❌ Error for {url}: {e}")
    finally:
        driver.quit()  # always release the browser, even on error
    return available, sold
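Before launching the parallel run, it can help to sanity-check the scraper on a single listing; the URL below is a placeholder, not a real item. If the fixed time.sleep proves flaky, Selenium's WebDriverWait with expected_conditions is a more robust way to wait for the qtyAvailability element.

test_url = "https://www.ebay.com/itm/1234567890"  # hypothetical listing URL
avail, sold = extract_availability_and_sold(test_url)
print(f"available={avail}, sold={sold}")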
# =========================
# Main execution
# =========================
# df = pd.read_csv("your_ebay_file.csv")  # load the eBay data first if df is not already in memory
sample_urls = df['itemWebUrl'].dropna().iloc[1000:2000]
sample_idx = sample_urls.index
results = [None] * len(sample_urls)
max_workers = 10  # number of browsers to run concurrently (4-8 is reasonable for most PCs)
print(f"🌐 Starting parallel crawl of {len(sample_urls)} URLs! ({max_workers} at a time)")
with ThreadPoolExecutor(max_workers=max_workers) as executor:
    future_to_idx = {executor.submit(extract_availability_and_sold, url): i
                     for i, url in enumerate(sample_urls)}
    for future in as_completed(future_to_idx):
        i = future_to_idx[future]
        try:
            results[i] = future.result()
        except Exception as e:
            print(f"Error at index {i}: {e}")
            results[i] = (None, None)
        print(f"({i+1}/{len(sample_urls)}) done")
available_list, sold_list = zip(*results)
df.loc[sample_idx, 'available_quantity'] = available_list
df.loc[sample_idx, 'sold_quantity'] = sold_list
df.to_csv("ebay_with_avail_sold2.csv", index=False)  # change the filename per batch
print("✅ Parallel crawl up to row 2000 complete!")  # change per batch
result = df1.combine_first(df)  # df1: results from an earlier batch, defined elsewhere
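The last line merges this batch with an earlier one: combine_first keeps the caller's non-NaN values and fills its gaps from the argument. A minimal sketch, assuming the first batch was saved as ebay_with_avail_sold.csv (both filenames here are assumptions):

df1 = pd.read_csv("ebay_with_avail_sold.csv")  # earlier batch (assumed filename)
result = df1.combine_first(df)                 # df1 wins wherever both have values
result.to_csv("ebay_with_avail_sold_merged.csv", index=False)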