개발 세션 - 크롤러 개발

goldenGlow_21·2024년 12월 20일

다크웹 정보 유출 탐지 시스템 개발

목록 보기

7/15

scrapy

crawler/
├── darkweb_crawler/
│   ├── __init__.py
│   ├── items.py
│   ├── middlewares.py
│   ├── pipelines.py
│   ├── settings.py
│   └── spiders/
│       ├── __init__.py
│       ├── island_spider.py     # 첫 번째 크롤러
│       ├── second_site_spider.py # 두 번째 크롤러
│       └── third_site_spider.py  # 세 번째 크롤러
└── scrapy.cfg

BeautifulSoup4

Island

no-Tor

import json
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from bs4 import BeautifulSoup
import time

# Crawl targets: the site root and its combolists category listing.
base_url = "https://crackingisland.net"
category_url = base_url + "/categories/combolists"

# Accumulates one dict per scraped post; filled by crawl_combolists().
all_data = []

# chromedriver.exe is looked up relative to the current directory ("./").
chromedriver_path = "./chromedriver.exe"

# Start a Chrome session driven by that chromedriver binary.
service = Service(executable_path=chromedriver_path)
driver = webdriver.Chrome(service=service)

def crawl_combolists():
    """Scrape the combolists category page and dump the posts to test.json.

    Loads ``category_url`` in the module-level Selenium ``driver``, waits a
    fixed 5 seconds, then parses the rendered page source with BeautifulSoup.
    Each ``<a itemprop="url">`` element yields one record (title, url, type,
    dateCreated, description) that is appended to the module-level
    ``all_data`` list. A post that fails to parse is reported on stdout and
    skipped. Finally the whole of ``all_data`` is written to ``test.json``.
    """
    driver.get(category_url)
    # Fixed pause so client-side JavaScript can render; tune to the page's
    # actual load time.
    time.sleep(5)

    page = BeautifulSoup(driver.page_source, "html.parser")

    for anchor in page.find_all("a", itemprop="url"):
        try:
            # Fields are evaluated top to bottom, so the first missing
            # element is the one that raises.
            record = {
                "title": anchor.find("h2", itemprop="headline").text.strip(),
                "url": base_url + anchor["href"],
                "type": anchor.find("span", itemprop="about").text.strip(),
                "dateCreated": anchor.find("span", itemprop="dateCreated").text.strip(),
                "description": anchor.find("p", itemprop="text").text.strip(),
            }
            all_data.append(record)
            print(f"추출 완료: {record['title']}")
        except Exception as exc:
            # Best effort: one malformed post must not abort the whole crawl.
            print(f"크롤링 중 오류 발생: {exc}")

    # Persist everything collected so far as pretty-printed UTF-8 JSON.
    with open("test.json", "w", encoding="utf-8") as out_file:
        json.dump(all_data, out_file, ensure_ascii=False, indent=4)
    print("test.json 파일 저장 완료.")

# Script entry point: run the crawl once when executed directly.
if __name__ == "__main__":
    try:
        crawl_combolists()
    finally:
        # Quit the WebDriver whether or not the crawl raised.
        driver.quit()

Tor

import json
import os
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import time

# Resolve chromedriver.exe relative to this script's directory rather than
# the current working directory.
current_dir = os.path.dirname(os.path.abspath(__file__))
chromedriver_path = os.path.join(current_dir, "chromedriver.exe")

# Tor SOCKS5 proxy address.
proxy_address = "127.0.0.1:9050"

# Chrome options: headless, GPU disabled, Tor proxy in use.
chrome_options = Options()
chrome_options.add_argument("--headless")
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument(f"--proxy-server=socks5://{proxy_address}")

# Start the WebDriver session with those options.
service = Service(chromedriver_path)
driver = webdriver.Chrome(service=service, options=chrome_options)

# Crawl target (replace with the onion site URL).
# Keep base_url free of a trailing slash: category_url appends a path that
# starts with "/", and crawl_combolists() concatenates base_url with each
# post's href (presumably site-relative, i.e. also starting with "/"), so a
# trailing slash here would put "//" into every generated URL.
base_url = "https://crackingisland.net"
category_url = f"{base_url}/categories/combolists"

# Accumulates one dict per scraped post; filled by crawl_combolists().
all_data = []

def crawl_combolists():
    """Scrape the combolists category page and dump the posts to test.json.

    Opens ``category_url`` with the module-level Selenium ``driver``, sleeps
    5 seconds, and parses ``driver.page_source`` with BeautifulSoup. Every
    ``<a itemprop="url">`` element becomes one dict appended to the
    module-level ``all_data`` list; posts that raise while being parsed are
    reported on stdout and skipped. ``all_data`` is then written to
    ``test.json`` in the current working directory.
    """

    def field_text(node, tag, prop):
        # Stripped text of the first <tag itemprop=prop> under node; raises
        # AttributeError when the element is missing (find() returns None).
        return node.find(tag, itemprop=prop).text.strip()

    driver.get(category_url)
    # Fixed pause for JavaScript rendering; adjust to the page's load time.
    time.sleep(5)

    document = BeautifulSoup(driver.page_source, "html.parser")
    post_links = document.find_all("a", itemprop="url")

    for link in post_links:
        try:
            title = field_text(link, "h2", "headline")
            post_url = base_url + link["href"]
            post_type = field_text(link, "span", "about")
            post_date = field_text(link, "span", "dateCreated")
            description = field_text(link, "p", "text")

            all_data.append(
                {
                    "title": title,
                    "url": post_url,
                    "type": post_type,
                    "dateCreated": post_date,
                    "description": description,
                }
            )
            print(f"추출 완료: {title}")
        except Exception as err:
            # Best effort: report the broken post and carry on with the rest.
            print(f"크롤링 중 오류 발생: {err}")

    # Save everything collected as pretty-printed UTF-8 JSON.
    with open("test.json", "w", encoding="utf-8") as handle:
        json.dump(all_data, handle, ensure_ascii=False, indent=4)
    print("test.json 파일 저장 완료.")

# Script entry point: run the crawl once when executed directly.
if __name__ == "__main__":
    try:
        crawl_combolists()
    finally:
        # Quit the WebDriver whether or not the crawl raised.
        driver.quit()

dark leak market

tor

import json
import os
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import time

# Crawl target: the onion site root and its index page.
base_url = "http://darkleakyqmv62eweqwy4dnhaijg4m4dkburo73pzuqfdumcntqdokyd.onion"
category_url = base_url + "/index.html"

# Accumulates one dict per scraped row; filled by crawl_files().
all_data = []

# chromedriver.exe is expected in the same directory as this script.
current_dir = os.path.dirname(os.path.abspath(__file__))
chromedriver_path = os.path.join(current_dir, "chromedriver.exe")

# Tor SOCKS5 proxy address.
proxy_address = "127.0.0.1:9050"

# Chrome options: headless, GPU disabled, Tor proxy in use.
chrome_options = Options()
chrome_options.add_argument("--headless")
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--proxy-server=socks5://" + proxy_address)

# Start the WebDriver session with those options.
service = Service(executable_path=chromedriver_path)
driver = webdriver.Chrome(options=chrome_options, service=service)

def crawl_files():
    """Scrape file names from the index page and dump them to darkleak.json.

    Loads ``category_url`` in the module-level Selenium ``driver``, sleeps
    5 seconds, and parses the page source with BeautifulSoup. For every
    ``<tr>`` that carries an ``onclick`` attribute, the stripped text of its
    first ``<strong>`` element is stored as ``{"file_name": ...}`` in the
    module-level ``all_data`` list. Rows that raise are reported on stdout
    and skipped. ``all_data`` is then written to ``darkleak.json``.
    """
    driver.get(category_url)
    time.sleep(5)  # fixed pause while the page loads

    page = BeautifulSoup(driver.page_source, "html.parser")

    # onclick=True matches any <tr> that has an onclick attribute at all.
    for table_row in page.find_all("tr", onclick=True):
        try:
            name = table_row.find("strong").text.strip()
            all_data.append({"file_name": name})
            print(f"추출 완료: {name}")
        except Exception as exc:
            # Best effort: a row without <strong> must not stop the crawl.
            print(f"크롤링 중 오류 발생: {exc}")

    # Save everything collected as pretty-printed UTF-8 JSON.
    with open("darkleak.json", "w", encoding="utf-8") as out_file:
        json.dump(all_data, out_file, ensure_ascii=False, indent=4)
    print("darkleak.json 파일 저장 완료.")

# Script entry point: run the crawl once when executed directly.
if __name__ == "__main__":
    try:
        crawl_files()
    finally:
        # Quit the WebDriver whether or not the crawl raised.
        driver.quit()

abyss

tor

import json
import os
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import time

# Crawl target: the onion site's root page.
base_url = "http://3ev4metjirohtdpshsqlkrqcmxq6zu3d7obrdhglpy5jpbr7whmlfgqd.onion"

# Accumulates one dict per scraped card; filled by crawl_posts().
all_data = []

# Tor SOCKS5 proxy address.
proxy_address = "127.0.0.1:9050"

# chromedriver.exe is expected in the same directory as this script.
current_dir = os.path.dirname(os.path.abspath(__file__))
chromedriver_path = os.path.join(current_dir, "chromedriver.exe")

# Chrome options: headless, GPU disabled, Tor proxy in use.
chrome_options = Options()
chrome_options.add_argument("--headless")
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--proxy-server=socks5://{}".format(proxy_address))

# Start the WebDriver session with those options.
service = Service(executable_path=chromedriver_path)
driver = webdriver.Chrome(options=chrome_options, service=service)

def crawl_posts():
    """Scrape the card entries on the landing page and dump them to abyss.json.

    Loads ``base_url`` in the module-level Selenium ``driver``, sleeps
    5 seconds, and parses the page source with BeautifulSoup. Each
    ``<div class="card-body">`` contributes a ``{"title", "description"}``
    dict, taken from its ``h5.card-title`` and ``p.card-text`` children, to
    the module-level ``all_data`` list. Cards that raise are reported on
    stdout and skipped. ``all_data`` is then written to ``abyss.json``.
    """
    driver.get(base_url)
    time.sleep(5)  # fixed pause while the page loads

    page = BeautifulSoup(driver.page_source, "html.parser")

    for card_body in page.find_all("div", class_="card-body"):
        try:
            heading = card_body.find("h5", class_="card-title")
            summary = card_body.find("p", class_="card-text")
            # Title is read first, so a missing heading is what gets
            # reported when both elements are absent.
            entry = {
                "title": heading.text.strip(),
                "description": summary.text.strip(),
            }
            all_data.append(entry)
            print(f"추출 완료: {entry['title']}")
        except Exception as exc:
            # Best effort: an incomplete card must not stop the crawl.
            print(f"크롤링 중 오류 발생: {exc}")

    # Save everything collected as pretty-printed UTF-8 JSON.
    with open("abyss.json", "w", encoding="utf-8") as out_file:
        json.dump(all_data, out_file, ensure_ascii=False, indent=4)
    print("abyss.json 파일 저장 완료.")

# Script entry point: run the crawl once when executed directly.
if __name__ == "__main__":
    try:
        crawl_posts()
    finally:
        # Quit the WebDriver whether or not the crawl raised.
        driver.quit()