TED 강연 제목/대본 크롤링

Junha Kim·2021년 1월 3일
0

Crawling

목록 보기
4/4

- TED 강연 제목/대본 크롤링 (2020.07.19 ~ 2020.07.21)

class_define.py

class Topic:
    """A named topic together with the speeches collected under it."""

    def __init__(self, name) -> None:
        # A new topic starts with no speeches; they are attached via add().
        self.name, self.speech = name, list()

    def __str__(self):
        label = self.name
        return label

    def add(self, speech):
        """Append one speech to this topic's list."""
        collected = self.speech
        collected.append(speech)

class Speech:
    """A single talk: its title, its speaker and (once fetched) its transcript."""

    def __init__(self, title, speaker) -> None:
        self.title = title
        self.speaker = speaker
        # Filled in later by the crawler; None means "not fetched yet".
        self.script = None

    # Defined at class level: the original nested this inside __init__, where it
    # was only a local function and str(speech) fell back to the object repr.
    def __str__(self):
        return self.title

selenium_class.py

from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

class Driver:
    """Convenience wrapper around a headless Chrome WebDriver.

    Bundles driver start-up, frame switching with retry, and explicit-wait
    element lookups. Calling the instance returns the wrapped WebDriver, so
    anything not exposed here (``title``, ``quit``, ...) is reached through
    ``driver()``.
    """

    def __init__(self):
        """Start a headless Chrome session that sends a desktop user-agent."""
        options = webdriver.ChromeOptions()
        options.add_argument('headless')
        options.add_argument(
            "user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36")

        # NOTE(review): the driver path is passed positionally; confirm this
        # calling convention matches the installed Selenium version.
        self.driver = webdriver.Chrome(ChromeDriverManager().install(), options=options)

    def __call__(self):
        """Return the wrapped WebDriver instance."""
        return self.driver

    def get_url(self, url):
        """Navigate the browser to ``url``."""
        self.driver.get(url)

    def get_default(self):
        """Switch back to the top-level document, retrying until it succeeds."""
        while True:
            try:
                # ``switch_to.default_content()`` replaces the removed
                # ``switch_to_default_content()`` helper.
                self.driver.switch_to.default_content()
                return
            except Exception:
                # ``Exception`` (not a bare except) keeps Ctrl-C able to stop
                # the retry loop.
                print('default frame 이동')

    def get_fra(self, name):
        """Switch into frame ``name``; on failure reset to the default content and retry."""
        while True:
            try:
                self.driver.switch_to.frame(name)
                break
            except Exception:
                self.get_default()
                print(name, 'frame 이동')

    def get_top(self):
        """Switch into the 'top' frame; on failure enter the 'body' frame first and retry."""
        while True:
            try:
                self.driver.switch_to.frame('top')
                break
            except Exception:
                self.get_fra('body')
                print('body', 'frame 이동')

    def _wait_one(self, root, by, value, timeout):
        """Wait up to ``timeout`` seconds for one matching element under ``root``."""
        return WebDriverWait(root, timeout).until(
            EC.presence_of_element_located((by, value)))

    def _wait_all(self, root, by, value, timeout):
        """Wait up to ``timeout`` seconds for all matching elements under ``root``."""
        return WebDriverWait(root, timeout).until(
            EC.presence_of_all_elements_located((by, value)))

    def find_by_xpath(self, xpath):
        """Return the first element matching ``xpath`` (waits up to 10 s)."""
        return self._wait_one(self.driver, By.XPATH, xpath, 10)

    def find_by_class(self, class_name):
        """Return the first element with class ``class_name`` (waits up to 10 s)."""
        return self._wait_one(self.driver, By.CLASS_NAME, class_name, 10)

    def find_by_tag(self, tag):
        """Return the first ``tag`` element (waits up to 10 s)."""
        return self._wait_one(self.driver, By.TAG_NAME, tag, 10)

    def find_by_name(self, name):
        """Return the first element whose name attribute is ``name`` (waits up to 10 s)."""
        return self._wait_one(self.driver, By.NAME, name, 10)

    def find_all_by_class(self, class_name):
        """Return every element with class ``class_name`` (waits up to 10 s)."""
        # Fixed: the original looked this up with By.TAG_NAME, so it searched
        # for a tag named after the class and never matched.
        return self._wait_all(self.driver, By.CLASS_NAME, class_name, 10)

    def find_all_by_tag(self, tag):
        """Return every ``tag`` element (waits up to 10 s)."""
        return self._wait_all(self.driver, By.TAG_NAME, tag, 10)

    def find_all_by_name(self, name):
        """Return every element whose name attribute is ``name`` (waits up to 10 s)."""
        return self._wait_all(self.driver, By.NAME, name, 10)

    def find_all_by_tag_with_obj(self, obj, name):
        """Return every ``name`` tag located through ``obj`` (waits up to 20 s)."""
        return self._wait_all(obj, By.TAG_NAME, name, 20)

    def find_by_tag_with_obj(self, obj, name):
        """Return the first ``name`` tag located through ``obj`` (waits up to 20 s)."""
        return self._wait_one(obj, By.TAG_NAME, name, 20)

    def find_by_link(self, text):
        """Return the first link whose visible text is exactly ``text`` (waits up to 10 s)."""
        return self._wait_one(self.driver, By.LINK_TEXT, text, 10)

    def click(self, btn):
        """Click ``btn`` by running ``click()`` on it through JavaScript."""
        self.driver.execute_script("arguments[0].click();", btn)

Ted.py

def crawl_ted(topics):
    """Collect title, speaker and transcript for every TED talk under each topic.

    NOTE(review): the published snippet lost its ``def`` line. The name and the
    single ``topics`` parameter are reconstructed from the body (it iterates
    ``topics`` and ends with ``return result``) -- rename to match the caller.

    Args:
        topics: Iterable of topic names, sent as the ``topics[]`` query
            parameter of the TED talk listing.

    Returns:
        A list with one ``Topic`` per input topic, each holding the ``Speech``
        objects found for it. Double quotes in titles and transcripts are
        doubled (presumably for CSV output -- confirm against the writer).
    """
    import time

    from selenium.common.exceptions import TimeoutException
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.support.ui import WebDriverWait

    listing_url = "https://www.ted.com/talks"
    # Raw string: "\:" is an invalid escape sequence in a normal literal.
    transcript_css = r"div.Grid__cell.flx-s\:1"
    max_attempts = 5
    retry_pause = 1.0  # seconds between retries, so a throttled server is not hammered

    def fetch(url, params=None):
        """GET ``url``, retrying a bounded number of times on HTTP 429/500."""
        response = requests.get(url, params=params)
        for _ in range(max_attempts - 1):
            if response.status_code not in (429, 500):
                break
            time.sleep(retry_pause)
            response = requests.get(url, params=params)
        return response

    result = []
    driver = Driver()
    try:
        for t in topics:
            t_ = t.replace('"', '""')

            # Create the topic object.
            topic_obj = Topic(t)

            # Find out how many listing pages the topic has. A page with talks
            # but no pagination links is a single-page topic; a page with
            # neither is treated as a failed load and retried (bounded, unlike
            # the original endless loop).
            page_count = 1
            for _ in range(max_attempts):
                listing = BeautifulSoup(
                    fetch(listing_url, {"topics[]": t}).text, 'html.parser')
                page_links = listing.select('a.pagination__item')
                if page_links or listing.select("div.talk-link"):
                    numbers = [int(a.text) for a in page_links
                               if a.text.strip().isdigit()]
                    page_count = max(numbers, default=1)
                    break
                time.sleep(retry_pause)

            for n in range(1, page_count + 1):
                listing = BeautifulSoup(
                    fetch(listing_url, {"page": n, "topics[]": t}).text,
                    'html.parser')

                # One container per talk on the listing page.
                for cont in listing.select("div.talk-link"):
                    title_tag = cont.select_one("h4 a")  # talk title
                    speaker_tag = cont.select_one("h4.h12")  # talk speaker
                    if title_tag is None or speaker_tag is None:
                        continue
                    title = title_tag.text.strip()
                    speaker = speaker_tag.text.strip()

                    # Create the speech object.
                    speech_obj = Speech(title.replace('"', '""'), speaker)
                    print(title, speaker, t_)

                    # Build the URL slug: lower-case, every non-alphanumeric
                    # character becomes "_", runs collapsed, trailing "_" dropped.
                    title_slug = re.sub("[^a-zA-Z0-9]", "_", title.lower())
                    title_slug = re.sub("_{2,}", "_", title_slug).rstrip("_")
                    speaker_slug = re.sub("[^a-zA-Z0-9]", "_", speaker.lower())

                    # Open the transcript page.
                    url = ("https://www.ted.com/talks/" + speaker_slug + "_"
                           + title_slug + "/transcript")
                    x = fetch(url)

                    # Skip talks whose guessed URL does not exist, or that are
                    # still throttled/failing after the bounded retries.
                    if x.status_code in (404, 429, 500):
                        continue

                    # Assemble the transcript from the first <p> of each cell,
                    # with all internal whitespace collapsed to single spaces.
                    parts = []
                    for cell in BeautifulSoup(x.text, 'html.parser').select(transcript_css):
                        paragraphs = cell.select('p')
                        if paragraphs:
                            parts.append(" ".join(paragraphs[0].text.split()))
                    script = " ".join(parts)

                    if not script:
                        # Static HTML had no transcript: retry in the browser.
                        driver.get_url(url)
                        # ``title`` lives on the wrapped WebDriver, not on Driver.
                        if '404' in driver().title:
                            continue
                        try:
                            cells = WebDriverWait(driver(), 10).until(
                                EC.presence_of_all_elements_located(
                                    (By.CSS_SELECTOR, transcript_css)))
                        except TimeoutException:
                            cells = []
                        # Fixed: the original read ``.text`` from the list
                        # instead of from each element.
                        script = " ".join(cell.text.strip() for cell in cells)

                    speech_obj.script = script.replace('"', '""')
                    # Attach the speech to its topic.
                    topic_obj.add(speech_obj)

            result.append(topic_obj)
    finally:
        # Always shut the browser down so no headless Chrome process is leaked.
        driver().quit()

    return result

0개의 댓글