-TED 강연 제목/대본 크롤링- 20.07.19 ~ 20.07.21
class_define.py
class Topic:
    """A named topic together with the speeches collected under it."""

    def __init__(self, name) -> None:
        """Remember the topic *name* and start with no speeches."""
        # Speeches are attached later, one at a time, through ``add``.
        self.speech = []
        self.name = name

    def add(self, speech):
        """Append *speech* to the end of this topic's speech list."""
        collected = self.speech
        collected.append(speech)

    def __str__(self):
        """Return the topic name as the string form of the object."""
        return self.name
class Speech:
    """One talk: its title, its speaker and (once fetched) its script."""

    def __init__(self, title, speaker) -> None:
        """Store *title* and *speaker*; the script starts out unset."""
        # ``script`` stays ``None`` until some caller assigns the text.
        self.script = None
        self.title, self.speaker = title, speaker

    def __str__(self):
        """Return the talk title as the string form of the object."""
        return self.title
selenium_class.py
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
class Driver:
    """Convenience wrapper around a headless Chrome WebDriver.

    The wrapped Selenium driver is stored in ``self.driver`` and is also
    returned by calling the instance (``driver()``).  The ``find_*`` helpers
    block on an explicit ``WebDriverWait`` until the element(s) are present
    in the DOM and raise Selenium's timeout error otherwise.
    """

    def __init__(self):
        """Start a headless Chrome session with a desktop user-agent."""
        options = webdriver.ChromeOptions()
        options.add_argument('headless')
        options.add_argument(
            "user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36")
        # NOTE(review): the driver path is passed positionally, which matches
        # the Selenium 3 constructor; newer Selenium releases expect a
        # ``Service`` object instead -- confirm against the pinned version.
        self.driver = webdriver.Chrome(ChromeDriverManager().install(), options=options)

    def __call__(self):
        """Return the underlying Selenium WebDriver."""
        return self.driver

    @property
    def title(self):
        """Title of the page currently loaded in the wrapped driver."""
        return self.driver.title

    def get_url(self, url):
        """Navigate the browser to *url*."""
        self.driver.get(url)

    def get_default(self):
        """Switch back to the top-level document, retrying until it works."""
        while True:
            try:
                # ``switch_to.default_content()`` replaces the deprecated
                # ``switch_to_default_content()`` helper.
                self.driver.switch_to.default_content()
                return
            except Exception:
                # ``Exception`` (not a bare ``except``) so Ctrl-C still
                # interrupts the retry loop.
                print('default frame 이동')

    def get_fra(self, name):
        """Switch into the frame called *name*, retrying until it works.

        After each failed attempt the context is reset to the top-level
        document before trying again.
        """
        while True:
            try:
                # ``switch_to.frame()`` replaces the deprecated
                # ``switch_to_frame()`` helper.
                self.driver.switch_to.frame(name)
                return
            except Exception:
                self.get_default()
                print(name, 'frame 이동')

    def get_top(self):
        """Switch into the ``top`` frame, retrying until it works.

        After each failed attempt the ``body`` frame is entered first and
        the switch to ``top`` is tried again from there.
        """
        while True:
            try:
                self.driver.switch_to.frame('top')
                return
            except Exception:
                self.get_fra('body')
                print('body', 'frame 이동')

    def _wait_one(self, root, by, value, timeout):
        """Wait up to *timeout* seconds for one match under *root*."""
        return WebDriverWait(root, timeout).until(
            EC.presence_of_element_located((by, value)))

    def _wait_all(self, root, by, value, timeout):
        """Wait up to *timeout* seconds for all matches under *root*."""
        return WebDriverWait(root, timeout).until(
            EC.presence_of_all_elements_located((by, value)))

    def find_by_xpath(self, xpath):
        """Return the first element matching *xpath* (10 s wait)."""
        return self._wait_one(self.driver, By.XPATH, xpath, 10)

    def find_by_class(self, class_name):
        """Return the first element with class *class_name* (10 s wait)."""
        return self._wait_one(self.driver, By.CLASS_NAME, class_name, 10)

    def find_by_tag(self, tag):
        """Return the first element with tag name *tag* (10 s wait)."""
        return self._wait_one(self.driver, By.TAG_NAME, tag, 10)

    def find_by_name(self, name):
        """Return the first element whose ``name`` is *name* (10 s wait)."""
        return self._wait_one(self.driver, By.NAME, name, 10)

    def find_all_by_class(self, class_name):
        """Return every element with class *class_name* (10 s wait)."""
        # Fixed: this used to search by tag name, so a class name never
        # matched anything.
        return self._wait_all(self.driver, By.CLASS_NAME, class_name, 10)

    def find_all_by_tag(self, tag):
        """Return every element with tag name *tag* (10 s wait)."""
        return self._wait_all(self.driver, By.TAG_NAME, tag, 10)

    def find_all_by_name(self, name):
        """Return every element whose ``name`` is *name* (10 s wait)."""
        return self._wait_all(self.driver, By.NAME, name, 10)

    def find_all_by_tag_with_obj(self, obj, name):
        """Return every element with tag *name* found via *obj* (20 s wait)."""
        return self._wait_all(obj, By.TAG_NAME, name, 20)

    def find_by_tag_with_obj(self, obj, name):
        """Return the first element with tag *name* found via *obj* (20 s wait)."""
        return self._wait_one(obj, By.TAG_NAME, name, 20)

    def find_by_link(self, text):
        """Return the first link whose text is exactly *text* (10 s wait)."""
        return self._wait_one(self.driver, By.LINK_TEXT, text, 10)

    def click(self, btn):
        """Click *btn* by running ``click()`` on it through JavaScript."""
        self.driver.execute_script("arguments[0].click();", btn)
Ted.py
result = []
driver = Driver()
for t in topics:
t_ = t.replace('"', '""')
topic_obj = Topic(t)
while True:
raw = requests.get("https://www.ted.com/talks?topics%5B%5D=" + t)
html = BeautifulSoup(raw.text, 'html.parser')
try:
max_pagination = html.select('a.pagination__item')[-1]
break
except:
continue
for n in range(1, int(max_pagination.text) + 1):
raw = requests.get("https://www.ted.com/talks?page=" + str(n) + "&topics%5B%5D=" + t)
html = BeautifulSoup(raw.text, 'html.parser')
container = html.select("div.talk-link")
for cont in container:
Title = cont.select_one("h4 a")
Speaker = cont.select_one("h4.h12")
title = Title.text.strip()
speaker = Speaker.text.strip()
speech_obj = Speech(title.replace('"', '""'), speaker)
print(title, speaker, t_)
title = "_".join(title.split())
title = re.sub("[^a-zA-Z0-9]", "_", title.lower())
title = re.sub("_{2,}", "_", title.lower())
if title[-1] == "_":
title = title[:-1]
speaker = re.sub("[^a-zA-Z0-9]", "_", speaker.lower())
url = "https://www.ted.com/talks/" + speaker + "_" + title + "/transcript"
x = requests.get(url)
if x.status_code == 404:
continue
while x.status_code == 429 or x.status_code == 500:
url = "https://www.ted.com/talks/" + speaker + "_" + title + "/transcript"
x = requests.get(url)
script = ''
html = BeautifulSoup(x.text, 'html.parser')
container_1 = html.select("div.Grid__cell.flx-s\:1")
for k in container_1:
k = k.select('p')
script += k[0].text.strip().replace('\n', ' ').replace(
' ', '') + ' '
if script == '':
driver.get_url(url)
if '404' in driver.title:
continue
div = driver.find_all_by_class('div.Grid__cell.flx-s\:1')
for i in div:
script += div.text.strip()
speech_obj.script = script.replace('"', '""')
topic_obj.add(speech_obj)
result.append(topic_obj)
return result