Web Page Crawling

허선우 · June 8, 2021

PYTHON


Crawling comments from EBS 명의 (Best Doctors)


# Two crawler functions share these imports
from bs4 import BeautifulSoup
import urllib.request
import re


def ebs_bestdoctor_href(page):  # page: number of list pages to collect

  params3 = []
  for p in range(1, page+1):
    list_url = "https://bestdoctors.ebs.co.kr/bestdoctors/board/6/510093/list?c.page="+str(p)+"&fileClsCd=ANY&hmpMnuId=101&searchCondition=&searchConditionValue=0&searchKeywordValue=0&searchKeyword=&bbsId=510093&"
    url = urllib.request.Request(list_url)
    f = urllib.request.urlopen(url).read().decode("utf-8")

    soup = BeautifulSoup(f, 'html.parser')
    params2 = []
    notice = 0
    for i in soup.find_all('div', class_='txtcut'):  # <div> tags whose class is 'txtcut'
      if notice >= 7:  # the first 7 rows are pinned notices, so skip them
        for j in i.find_all('a'):  # keep only the <a> links inside each row
          params2.append("https://bestdoctors.ebs.co.kr"+j.get("href"))
      notice += 1
    params3 += params2
  return params3
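Since the counter above exists only to skip the pinned notices at the top of each list page, the same filtering can also be written as a slice. A minimal standalone sketch of that variant, assuming the same markup (the 'txtcut' class and the count of seven notices come from the function above; board_links is a hypothetical helper name):

from bs4 import BeautifulSoup

def board_links(html, notice_count=7):
    # Collect post links from one list page, skipping the pinned notices
    # that occupy the first notice_count rows (7 on this board).
    soup = BeautifulSoup(html, 'html.parser')
    return ["https://bestdoctors.ebs.co.kr" + a.get("href")
            for row in soup.find_all('div', class_='txtcut')[notice_count:]
            for a in row.find_all('a')]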




w_f = open("C://data//명의.csv", "w", encoding='utf8')

count = 0
url_list = ebs_bestdoctor_href(20)  # collect the post URLs from pages 1 through 20

for post_url in url_list:
  url = urllib.request.Request(post_url)
  f = urllib.request.urlopen(url).read().decode("utf-8")
  soup = BeautifulSoup(f, 'html.parser')

  for comment in soup.find_all('div', class_='con_txt'):  # each comment body
      count += 1
      w_f.write(str(count)+':\t'+re.sub('[\xa0\n\t\r]','',comment.text)+'\n')

w_f.close()
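One caveat with the script above: if any request raises an exception mid-loop, the file handle is never closed. A with block closes it automatically; here is the same loop restructured that way, with behavior otherwise unchanged:

import re
import urllib.request
from bs4 import BeautifulSoup

count = 0
with open("C://data//명의.csv", "w", encoding='utf8') as w_f:
    for post_url in ebs_bestdoctor_href(20):
        html = urllib.request.urlopen(urllib.request.Request(post_url)).read().decode("utf-8")
        soup = BeautifulSoup(html, 'html.parser')
        for comment in soup.find_all('div', class_='con_txt'):  # each comment body
            count += 1
            w_f.write(str(count) + ':\t' + re.sub('[\xa0\n\t\r]', '', comment.text) + '\n')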

Crawling JoongAng Ilbo articles

(The search below queries news.joins.com, the JoongAng Ilbo site, for the keyword 딥러닝.)

import re
from bs4 import BeautifulSoup
import urllib.request

def choo_article(page):
    # Gather article links from every search-result page for the keyword
    # '딥러닝' (deep learning), accumulating the links across pages.
    params = []
    for i in range(1, page+1):
        urllist = 'https://news.joins.com/Search/JoongangNews?page='+str(i)+'&Keyword=%EB%94%A5%EB%9F%AC%EB%8B%9D&SortType=New&SearchCategoryType=JoongangNews'
        url = urllib.request.Request(urllist)
        f = urllib.request.urlopen(url).read().decode("utf-8")
        soup = BeautifulSoup(f, 'html.parser')
        params += [h.a.get('href') for h in soup.find_all('h2', class_="headline mg")]
    return params
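The hard-coded query string is easy to get wrong; urllib.parse.urlencode can build it from a dict and handle the percent-encoding of the Korean keyword. A small sketch (search_url is a hypothetical helper; the parameter values mirror the URL in choo_article above):

from urllib.parse import urlencode

def search_url(page_no):
    # Build one search-result URL from named parameters; the values
    # mirror the hard-coded query string in choo_article above.
    query = urlencode({
        'page': page_no,
        'Keyword': '딥러닝',  # urlencode percent-encodes the Korean keyword
        'SortType': 'New',
        'SearchCategoryType': 'JoongangNews',
    })
    return 'https://news.joins.com/Search/JoongangNews?' + query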


A = int(input("How many pages would you like to scrape? "))

file = open("C://data//동아일보.txt", 'w', encoding='utf8')
article_urls = choo_article(A)  # fetch the link list once, not on every loop pass
for i in range(len(article_urls)):
    url2 = urllib.request.Request(article_urls[i])
    f2 = urllib.request.urlopen(url2).read().decode("utf-8")
    soup = BeautifulSoup(f2, 'html.parser')
    params2 = [re.sub(r"\xa0|\n|\t|\r", '', body.text)+'\n' for body in soup.find_all('div', class_="article_body mg fs4")]
    if params2:  # some pages have no matching article body tag
        file.write('Article '+str(i)+'\n'+params2[0])
file.close()
print('Scraping finished successfully.')
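Neither script sends a User-Agent header or pauses between requests, which some news sites throttle or block. A hedged sketch of a fetch helper that does both (fetch is a hypothetical name; the header string and the one-second pause are arbitrary choices, not taken from the original code):

import time
import urllib.request

def fetch(url, pause=1.0):
    # Download one page while identifying the client and waiting between
    # requests, so the target server is not hammered.
    req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    html = urllib.request.urlopen(req).read().decode('utf-8')
    time.sleep(pause)  # crude rate limit; tune to the site's tolerance
    return html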
