Today I learned: 시각화

O(logn)·2024년 11월 26일

matplotlib seaborn web scraping

파이썬 웹크롤링

목록 보기

10/12

강의 예제

질문 목록 6페이지 중 제목 추출하여 워드클라우드 시각화하기

강의 예제 코드 설명

질문 제목 스크래핑 해오기

# Pagination이 되어있는 질문 리스트의 제목을 모두 가져와 리스트 questions에 저장해봅시다.
# https://hashcode.co.kr/?page={i}
# 과도한 요청을 방지하기 위해 0.5초마다 요청을 보내봅시다.
import time

import requests
from bs4 import BeautifulSoup

# Collect the question titles from the paginated question list into `questions`.

# Browser-like User-Agent sent with every request. It has to be defined here
# because nothing else in this script provides it.
user_agent = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/120.0.0.0 Safari/537.36"
)

questions = []

# Pages 1 through 6 inclusive (range() excludes its upper bound).
for page in range(1, 7):
    res = requests.get(
        "https://hashcode.co.kr/?page={}".format(page),
        # Must be passed by keyword: the second positional argument of
        # requests.get() is `params` (query string), not `headers`.
        headers={"User-Agent": user_agent},
        # Do not hang forever if the server stops responding.
        timeout=10,
    )
    # Fail loudly on 4xx/5xx instead of silently parsing an error page.
    res.raise_for_status()
    soup = BeautifulSoup(res.text, "html.parser")

    # Every <li class="question-list-item"> element is one question entry.
    for item in soup.find_all("li", "question-list-item"):
        # Skip entries that have no <h4> title element.
        if item.h4 is not None:
            questions.append(item.h4.text.strip())

    # Wait 0.5 seconds between page requests to avoid hammering the server.
    time.sleep(0.5)

print(questions[:10])

for..: 페이지를 1부터 6까지 순회
requests.get(): 해당 웹사이트의 html파일을 요청하는 코드
BeautifulSoup(res.text, "html.parser"): 받아온 html파일을 텍스트로 바꿔서 파싱(구조 분해)하는 코드
soup.find_all(): 태그의 종류와 클래스 이름을 입력하여 해당되는 모든 요소를 찾아 저장
for...: "question-list-item" 클래스를 가진 모든 li 태그에서 h4의 텍스트만 가져와 questions 리스트에 저장
time.sleep(0.5): 서버의 부담을 덜기 위해 0.5초의 지연 시간 추가

# 텍스트 구름을 그리기 위해 필요한 라이브러리를 불러와봅시다.

# 시각화에 쓰이는 라이브러리
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# 횟수를 기반으로 딕셔너리 생성
from collections import Counter

# 문장에서 명사를 추출하는 형태소 분석 라이브러리
from konlpy.tag import Hannanum

# Create a Hannanum analyzer and extract the nouns with .nouns().
# The titles are joined with a space; joining with "" would glue the last
# word of one title to the first word of the next one.

hannanum = Hannanum()
nouns = hannanum.nouns(" ".join(questions))

# Count how many times each noun occurs.

counter = Counter(nouns)

# Build the word cloud from the noun frequencies.
# font_path is relative to the current working directory, so the
# NotoSansKR-Regular.ttf file must be present there.

wordcloud = WordCloud(
    font_path="./NotoSansKR-Regular.ttf",
    background_color="white",
    width=1000,
    height=1000,
)

img = wordcloud.generate_from_frequencies(counter)

# Draw the image without axis ticks and display the figure explicitly so the
# result is also visible when this runs as a plain script.
plt.imshow(img)
plt.axis("off")
plt.show()