import requests
from bs4 import BeautifulSoup

response = requests.get("https://search.naver.com/search.naver?where=news&sm=tab_jum&query=%EC%82%BC%EC%84%B1%EC%A0%84%EC%9E%90")
html = response.text
soup = BeautifulSoup(html, "html.parser")
articles = soup.select("div.info_group")
for article in articles:
    links = article.select("a.info")
    if len(links) >= 2:
        url = links[1].attrs["href"]
import requests
from bs4 import BeautifulSoup
import time

response = requests.get("https://search.naver.com/search.naver?where=news&sm=tab_jum&query=%EC%82%BC%EC%84%B1%EC%A0%84%EC%9E%90")
html = response.text
soup = BeautifulSoup(html, "html.parser")
articles = soup.select("div.info_group")
for article in articles:
    links = article.select("a.info")
    if len(links) >= 2:
        url = links[1].attrs["href"]
        response = requests.get(url, headers={'User-agent': 'Mozilla/5.0'})  # send a User-agent header to avoid being blocked
        html = response.text  # get the HTML of each article URL
        soup = BeautifulSoup(html, "html.parser")  # parse each article's HTML
        content = soup.select_one("#dic_area")  # get the article body
        print(content.text)
        time.sleep(0.3)
import requests
from bs4 import BeautifulSoup
import time

response = requests.get("https://search.naver.com/search.naver?sm=tab_hty.top&where=news&query=%EB%B8%94%EB%9E%99%ED%95%91%ED%81%AC&oquery=%EC%82%BC%EC%84%B1%EC%A0%84%EC%9E%90&tqi=hyYPFlprvmsss4UxVAhssssssg4-385087")
html = response.text
soup = BeautifulSoup(html, "html.parser")
articles = soup.select("div.info_group")
for article in articles:
    links = article.select("a.info")
    if len(links) >= 2:
        url = links[1].attrs["href"]
        response = requests.get(url, headers={'User-agent': 'Mozilla/5.0'})  # send a User-agent header to avoid being blocked
        html = response.text  # get the HTML of each article URL
        soup = BeautifulSoup(html, "html.parser")  # parse each article's HTML
        # branch by article type: entertainment articles redirect to a page with a different layout
        if "entertain" in response.url:
            title = soup.select_one(".end_tit")  # get the title
            content = soup.select_one("#articeBody")  # get the body
        else:
            title = soup.select_one(".media_end_head_headline")  # get the title
            content = soup.select_one("#dic_area")  # get the body
        print("========LINK========\n", url)
        print("========TITLE========\n", title.text.strip())
        print("========BODY========\n", content.text.strip())
        time.sleep(0.3)
import requests
from bs4 import BeautifulSoup
import time

response = requests.get("https://search.naver.com/search.naver?where=news&sm=tab_jum&query=%EC%86%90%ED%9D%A5%EB%AF%BC")
html = response.text
soup = BeautifulSoup(html, "html.parser")
articles = soup.select("div.info_group")
for article in articles:
    links = article.select("a.info")
    if len(links) >= 2:
        url = links[1].attrs["href"]
        response = requests.get(url, headers={'User-agent': 'Mozilla/5.0'})  # send a User-agent header to avoid being blocked
        html = response.text  # get the HTML of each article URL
        soup = BeautifulSoup(html, "html.parser")  # parse each article's HTML
        # branch by article type: entertainment and sports articles redirect to pages with different layouts
        if "entertain" in response.url:
            title = soup.select_one(".end_tit")  # get the title
            content = soup.select_one("#articeBody")  # get the body
        elif "sports" in response.url:
            title = soup.select_one("h4.title")  # get the title
            content = soup.select_one("#newsEndContents")  # get the body
            # delete unnecessary elements nested inside the sports article body
            divs = content.select("div")
            for div in divs:
                div.decompose()
            paragraphs = content.select("p")
            for p in paragraphs:
                p.decompose()
        else:
            title = soup.select_one(".media_end_head_headline")  # get the title
            content = soup.select_one("#dic_area")  # get the body
        print("========LINK========\n", url)
        print("========TITLE========\n", title.text.strip())
        print("========BODY========\n", content.text.strip())
        time.sleep(0.3)
import requests
from bs4 import BeautifulSoup
import time
import pyautogui

keyword = pyautogui.prompt("keyword")
response = requests.get(f"https://search.naver.com/search.naver?where=news&sm=tab_jum&query={keyword}")
html = response.text
soup = BeautifulSoup(html, "html.parser")
articles = soup.select("div.info_group")
for article in articles:
    links = article.select("a.info")
    if len(links) >= 2:
        url = links[1].attrs["href"]
        response = requests.get(url, headers={'User-agent': 'Mozilla/5.0'})  # send a User-agent header to avoid being blocked
        html = response.text  # get the HTML of each article URL
        soup = BeautifulSoup(html, "html.parser")  # parse each article's HTML
        # branch by article type
        if "entertain" in response.url:
            title = soup.select_one(".end_tit")  # get the title
            content = soup.select_one("#articeBody")  # get the body
        elif "sports" in response.url:
            title = soup.select_one("h4.title")  # get the title
            content = soup.select_one("#newsEndContents")  # get the body
            # delete unnecessary elements nested inside the sports article body
            divs = content.select("div")
            for div in divs:
                div.decompose()
            paragraphs = content.select("p")
            for p in paragraphs:
                p.decompose()
        else:
            title = soup.select_one(".media_end_head_headline")  # get the title
            content = soup.select_one("#dic_area")  # get the body
        print("========LINK========\n", url)
        print("========TITLE========\n", title.text.strip())
        print("========BODY========\n", content.text.strip())
        time.sleep(0.3)
import requests
from bs4 import BeautifulSoup
import time
import pyautogui

# user input
keyword = pyautogui.prompt("keyword")
lastpage = int(pyautogui.prompt("page"))

page_num = 1
for i in range(1, lastpage * 10, 10):
    print(f"{page_num} page...")
    response = requests.get(f"https://search.naver.com/search.naver?where=news&sm=tab_jum&query={keyword}&start={i}")
    html = response.text
    soup = BeautifulSoup(html, "html.parser")
    articles = soup.select("div.info_group")
    for article in articles:
        links = article.select("a.info")
        if len(links) >= 2:
            url = links[1].attrs["href"]
            response = requests.get(url, headers={'User-agent': 'Mozilla/5.0'})  # send a User-agent header to avoid being blocked
            html = response.text  # get the HTML of each article URL
            soup = BeautifulSoup(html, "html.parser")  # parse each article's HTML
            # branch by article type
            if "entertain" in response.url:
                title = soup.select_one(".end_tit")  # get the title
                content = soup.select_one("#articeBody")  # get the body
            elif "sports" in response.url:
                title = soup.select_one("h4.title")  # get the title
                content = soup.select_one("#newsEndContents")  # get the body
                # delete unnecessary elements nested inside the sports article body
                divs = content.select("div")
                for div in divs:
                    div.decompose()
                paragraphs = content.select("p")
                for p in paragraphs:
                    p.decompose()
            else:
                title = soup.select_one(".media_end_head_headline")  # get the title
                content = soup.select_one("#dic_area")  # get the body
            print("========LINK========\n", url)
            print("========TITLE========\n", title.text.strip())
            print("========BODY========\n", content.text.strip())
            time.sleep(0.3)
    page_num += 1
query=%EC%86%90%ED%9D%A5%EB%AF%BC&start=1
query=%EC%86%90%ED%9D%A5%EB%AF%BC&start=11
Since the trailing start parameter grows by 10 every time the page number increases by 1, range(1, lastpage * 10, 10) produces the values to substitute into start.
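As a quick standalone check (not part of the crawler itself), the mapping from page number to start value produced by that range looks like this; lastpage = 3 is just an example value:

# hypothetical example: how page numbers map to the start parameter
lastpage = 3
for page_num, start in enumerate(range(1, lastpage * 10, 10), start=1):
    print(f"page {page_num} -> start={start}")
# page 1 -> start=1
# page 2 -> start=11
# page 3 -> start=21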
pip install python-docx
from docx import Document
document = Document()
document.add_heading('Title', level=0)
document.add_paragraph('Article Link')
document.add_paragraph('Article Body')
document.save("test.docx")
import requests
from bs4 import BeautifulSoup
import time
import pyautogui
from docx import Document

# user input
keyword = pyautogui.prompt("keyword")
lastpage = int(pyautogui.prompt("page"))

# create the docx file
document = Document()

page_num = 1
for i in range(1, lastpage * 10, 10):
    print(f"{page_num} page...")
    response = requests.get(f"https://search.naver.com/search.naver?where=news&sm=tab_jum&query={keyword}&start={i}")
    html = response.text
    soup = BeautifulSoup(html, "html.parser")
    articles = soup.select("div.info_group")
    for article in articles:
        links = article.select("a.info")
        if len(links) >= 2:
            url = links[1].attrs["href"]
            response = requests.get(url, headers={'User-agent': 'Mozilla/5.0'})  # send a User-agent header to avoid being blocked
            html = response.text  # get the HTML of each article URL
            soup = BeautifulSoup(html, "html.parser")  # parse each article's HTML
            # branch by article type
            if "entertain" in response.url:
                title = soup.select_one(".end_tit")  # get the title
                content = soup.select_one("#articeBody")  # get the body
            elif "sports" in response.url:
                title = soup.select_one("h4.title")  # get the title
                content = soup.select_one("#newsEndContents")  # get the body
                # delete unnecessary elements nested inside the sports article body
                divs = content.select("div")
                for div in divs:
                    div.decompose()
                paragraphs = content.select("p")
                for p in paragraphs:
                    p.decompose()
            else:
                title = soup.select_one(".media_end_head_headline")  # get the title
                content = soup.select_one("#dic_area")  # get the body
            print("========LINK========\n", url)
            print("========TITLE========\n", title.text.strip())
            print("========BODY========\n", content.text.strip())
            # write the data to the docx file
            document.add_heading(title.text.strip(), level=0)
            document.add_paragraph(url)
            document.add_paragraph(content.text.strip())
            time.sleep(0.3)
    page_num += 1
document.save(f"{keyword}_result.docx")
pip install openpyxl
from openpyxl import Workbook
wb = Workbook()
ws = wb.create_sheet("달승환")
ws["A1"] = "달승환"
wb.save("test.xlsx")
import requests
from bs4 import BeautifulSoup
import time
import pyautogui
from openpyxl import Workbook
from openpyxl.styles import Alignment

# user input
keyword = pyautogui.prompt("keyword")
lastpage = int(pyautogui.prompt("page"))

# create the xlsx file
wb = Workbook()
ws = wb.create_sheet(keyword)
ws.column_dimensions['A'].width = 60
ws.column_dimensions['B'].width = 60
ws.column_dimensions['C'].width = 120

# variables
page_num = 1
row = 1
for i in range(1, lastpage * 10, 10):
    print(f"{page_num} page...")
    response = requests.get(f"https://search.naver.com/search.naver?where=news&sm=tab_jum&query={keyword}&start={i}")
    html = response.text
    soup = BeautifulSoup(html, "html.parser")
    articles = soup.select("div.info_group")
    for article in articles:
        links = article.select("a.info")
        if len(links) >= 2:
            url = links[1].attrs["href"]
            response = requests.get(url, headers={'User-agent': 'Mozilla/5.0'})  # send a User-agent header to avoid being blocked
            html = response.text  # get the HTML of each article URL
            soup = BeautifulSoup(html, "html.parser")  # parse each article's HTML
            # branch by article type
            if "entertain" in response.url:
                title = soup.select_one(".end_tit")  # get the title
                content = soup.select_one("#articeBody")  # get the body
            elif "sports" in response.url:
                title = soup.select_one("h4.title")  # get the title
                content = soup.select_one("#newsEndContents")  # get the body
                # delete unnecessary elements nested inside the sports article body
                divs = content.select("div")
                for div in divs:
                    div.decompose()
                paragraphs = content.select("p")
                for p in paragraphs:
                    p.decompose()
            else:
                title = soup.select_one(".media_end_head_headline")  # get the title
                content = soup.select_one("#dic_area")  # get the body
            print("========LINK========\n", url)
            print("========TITLE========\n", title.text.strip())
            print("========BODY========\n", content.text.strip())
            # write the data to the xlsx file
            ws[f"A{row}"] = url
            ws[f"B{row}"] = title.text.strip()
            ws[f"C{row}"] = content.text.strip()
            ws[f"C{row}"].alignment = Alignment(wrap_text=True)  # wrap long text inside the cell
            row += 1
            time.sleep(0.3)
    page_num += 1
wb.save(f"{keyword}_result.xlsx")
import requests
from bs4 import BeautifulSoup
import time
import pyautogui

# user input
keyword = pyautogui.prompt("keyword")
lastpage = int(pyautogui.prompt("page"))

page_num = 1
for i in range(1, lastpage * 10, 10):
    print(f"{page_num} page...")
    response = requests.get(f"https://search.naver.com/search.naver?where=news&sm=tab_jum&query={keyword}&start={i}")
    html = response.text
    soup = BeautifulSoup(html, "html.parser")
    articles = soup.select("div.info_group")
    for article in articles:
        links = article.select("a.info")
        if len(links) >= 2:
            url = links[1].attrs["href"]
            response = requests.get(url, headers={'User-agent': 'Mozilla/5.0'})  # send a User-agent header to avoid being blocked
            html = response.text  # get the HTML of each article URL
            soup_sub = BeautifulSoup(html, "html.parser")  # parse each article in a separate soup so the search-page soup stays intact
            # branch by article type
            if "entertain" in response.url:
                title = soup_sub.select_one(".end_tit")  # get the title
                content = soup_sub.select_one("#articeBody")  # get the body
            elif "sports" in response.url:
                title = soup_sub.select_one("h4.title")  # get the title
                content = soup_sub.select_one("#newsEndContents")  # get the body
                # delete unnecessary elements nested inside the sports article body
                divs = content.select("div")
                for div in divs:
                    div.decompose()
                paragraphs = content.select("p")
                for p in paragraphs:
                    p.decompose()
            else:
                title = soup_sub.select_one(".media_end_head_headline")  # get the title
                content = soup_sub.select_one("#dic_area")  # get the body
            print("========LINK========\n", url)
            print("========TITLE========\n", title.text.strip())
            print("========BODY========\n", content.text.strip())
            time.sleep(0.3)
    # check whether this is the last page
    is_last_page = soup.select_one("a.btn_next").attrs["aria-disabled"]
    if is_last_page == "true":
        print("last page")
        break
    page_num += 1
pip install matplotlib
pip install wordcloud
pip install konlpy
1. Install a Java JDK whose bit count matches your OS and whose version is 1.7 or higher.
C:\Program Files\Java\jdk-17.0.4.1\bin\server
2. Download the JPype1 wheel that matches your environment (in the filename, cp -> Python version, win -> OS bit count) and install it:
C:\Users\USER>cd Downloads
C:\Users\USER\Downloads>pip install JPype1-1.4.0-cp39-cp39-win_amd64.whl
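After the wheel installs, a quick sanity check (a minimal sketch; the version numbers and paths above are only examples) confirms that JPype and konlpy can start the JVM:

# minimal check that JPype and konlpy are installed and can reach the JVM (assumes the Java/JPype steps above succeeded)
import jpype
from konlpy.tag import Okt

print(jpype.__version__)            # e.g. 1.4.0
print(Okt().nouns("자바 설치 확인"))  # should print a list of nouns such as ['자바', '설치', '확인']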
import matplotlib.pyplot as plt  # used to visualize the generated word cloud (in Google Colab or Jupyter Notebook)
from wordcloud import WordCloud  # the core module for generating word clouds
from konlpy.tag import Okt  # a widely used morphological analyzer package for Korean
# Several analyzer packages exist (Okt, Kkma, ...), and each one handles morphemes such as nouns slightly differently.
# Try a few and use the analyzer that best fits the documents you are working with.
from collections import Counter  # used to count word frequencies; by default, the more often a word appears, the larger it is drawn in the word cloud
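Before wiring these into the crawler, a small standalone sketch shows how the pieces fit together; the sample sentence, font path, and output file name are placeholders, not part of the project:

# standalone sketch: nouns -> frequency counts -> word cloud image (sample text and font path are placeholders)
from collections import Counter
from konlpy.tag import Okt
from wordcloud import WordCloud

sample = "손흥민 선수가 경기에서 골을 넣었다"
nouns = Okt().nouns(sample)                                  # extract nouns from the sample text
counts = Counter(word for word in nouns if len(word) > 1)    # drop one-character nouns and count the rest
wc = WordCloud(font_path="malgun.ttf", width=640, height=360)  # a Korean font is required to render Hangul
wc.generate_from_frequencies(counts).to_file("sample_wordcloud.png")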
import requests
from bs4 import BeautifulSoup
import time
import pyautogui
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from konlpy.tag import Okt
from collections import Counter

# user input
keyword = pyautogui.prompt("keyword")
lastpage = int(pyautogui.prompt("page"))

page_num = 1
total_content = ""
article_num = 0
for i in range(1, lastpage * 10, 10):
    print(f"{page_num} page...")
    response = requests.get(f"https://search.naver.com/search.naver?where=news&sm=tab_jum&query={keyword}&start={i}")
    html = response.text
    soup = BeautifulSoup(html, "html.parser")
    articles = soup.select("div.info_group")
    for article in articles:
        links = article.select("a.info")
        if len(links) >= 2:
            url = links[1].attrs["href"]
            response = requests.get(url, headers={'User-agent': 'Mozilla/5.0'})  # send a User-agent header to avoid being blocked
            html = response.text  # get the HTML of each article URL
            soup_sub = BeautifulSoup(html, "html.parser")  # parse each article in a separate soup so the search-page soup stays intact
            # branch by article type
            if "entertain" in response.url:
                content = soup_sub.select_one("#articeBody")  # get the body
            elif "sports" in response.url:
                content = soup_sub.select_one("#newsEndContents")  # get the body
                # delete unnecessary elements nested inside the sports article body
                divs = content.select("div")
                for div in divs:
                    div.decompose()
                paragraphs = content.select("p")
                for p in paragraphs:
                    p.decompose()
            else:
                content = soup_sub.select_one("#dic_area")  # get the body
            print("========BODY========\n", content.text.strip())
            total_content += content.text.strip()
            article_num += 1
            time.sleep(0.3)
    # check whether this is the last page
    is_last_page = soup.select_one("a.btn_next").attrs["aria-disabled"]
    if is_last_page == "true":
        print("last page")
        break
    page_num += 1
print(f"{article_num} articles")

# wordcloud
okt = Okt()
nouns = okt.nouns(total_content)  # extract nouns from the combined article text
words = [word for word in nouns if len(word) > 1]  # keep only nouns longer than one character
cnt = Counter(words)  # count word frequencies
wc = WordCloud(font_path="malgun", width=1280, height=720, scale=2.0, max_font_size=250)  # Malgun Gothic (Windows); point font_path to another Korean font elsewhere
gen = wc.generate_from_frequencies(cnt)
plt.figure()
plt.imshow(gen)
wc.to_file(f"{keyword}_wordcloud.png")