1. Downloading images from a search results page
import os

import requests
from bs4 import BeautifulSoup

# Build the Google image-search URL for the keyword.
search_url = "https://www.google.com/search?tbm=isch&q="
keyword = "맥주"  # search keyword ("beer")
url = search_url + keyword

# Fetch the result page and collect every <img> tag.
response = requests.get(url)
html = response.text
soup = BeautifulSoup(html, "html.parser")
imgs = soup.find_all('img')

# Create the output folder once.
folder_name = 'imgs'
if not os.path.exists(folder_name):
    os.mkdir(folder_name)

# Skip the first <img> (the Google logo) and save the rest.
for idx, img in enumerate(imgs[1:]):
    src = img.get('src')
    if not src or not src.startswith('http'):
        continue  # skip missing, relative, or inline data: sources
    file_name = f"img_{idx}.jpg"
    file_path = os.path.join(folder_name, file_name)
    img_response = requests.get(src)
    img_data = img_response.content
    with open(file_path, "wb") as f:
        f.write(img_data)
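
Depending on the page variant Google serves, many thumbnail <img> tags carry inline base64 data: URIs instead of plain http URLs, which the loop above skips. A minimal sketch for saving those as well, assuming the usual "data:image/...;base64,<payload>" format (the save_data_uri helper is illustrative, not part of the original code):

import base64

def save_data_uri(data_uri, file_path):
    # A data URI looks like "data:image/jpeg;base64,<payload>".
    header, _, payload = data_uri.partition(",")
    if "base64" not in header:
        return False  # not base64-encoded; nothing we can decode here
    with open(file_path, "wb") as f:
        f.write(base64.b64decode(payload))
    return True

# Example usage inside the download loop, before the http check:
# if src and src.startswith("data:"):
#     save_data_uri(src, os.path.join(folder_name, f"img_{idx}.jpg"))
#     continue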
2. Saving news headlines and the article bodies they link to
import os

import requests
from bs4 import BeautifulSoup

# Prepare the output folder and file for the collected text.
folder_name = "test_string_Crawling"
file_name = 'headlines.txt'
if not os.path.exists(folder_name):
    os.mkdir(folder_name)
file_path = os.path.join(folder_name, file_name)

# Naver rejects requests without a browser-like User-Agent.
headers = {"User-Agent": "Mozilla/5.0"}
url = 'https://news.naver.com/main/main.naver?mode=LSD&mid=shm&sid1=105'
response = requests.get(url, headers=headers)
html = response.text
soup = BeautifulSoup(html, "html.parser")

# Headline links sit inside the list_body container.
div = soup.body.find('div', attrs={'class': 'list_body'})
headlines = div.find_all('a', attrs={'class': 'cluster_text_headline'})

# Follow each headline link and append its title and body text to the file.
for headline in headlines:
    response = requests.get(headline['href'], headers=headers)
    html = response.text
    soup = BeautifulSoup(html, "html.parser")
    div_texts = soup.body.find('div', attrs={'id': 'dic_area'}).find_all(string=True)
    with open(file_path, 'a', encoding="UTF-8") as f:
        f.write(f"{headline.text.strip()}\n")
        for div_text in div_texts:
            f.write(f"{div_text}")
        f.write("==========================================\n")