from bs4 import BeautifulSoup
from selenium import webdriver
import time
import sys
import os
import requests
import pandas as pd
f_name = os.getcwd()
fc_name = os.getcwd()
fx_name = os.getcwd()
while True:
    try:
        site = input("사이트를 입력하세요\n")
        if site.find("naver") != -1 or site.find("네이버") != -1:
            site = "https://search.naver.com/search.naver"
            f_name += "\\py_site2\\naver.txt"
            fc_name += "\\py_site2\\naver.csv"
            fx_name += "\\py_site2\\naver.xls"
            break
        elif site.find("daum") != -1 or site.find("다음") != -1:
            site = "https://search.daum.net/search"
            f_name += "\\py_site2\\daum.txt"
            fc_name += "\\py_site2\\daum.csv"
            fx_name += "\\py_site2\\daum.xls"
            break
        elif site.find("google") != -1 or site.find("구글") != -1:
            site = "https://www.google.co.kr/search"
            f_name += "\\py_site2\\google.txt"
            fc_name += "\\py_site2\\google.csv"
            fx_name += "\\py_site2\\google.xls"
            break
        else:
            print("지원하지 않는 사이트입니다.")
    except:
        print("오류가 발생하였습니다.")
query_txt = input('크롤링할 키워드는 무엇입니까\n')
if site == "https://search.naver.com/search.naver":
params = {'where':'kin', 'query': query_txt}
resp = requests.get(site, params = params)
full_html = resp.content
soup = BeautifulSoup(full_html, 'html.parser')
content_list = soup.select('ul.lst_total > li')
no =1
numbers = []
questions=[]
answers=[]
for i in content_list:
numbers.append(no)
print('번호: ', no)
question = i.find('div', 'question_group').get_text()
questions.append(question)
print('질문: ', question.strip())
answer = i.find('div', 'answer_group').get_text()
answers.append(answer)
print('답변: ', answer.strip())
no += 1
DB = pd.DataFrame()
DB['번호'] = numbers
DB['질문'] = questions
DB['답변'] = answers
DB.to_csv(fc_name, encoding='utf-8-sig')
DB.to_excel(fx_name)
f = open(f_name, 'a', encoding='UTF-8')
f.write(str(numbers))
f.write(str(questions))
f.write(str(answers))
f.close()
elif site == "https://search.daum.net/search":
params = {'w':'blog', 'q': query_txt}
resp = requests.get(site, params = params)
full_html = resp.content
soup = BeautifulSoup(full_html, 'html.parser')
content_list = soup.select('ul.list_info > li')
no =1
numbers = []
titles=[]
bodies=[]
for i in content_list:
numbers.append(no)
print('번호: ', no)
title = i.find('div', 'wrap_tit mg_tit').get_text()
titles.append(title)
print('제목: ', title.strip())
body = i.find('p', 'f_eb desc').get_text()
bodies.append(body)
print('본문: ', body.strip())
no += 1
DB = pd.DataFrame()
DB['번호'] = numbers
DB['제목'] = titles
DB['본문'] = bodies
DB.to_csv(fc_name, encoding='utf-8-sig')
DB.to_excel(fx_name)
f = open(f_name, 'a', encoding='UTF-8')
f.write(str(numbers))
f.write(str(questions))
f.write(str(bodies))
f.close()
elif site == "https://www.google.co.kr/search":
params = {'q': query_txt, 'tbm':'nws'}
resp = requests.get(site, params = params)
time.sleep(1)
full_html = resp.content
soup = BeautifulSoup(full_html, 'html.parser')
content_list = soup.select('div#main div.ZINbbc')
no =1
numbers = []
titles=[]
bodies=[]
for i in content_list:
title = i.find('div', class_ ='BNeawe vvjwJb AP7Wnd')
if title == None:
continue
numbers.append(no)
print('번호: ', no)
titles.append(title.get_text())
print('제목: ', title.get_text().strip())
body = i.find('div', class_ ='BNeawe s3v9rd AP7Wnd').get_text()
bodies.append(body)
print('본문: ', body.strip())
no += 1
DB = pd.DataFrame()
DB['번호'] = numbers
DB['제목'] = titles
DB['본문'] = bodies
DB.to_csv(fc_name, encoding='utf-8-sig')
DB.to_excel(fx_name)
f = open(f_name, 'a', encoding='UTF-8')
f.write(str(numbers))
f.write(str(questions))
f.write(str(bodies))
f.close()
from bs4 import BeautifulSoup
from selenium import webdriver
import time
import sys
import os
import requests
import pandas as pd
f_name = os.getcwd()
fc_name = os.getcwd()
fx_name = os.getcwd()
while True:
    try:
        site = input("사이트를 입력하세요\n")
        if site.find("naver") != -1 or site.find("네이버") != -1:
            site = "https://search.naver.com/search.naver"
            f_name += "\\py_site2\\naver.txt"
            fc_name += "\\py_site2\\naver.csv"
            fx_name += "\\py_site2\\naver.xls"
            break
        elif site.find("daum") != -1 or site.find("다음") != -1:
            site = "https://search.daum.net/search"
            f_name += "\\py_site2\\daum.txt"
            fc_name += "\\py_site2\\daum.csv"
            fx_name += "\\py_site2\\daum.xls"
            break
        elif site.find("google") != -1 or site.find("구글") != -1:
            site = "https://www.google.co.kr/search"
            f_name += "\\py_site2\\google.txt"
            fc_name += "\\py_site2\\google.csv"
            fx_name += "\\py_site2\\google.xls"
            break
        else:
            print("지원하지 않는 사이트입니다.")
    except:
        print("오류가 발생하였습니다.")
query_txt = input('크롤링할 키워드는 무엇입니까\n')
This part is almost the same as the one I made before. The only difference is that this time we're also going to produce a CSV file and an XLS file, so their file names get set up here too.
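As a side note, here is a minimal sketch of the same path setup using os.path.join instead of hand-written backslashes; the output_paths helper is my own invention, only the py_site2 folder and the per-site file names come from the code above:

import os

# Sketch only: build the three output paths from one base name.
def output_paths(site_name):
    base = os.path.join(os.getcwd(), 'py_site2', site_name)
    return base + '.txt', base + '.csv', base + '.xls'

f_name, fc_name, fx_name = output_paths('naver')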
if site == "https://search.naver.com/search.naver":
params = {'where':'kin', 'query': query_txt}
resp = requests.get(site, params = params)
full_html = resp.content
soup = BeautifulSoup(full_html, 'html.parser')
content_list = soup.select('ul.lst_total > li')
no =1
numbers = []
questions=[]
answers=[]
for i in content_list:
numbers.append(no)
print('번호: ', no)
question = i.find('div', 'question_group').get_text()
questions.append(question)
print('질문: ', question.strip())
answer = i.find('div', 'answer_group').get_text()
answers.append(answer)
print('답변: ', answer.strip())
no += 1
DB = pd.DataFrame()
DB['번호'] = numbers
DB['질문'] = questions
DB['답변'] = answers
DB.to_csv(fc_name, encoding='utf-8-sig')
DB.to_excel(fx_name)
f = open(f_name, 'a', encoding='UTF-8')
f.write(str(numbers))
f.write(str(questions))
f.write(str(answers))
f.close()
This changed quite a bit from the previous version. Thanks to the search page's URL pattern, the query goes straight into requests.get() instead of driving a browser, and the select() call is written a bit more concisely too.
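To see what that URL pattern actually produces, you can ask requests for the final URL it builds from the params dict; a small sketch, with a made-up keyword:

import requests

params = {'where': 'kin', 'query': '파이썬'}   # hypothetical keyword
resp = requests.get("https://search.naver.com/search.naver", params=params)
# requests appends the params as a query string, e.g.
# https://search.naver.com/search.naver?where=kin&query=...
print(resp.url)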
<ul class='lst_total'>
    <li>
        <div class='question_group'> 제목1 </div>
        <div class='answer_group'> 본문 1 </div>
    </li>
    <li>
        <div class='question_group'> 제목2 </div>
        <div class='answer_group'> 본문 2 </div>
    </li>
    <li>
        <div class='question_group'> 제목3 </div>
        <div class='answer_group'> 본문 3 </div>
    </li>
    <li>
        <div class='question_group'> 제목4 </div>
        <div class='answer_group'> 본문 4 </div>
    </li>
    ....
</ul>
With markup like that, content_list = soup.select('ul.lst_total > li') collects every matching <li> into a list, so all that's left is to loop over it with a for statement. Creating the CSV and XLS files after that was straightforward.
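A self-contained sketch of that select-then-loop pattern; the HTML string below is invented to match the structure sketched above, and sample.csv is just a placeholder file name:

from bs4 import BeautifulSoup
import pandas as pd

html = """
<ul class='lst_total'>
  <li><div class='question_group'>제목1</div><div class='answer_group'>본문 1</div></li>
  <li><div class='question_group'>제목2</div><div class='answer_group'>본문 2</div></li>
</ul>
"""
soup = BeautifulSoup(html, 'html.parser')

rows = []
# select() returns a list of the matching <li> tags, so a plain for loop works
for no, li in enumerate(soup.select('ul.lst_total > li'), start=1):
    question = li.find('div', 'question_group').get_text().strip()
    answer = li.find('div', 'answer_group').get_text().strip()
    rows.append({'번호': no, '질문': question, '답변': answer})

DB = pd.DataFrame(rows)
DB.to_csv('sample.csv', encoding='utf-8-sig')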
elif site == "https://search.daum.net/search":
params = {'w':'blog', 'q': query_txt}
resp = requests.get(site, params = params)
full_html = resp.content
soup = BeautifulSoup(full_html, 'html.parser')
content_list = soup.select('ul.list_info > li')
no =1
numbers = []
titles=[]
bodies=[]
for i in content_list:
numbers.append(no)
print('번호: ', no)
title = i.find('div', 'wrap_tit mg_tit').get_text()
titles.append(title)
print('제목: ', title.strip())
body = i.find('p', 'f_eb desc').get_text()
bodies.append(body)
print('본문: ', body.strip())
no += 1
DB = pd.DataFrame()
DB['번호'] = numbers
DB['제목'] = titles
DB['본문'] = bodies
DB.to_csv(fc_name, encoding='utf-8-sig')
DB.to_excel(fx_name)
f = open(f_name, 'a', encoding='UTF-8')
f.write(str(numbers))
f.write(str(questions))
f.write(str(bodies))
f.close()
The Daum branch works much the same way; apart from the parameter names and selectors, nothing really changed.
elif site == "https://www.google.co.kr/search":
params = {'q': query_txt, 'tbm':'nws'}
resp = requests.get(site, params = params)
time.sleep(1)
full_html = resp.content
soup = BeautifulSoup(full_html, 'html.parser')
content_list = soup.select('div#main div.ZINbbc')
no =1
numbers = []
titles=[]
bodies=[]
for i in content_list:
title = i.find('div', class_ ='BNeawe vvjwJb AP7Wnd')
if title == None:
continue
numbers.append(no)
print('번호: ', no)
titles.append(title.get_text())
print('제목: ', title.get_text().strip())
body = i.find('div', class_ ='BNeawe s3v9rd AP7Wnd').get_text()
bodies.append(body)
print('본문: ', body.strip())
no += 1
DB = pd.DataFrame()
DB['번호'] = numbers
DB['제목'] = titles
DB['본문'] = bodies
DB.to_csv(fc_name, encoding='utf-8-sig')
DB.to_excel(fx_name)
f = open(f_name, 'a', encoding='UTF-8')
f.write(str(numbers))
f.write(str(questions))
f.write(str(bodies))
f.close()
Google was a bit more involved. The selector soup.select('div#main div.ZINbbc') works the way we learned before: # selects by id, and a space means a descendant tag. One more thing: when a class attribute contains spaces it is really several class names, so you can drop everything after the first one and select on that alone.
The for loop was also trickier: content_list[0] had no div.BNeawe... inside it, so whenever find() returns None the item is simply skipped with continue. After that, everything is the same as before.
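A minimal sketch of both points on an invented snippet: the selector combines the id (#main) with only the first word of the multi-word class, and the block without a title is skipped with the same None check:

from bs4 import BeautifulSoup

html = """
<div id='main'>
  <div class='ZINbbc xpd'><div class='BNeawe vvjwJb AP7Wnd'>기사 제목</div></div>
  <div class='ZINbbc xpd'><span>제목 div가 없는 블록</span></div>
</div>
"""
soup = BeautifulSoup(html, 'html.parser')

# 'ZINbbc xpd' is really two classes, so selecting just .ZINbbc still matches
for block in soup.select('div#main div.ZINbbc'):
    title = block.find('div', class_='BNeawe vvjwJb AP7Wnd')
    if title is None:        # block without a title div -> skip it
        continue
    print(title.get_text().strip())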
import openpyxl
wb = openpyxl.Workbook()
sheet_1 = wb.active
sheet_2 = wb.create_sheet("매출현황")
sheet_1.title = '총매출현황'
wb.save('c:\\py_temp\\py_site\\test3.xlsx')
This is how to create a brand-new xlsx file: build a Workbook object in memory, set up its sheets, and then save it to disk.
import openpyxl
wb = openpyxl.load_workbook('c:\\py_temp\\py_site\\test3.xlsx')
sheet_1 = wb['총매출현황']
sheet_1['A1'] = '첫번째 cell'
sheet_1['A2'] = '두번째 cell'
wb.save('c:\\py_temp\\py_site\\test3.xlsx')
This one loads an existing file, writes into a couple of cells, and saves it again. I wonder whether I'll ever actually need these, though — one possible use is sketched below.
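For example, instead of overwriting the whole file with to_excel, openpyxl can append scraped rows to an existing sheet cell by cell. A sketch only: the file and sheet names are the ones from the example above, and the rows are invented sample data, not output of the scraper:

import openpyxl

wb = openpyxl.load_workbook('c:\\py_temp\\py_site\\test3.xlsx')
sheet = wb['총매출현황']

rows = [(1, '제목1', '본문 1'), (2, '제목2', '본문 2')]   # invented sample data
for row in rows:
    sheet.append(row)      # append() writes the tuple as the next row

wb.save('c:\\py_temp\\py_site\\test3.xlsx')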