
from urllib.request import urlopen

# Fetch the plain-text chapter. read() returns raw bytes, so this prints
# a bytes literal (b'...'), not decoded text — the next snippet shows the fix.
textPage = urlopen('http://www.pythonscraping.com/pages/warandpeace/chapter1.txt')
try:
    print(textPage.read())
finally:
    textPage.close()  # fix: release the HTTP connection even if read/print fails
from urllib.request import urlopen

# Fetch the Russian (Cyrillic) chapter. Decoding the raw bytes as UTF-8
# yields readable text instead of escaped byte values.
textPage = urlopen('http://www.pythonscraping.com/pages/warandpeace/chapter1-ru.txt')
try:
    print(str(textPage.read(), 'utf-8'))
finally:
    textPage.close()  # fix: close the HTTP response (was leaked)
BeautifulSoup에서 utf-8 인코딩을 사용하고 싶을 때는 명시적으로 지정해 주어야 함 (아래처럼 추출한 텍스트를 utf-8로 인코딩/디코딩)
# Parse the page and extract the article body text.
# NOTE(review): `BeautifulSoup` and `html` are not defined in this snippet —
# presumably `from bs4 import BeautifulSoup` and `html = urlopen(...)` ran
# earlier; verify before running this fragment on its own.
bsObj = BeautifulSoup(html, 'html.parser')
# 'mw-content-text' looks like a MediaWiki/Wikipedia content div — TODO confirm.
content = bsObj.find('div', {'id': 'mw-content-text'}).get_text()
# Round-trip through UTF-8 bytes: encode the extracted text, then decode it
# back. The string is unchanged; this demonstrates explicit encoding control.
content = bytes(content, 'utf-8')
content = content.decode('utf-8')
from urllib.request import urlopen
from io import StringIO
import csv

# Download the CSV and decode it to text, silently dropping non-ASCII bytes.
data = urlopen('http://pythonscraping.com/files/MontyPythonAlbums.csv').read().decode(
    'ascii', 'ignore'
)
# Wrap the string in StringIO: csv.reader needs a line-oriented file-like
# object; iterating a plain str would yield one character at a time.
dataFile = StringIO(data)
csvReader = csv.reader(dataFile)
for row in csvReader:
    print(row)  # fix: loop body must be indented under the for statement
csv.reader는 첫 행(헤더)도 일반 데이터 행과 똑같이 읽는다. csv.DictReader를 사용하면 첫 행을 필드 이름으로 분리하여, 이후 각 행을 필드 이름을 키로 하는 딕셔너리 형태로 읽을 수 있다.
from urllib.request import urlopen
from io import StringIO
from csv import DictReader

# Download the CSV and decode it to text, silently dropping non-ASCII bytes.
data = urlopen('http://pythonscraping.com/files/MontyPythonAlbums.csv').read().decode(
    'ascii', 'ignore'
)
dataFile = StringIO(data)
# fix: the name imported above is DictReader, not csv.DictReader —
# `csv` itself is never imported here, so `csv.DictReader` raises NameError.
dictReader = DictReader(dataFile)
print(dictReader.fieldnames)  # DictReader consumes the header row as field names
for row in dictReader:
    print(row)  # fix: loop body must be indented under the for statement
from urllib.request import urlopen
from pdfminer.pdfinterp import PDFResourceManager, process_pdf
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from io import StringIO
from io import open
def readPDF(pdfFile):
    """Extract and return the text content of *pdfFile* as a str.

    *pdfFile* is a binary file-like object (e.g. an open file or an HTTP
    response). Uses pdfminer's PDFResourceManager + TextConverter pipeline,
    writing the extracted text into an in-memory StringIO buffer.
    """
    # fix: the original body was not indented under the def (SyntaxError).
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()  # in-memory sink the converter writes text into
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, laparams=laparams)
    try:
        process_pdf(rsrcmgr, device, pdfFile)
    finally:
        device.close()  # fix: release the converter even if parsing raises
    content = retstr.getvalue()
    retstr.close()
    return content
# Fetch the PDF over HTTP, run it through the extractor, and print the text.
pdf_response = urlopen('http://pythonscraping.com/pages/warandpeace/chapter1.pdf')
extracted_text = readPDF(pdf_response)
print(extracted_text)
pdf_response.close()