📌크롤링이란?
웹페이지를 그대로 가져와 데이터를 추출해 내는 행위!
.
.
.
.
.
๋จผ์ , lxml์ด ๊น๋ ค์์ง ์๋ค๋ฉด ๊น์์ค๋ค.
!pip install lxml
.
.
.
from lxml import etree
# Small XML document used to demonstrate XPath queries with lxml.
# It contains three BBB elements (two with an id attribute, one with no
# attributes at all) and one CCC element that also carries id="a".
sample_xml = '''
<AAA>
<BBB id = "a">
<XXX />
<yyy></yyy>
</BBB>
<CCC id = "a">
<yyy>mystring</yyy>
</CCC>
<BBB id = "b">
<zzz></zzz>
</BBB>
<BBB></BBB>
</AAA>
'''
et = etree.fromstring(sample_xml)

# Each query below rebinds r, so only the most recent result is kept.
r = et.xpath('/AAA')  # the AAA element directly under the document root
r = et.xpath('/AAA/BBB')  # every BBB that is a direct child of the root AAA
r = et.xpath('//yyy')  # every yyy element, wherever it appears in the tree
r = et.xpath('//BBB/XXX')  # every XXX that is a direct child of a BBB
r = et.xpath('//*/yyy')  # every yyy that is a direct child of any element
r = et.xpath('//BBB[@*]')  # every BBB that has at least one attribute
r = et.xpath('//BBB[not(@*)]')  # every BBB that has no attributes
print(r)
# Serialize every match. The number of matches depends on the query (this one
# matches a single element), so iterate instead of indexing fixed positions.
for node in r:
    print(etree.tostring(node))

r = et.xpath('//@id')  # every id attribute value in the document
r = et.xpath('//BBB[@id]')  # every BBB that has an id attribute
r = et.xpath('//BBB[@id="a"]')  # every BBB whose id equals "a"
# The string literal inside the predicate must be closed with a matching
# double quote; otherwise the XPath expression is invalid.
r = et.xpath('//*[@id="a"]')  # every element, of any tag, whose id equals "a"
.
.
.
.
아래에 네이버 '별자리 운세'를 크롤링해보겠다.
import requests
# Fetch the Naver search result page for the zodiac-horoscope query and parse
# the response body with lxml's HTML parser.
p = requests.get('https://search.naver.com/search.naver?sm=tab_hty.top&where=nexearch&query=%EB%B3%84%EC%9E%90%EB%A6%AC+%EC%9A%B4%EC%84%B8&oquery=%EB%B3%84%EC%9E%90%EB%A6%AC+%E3%85%9C%E3%85%87%E3%84%B4%EC%84%B8&tqi=ivRbIdprvTossOsM4D8ssssstvC-143699')
p2 = etree.fromstring(p.text, parser=etree.HTMLParser())

# Select the <p> children of every <li> whose class is "1st_r". The attribute
# value must be closed with a matching double quote, otherwise the XPath
# expression is invalid.
r = p2.xpath('//li[@class="1st_r"]/p')

# Print each matched paragraph's text together with its position.
for i, v in enumerate(r):
    print(i, v.text)
.
.
.
.
.
먼저 페이지를 불러온 후,
from lxml import etree
import requests
# Download the 2022 KBO record page from Naver Sports, then build an element
# tree from the response body using lxml's HTML parser.
kbo_record_url = 'https://sports.news.naver.com/kbaseball/record/index?category=kbo&year=2022'
page = requests.get(kbo_record_url)
p2 = etree.fromstring(page.text, parser=etree.HTMLParser())
r1 ๋ณ์์ ์์์ ํ ์ด๋ฆ์ ๋ด์ ํ, team list ์ ์ฎ๊ฒจ๋ด๋๋ค.
# Select every <span> that has an id attribute and is a direct child of a
# <div>, then collect the text of each match as the list of team names.
r1 = p2.xpath('//div/span[@id]')
team_list = [span.text for span in r1]
print(team_list)
# print :
# ['SSG', '키움', 'LG', 'KT', 'KIA', 'NC', '삼성', '롯데', '두산', '한화']
r2 ๋ณ์์๋ ํ ๋ณ ์น๋ฅ ์ ๋ด์ ํ, prob_list์ ์ฎ๊ฒจ๋ด๋๋ค.
# Select every <strong> that is a direct child of a <td>, then collect the
# text of each match as the list of win-rate strings.
r2 = p2.xpath('//td/strong')
prob_list = [cell.text for cell in r2]
print(prob_list)
# print :
# ['0.629', '0.563', '0.613', '0.563', '0.490', '0.475', '0.465', '0.457', '0.423', '0.324']
# Three alternative ways of pairing each team with its win rate; every
# variant rebinds final and prints it.

# Variant 1: dict mapping team name -> win-rate string.
final = dict(zip(team_list, prob_list))
print(final)

# Variant 2: list of (team name, win-rate string) tuples.
final = list(zip(team_list, prob_list))
print(final)

# Variant 3: list of [team name, win rate] pairs, with the rate converted
# from its string form to a float.
final = [[name, float(rate)] for name, rate in zip(team_list, prob_list)]
print(final)