XML, HTML, XPATH

김지윤·2023년 4월 11일
0

์›น ํฌ๋กค๋ง

๋ชฉ๋ก ๋ณด๊ธฐ
1/4


๐ŸŒžํฌ๋กค๋ง์ด๋ž€ ?
์›นํŽ˜์ด์ง€๋ฅผ ๊ทธ๋Œ€๋กœ ๊ฐ€์ ธ์™€ ๋ฐ์ดํ„ฐ๋ฅผ ์ถ”์ถœํ•ด ๋‚ด๋Š” ํ–‰์œ„ !

.
.
.
.
.
๋จผ์ €, lxml์ด ๊น”๋ ค์žˆ์ง€ ์•Š๋‹ค๋ฉด ๊น”์•„์ค€๋‹ค.

!pip install lxml

.
.
.

🛻 실제 웹사이트를 크롤링하기 전에 연습해보자.

from lxml import etree
# Small hand-written XML document used to practice XPath queries before
# crawling a real page. It has a root <AAA> holding three <BBB> elements
# (id="a", id="b", and one without attributes) and one <CCC id="a">.
sample_xml = '''
<AAA>
    <BBB id = "a">
        <XXX />
        <yyy></yyy>
    </BBB>
    <CCC id = "a">
        <yyy>mystring</yyy>
    </CCC>
    <BBB id = "b">
        <zzz></zzz>
    </BBB>
    <BBB></BBB>
</AAA>
'''

# Parse the string into an element tree; the XPath examples below query `et`.
et = etree.fromstring(sample_xml)
  • 태그 찾기
    '/', '//', '///'를 적절히 활용하여 해당 태그를 찾는다.
# XPath practice against the parsed sample document. Each statement rebinds
# `r` to the result list of the most recent et.xpath() call.
r = et.xpath('/AAA')  # the AAA element at the document root

r = et.xpath('/AAA/BBB')  # every BBB that is a direct child of the root AAA

r = et.xpath('//yyy')  # every yyy element, regardless of where it sits

r = et.xpath('//BBB/XXX')  # XXX elements that are direct children of a BBB

r = et.xpath('//*/yyy')  # yyy elements that are a direct child of any element

r = et.xpath('//BBB[@*]')  # every BBB that has at least one attribute

r = et.xpath('//BBB[not(@*)]')  # every BBB that has no attributes

print(r)
# Serialize every match instead of hard-coding r[0] and r[1]: the query above
# matches a single element in sample_xml, so indexing r[1] raised IndexError.
for node in r:
    print(etree.tostring(node))
r = et.xpath('//@id')  # every id attribute in the document
r = et.xpath('//BBB[@id]')  # BBB elements that carry an id attribute
r = et.xpath('//BBB[@id="a"]')  # BBB elements whose id equals "a"
# The closing double quote after `a` was missing, which made the expression
# invalid XPath.
r = et.xpath('//*[@id="a"]')  # any element whose id equals "a"

.
.
.
.

🛻 "실제 웹사이트로 크롤링"

아래의 네이버 '별자리 운세'를 크롤링해보겠다.

import requests

# Download the Naver search-result page for the zodiac horoscope query.
p = requests.get('https://search.naver.com/search.naver?sm=tab_hty.top&where=nexearch&query=%EB%B3%84%EC%9E%90%EB%A6%AC+%EC%9A%B4%EC%84%B8&oquery=%EB%B3%84%EC%9E%90%EB%A6%AC+%E3%85%9C%E3%85%87%E3%84%B4%EC%84%B8&tqi=ivRbIdprvTossOsM4D8ssssstvC-143699')
# Parse the response body with lxml's HTML parser.
p2 = etree.fromstring(p.text, parser=etree.HTMLParser())
# Select <p> children of <li class="1st_r">. The closing double quote of the
# class value was missing, which made the expression invalid XPath.
# NOTE(review): the class name is kept exactly as written ("1st_r");
# confirm it against the live page markup.
r = p2.xpath('//li[@class="1st_r"]/p')

# Print each matched paragraph's text together with its position.
for i, v in enumerate(r):
    print(i, v.text)
  • 결과

.
.
.
.
.

🛻 "네이버 야구 팀순위 & 승률" 추출

먼저 페이지를 불러온 후,

from lxml import etree
import requests

# Fetch the Naver Sports KBO record page for the 2022 season.
page = requests.get(
    'https://sports.news.naver.com/kbaseball/record/index?category=kbo&year=2022'
)

# Parse the downloaded HTML; the XPath queries that follow run against `p2`.
p2 = etree.fromstring(
    page.text,
    parser=etree.HTMLParser(),
)

r1 변수에 순위순 팀 이름을 담은 후, team list 에 옮겨담는다.

# <span> elements that carry an id attribute and sit directly inside a <div>;
# per the accompanying text these hold the team names in ranking order.
r1 = p2.xpath('//div/span[@id]')

# Collect the text of every matched element.
team_list = [span.text for span in r1]
print(team_list)

# Output:
# ['SSG', '키움', 'LG', 'KT', 'KIA', 'NC', '삼성', '롯데', '두산', '한화']

r2 ๋ณ€์ˆ˜์—๋Š” ํŒ€ ๋ณ„ ์Šน๋ฅ ์„ ๋‹ด์€ ํ›„, prob_list์— ์˜ฎ๊ฒจ๋‹ด๋Š”๋‹ค.

# <strong> elements directly inside a <td>; per the accompanying text these
# hold each team's win rate.
r2 = p2.xpath('//td/strong')

# Collect the text of every matched element (values stay strings here).
prob_list = [cell.text for cell in r2]
print(prob_list)

# Output:
# ['0.629', '0.563', '0.613', '0.563', '0.490', '0.475', '0.465', '0.457', '0.423', '0.324']

โŒจ๏ธ dictionary { } ๋กœ ํ•ฉ์น˜๊ธฐ

# Map each team name to its win-rate string, pairing the two lists by position.
final = {team: rate for team, rate in zip(team_list, prob_list)}
print(final)
  • 결과

โŒจ๏ธ list ( ) ๋กœ ํ•ฉ์น˜๊ธฐ

# Build a list of (team, win_rate) tuples by pairing the two lists by position.
final = list(zip(team_list, prob_list))
print(final)
  • 결과

โŒจ๏ธ tuple [ ] ๋กœ ํ•ฉ์น˜๊ธฐ

# Build a list of [team, win_rate] pairs, converting each win-rate string
# to a float along the way.
final = [[name, float(rate)] for name, rate in zip(team_list, prob_list)]
print(final)
  • 결과
profile
๋ฐ์ดํ„ฐ ๋ถ„์„ / ๋ฐ์ดํ„ฐ ์‚ฌ์ด์–ธํ‹ฐ์ŠคํŠธ / AI ๋”ฅ๋Ÿฌ๋‹

0개의 댓글

관련 채용 정보