(가장 표준적인, “파싱된 데이터” 형태)
import pandas as pd

# Most standard "already parsed" shape: one dict per record. pandas uses the
# dict keys as the column names.
data = [
    {'제목': '뉴스1', '링크': 'url1', '날짜': '2024-01-01'},
    {'제목': '뉴스2', '링크': 'url2', '날짜': '2024-01-02'},
    # ... further records go here, one dict per row. (A bare `...` inside the
    # list would be a real Ellipsis element, not a placeholder.)
]

df = pd.DataFrame(data)
# Note on the list-of-dicts form: when data parsed with requests,
# BeautifulSoup, selenium, an API, etc. has been collected into dicts,
# that is the easiest way to create a DataFrame.

# Row-oriented form: each inner list is one record, so the column names are
# supplied explicitly through `columns=`.
data = [
    ['뉴스1', 'url1', '2024-01-01'],
    ['뉴스2', 'url2', '2024-01-02'],
    # ... further rows go here, one list per record. (A bare `...` inside the
    # list would be a real Ellipsis element, not a placeholder.)
]
df = pd.DataFrame(data, columns=['제목', '링크', '날짜'])
(ex. 태그, 텍스트 덩어리, 라인별 문장 등)
# Raw text input: one record per string, fields separated by ", ".
lines = [
    "뉴스1, url1, 2024-01-01",
    "뉴스2, url2, 2024-01-02",
]

# Split every record into its fields so that each string becomes one row.
data = [record.split(', ') for record in lines]

# The rows carry no field names, so the column labels are given explicitly.
df = pd.DataFrame(
    data,
    columns=['제목', '링크', '날짜'],
)
from bs4 import BeautifulSoup
import requests
import pandas as pd
# Download the Hacker News front page. The timeout keeps a stalled connection
# from blocking forever, and raise_for_status() stops an HTTP error page from
# being parsed as if it were the story listing.
response = requests.get('https://news.ycombinator.com/', timeout=10)
response.raise_for_status()
html = response.text
soup = BeautifulSoup(html, 'html.parser')

# Take only anchors that are direct children of .titleline. The descendant
# selector '.titleline a' also matches any link nested deeper inside the
# element (NOTE(review): on the live page this appeared to include the
# site-name link next to each title -- confirm against current markup).
# Selecting once also guarantees titles and links come from the same tags.
story_anchors = soup.select('.titleline > a')
titles = [tag.text for tag in story_anchors]
links = [tag['href'] for tag in story_anchors]
df = pd.DataFrame({'제목': titles, '링크': links})
import requests
import pandas as pd
# Fetch the JSON payload. The timeout bounds how long a stalled connection can
# block, and raise_for_status() turns an HTTP error response into an exception
# instead of letting its body be decoded and loaded as data.
r = requests.get('https://jsonplaceholder.typicode.com/posts', timeout=10)
r.raise_for_status()
data = r.json()  # decoded JSON body; expected to be a list of dicts here
df = pd.DataFrame(data)
columns=[...] 옵션 사용
import requests
from bs4 import BeautifulSoup
import pandas as pd
url = 'https://news.ycombinator.com/'

# Download the page. The timeout keeps a stalled connection from blocking
# forever, and raise_for_status() stops an HTTP error page from being parsed
# as if it were the story listing.
response = requests.get(url, timeout=10)
response.raise_for_status()
html = response.text
soup = BeautifulSoup(html, 'html.parser')

# Extract the data. Only anchors that are direct children of .titleline are
# taken; the descendant selector '.titleline a' also matches any link nested
# deeper inside the element (NOTE(review): on the live page this appeared to
# include the site-name link next to each title -- confirm against current
# markup). Selecting once keeps titles and links aligned to the same tags.
story_anchors = soup.select('.titleline > a')
titles = [tag.text for tag in story_anchors]
links = [tag['href'] for tag in story_anchors]

# Convert to a DataFrame and show the first rows.
df = pd.DataFrame({'제목': titles, '링크': links})
print(df.head())
pd.DataFrame(data) 바로 가능