SQL 활용
import pandas as pd # Load the provided dataset data = pd.read_csv('./train.csv') # Display the first few rows of the dataset to understand its structure data.head()
# Calculate descriptive statistics for the dataset descriptive_stats = data.describe() descriptive_stats
correlations = data.corr()['critical_temp'].sort_values(ascending=False) correlations_top_bottom = pd.concat([correlations.head(11), correlations.tail(10)]) correlations_top_bottom
-> 파이썬 활용 데이터 파악!
SQL로 데이터 추출 및 파악하기
import sqlite3
# Create an in-memory SQLite database
conn = sqlite3.connect(':memory:')
data.to_sql('superconductors', conn, index=False)
# Test the database by selecting the first few rows
pd.read_sql('SELECT * FROM superconductors LIMIT 5', conn)
# Calculate descriptive statistics for 'critical_temp' using SQL
critical_temp_stats = pd.read_sql("""
SELECT
AVG(critical_temp) as Mean,
MIN(critical_temp) as Min,
MAX(critical_temp) as Max,
(MAX(critical_temp) + MIN(critical_temp)) / 2 as Median
FROM
superconductors
""", conn)
critical_temp_stats
# Check for missing values in the dataset using SQL
missing_values_check = pd.read_sql("""
SELECT
COUNT(*) as TotalRows,
SUM(CASE WHEN critical_temp IS NULL THEN 1 ELSE 0 END) as Missing_critical_temp
FROM
superconductors
""", conn)
missing_values_check
간단한 SQL 사용으로 데이터를 확인!