[Python] 데이터 시각화

yeji·2024년 10월 24일

python

Python

목록 보기

11/36

Matplotlib

파이썬에서 시각화를 위한 라이브러리 중 하나

그래프 도구

Color(색상)
문자열로 지정할 수 있으며 'blue', 'green', 'red', 'cyan', 'magenta', 'yellow', 'black', 'white'와 같은 기본 색상 이름 또는 RGB 값을 직접 지정할 수 있음
Linestyle(선 스타일)
'-'(실선), '--'(대시선), ':'(점선), '-.'(점-대시선) 등으로 지정할 수 있음
Marker(마커) : 데이터 포인트를 나타내는 기호
'o'(원), '^'(삼각형), 's'(사각형), '+'(플러스), 'x'(엑스) 등 다양한 기호로 지정

# Styling for the 'B'-over-'A' line: green, dashed, with circle markers.
line_style = {'color': 'green', 'linestyle': '--', 'marker': 'o'}
ax = df.plot(x='A', y='B', **line_style)

# Display the figure.
plt.show()

범례 추가 : legend() 메서드 사용

# Option 1: name the series through the `label` argument at plot time.
series_style = dict(color='green', linestyle='--', marker='o')
ax = df.plot(x='A', y='B', label='Data Series', **series_style)

# Option 2: hand the legend entries to legend() on the returned axes.
legend_entries = ['Data Series']
ax.legend(legend_entries)

축제목 입력 : set_xlabel(), set_ylabel(), set_title() 메서드를 사용

# Give the plot a title and label both axes through the Axes setters.
ax.set_title('Title of the Plot')
ax.set_xlabel('X-axis Label')
ax.set_ylabel('Y-axis Label')

텍스트 추가 : text() 메서드를 사용해 특정 위치에 텍스트 추가 가능

# Place the string 'Some Text' at position (3, 3) with a 12-point font.
ax.text(x=3, y=3, s='Some Text', fontsize=12)

그래프 사이즈 조절 : plt.figure() 함수를 사용하여 figure 객체를 생성하고 이후에 figsize 매개변수를 이용해 크기 조절

import matplotlib.pyplot as plt

# Create the Figure with an explicit size, given as (width, height) in inches.
figure_size = (8, 6)  # 8 inches wide, 6 inches tall
plt.figure(figsize=figure_size)

그래프 종류

line : 연속적인 데이터 시각화

# Build a small table with a date column and a value column.
df = pd.DataFrame({
    '날짜': ['2023-01-01', '2023-01-02', '2023-01-03'],
    '값': [10, 15, 8],
})

# Convert the '날짜' column from strings to datetime values.
df['날짜'] = pd.to_datetime(df['날짜'])

# Draw the line chart: dates on the x-axis, values on the y-axis.
# NOTE(review): the Korean labels presumably need a Hangul-capable font
# configured in matplotlib to render correctly - confirm on the target machine.
dates, values = df['날짜'], df['값']
plt.plot(dates, values)
plt.title('선 그래프 예시')
plt.xlabel('날짜')
plt.ylabel('값')
plt.show()

bar : 범주형 데이터 간의 비교를 나타낼 때

# Build a table holding one population figure per city.
city_names = ['서울', '부산', '대구', '인천']
populations = [990, 350, 250, 290]
df = pd.DataFrame({'도시': city_names, '인구': populations})

# Draw one bar per city whose height is the population value.
plt.bar(df['도시'], df['인구'])
plt.title('막대 그래프 예시')
plt.xlabel('도시')
plt.ylabel('인구')
plt.show()

histogram : 데이터 빈도를 시각화하여 연속적인 데이터의 분포를 이해하는 데 사용

import matplotlib.pyplot as plt
import numpy as np

# Generate random sample data with np.random.randn.
sample_count = 1000
data = np.random.randn(sample_count)

# Plot the frequency of the samples using 30 bins.
plt.hist(data, bins=30)
plt.title('Histogram')
plt.xlabel('Value')
plt.ylabel('Frequency')
plt.show()

pie : 카테고리별 비율을 비교할 때 사용

import matplotlib.pyplot as plt

# Category labels and the size of each category's slice.
labels = ['A', 'B', 'C', 'D', 'E']
sizes = [30, 20, 25, 15, 10]

# Draw the pie chart; autopct prints each slice's share with one decimal place.
plt.pie(x=sizes, labels=labels, autopct='%1.1f%%')
plt.title('Pie Chart')
plt.show()

box plot : 연속형 데이터의 분포와 이상치 시각화, 통계적 특성 파악

import matplotlib.pyplot as plt
import numpy as np

# Generate three samples of 100 normal values each (mean 0; std 1, 2 and 3).
# The seed is fixed so the same samples are drawn on every run.
np.random.seed(10)
data = []
for std in (1, 2, 3):
    data.append(np.random.normal(loc=0, scale=std, size=100))

# Draw one box per sample.
plt.boxplot(data)
plt.title('Box Plot')
plt.xlabel('Data')
plt.ylabel('Value')
plt.show()

상자(Box) : 데이터의 중앙값과 사분위수(25%와 75%), 상자의 아래쪽 끝은 25%의 값(1사분위수), 상자의 위쪽 끝은 75%의 값(3사분위수), 상자의 중앙에 위치한 선은 중앙값
수염(Whisker) : 상자의 위아래로 연장되는 선, 일반적으로 상자 끝에서 사분위 범위(IQR)의 1.5배까지로 계산되는데 이 범위를 넘어가는 값은 이상치(outlier)로 간주, 수염의 끝은 이 범위 안에 있는 최솟값과 최댓값
이상치(Outliers) : 수염 부분을 벗어나는 개별 데이터 포인트, 일반적인 범위를 벗어나는 값, 독립적으로 표시

scatter(산점도) : 두 변수 간의 상관관계

import matplotlib.pyplot as plt

# Sample coordinates: x runs from 1 to 5, y holds the paired values.
x = list(range(1, 6))
y = [2, 3, 5, 7, 11]

# Draw one marker per (x, y) pair.
plt.scatter(x, y)
plt.title('Scatter Plot')
plt.xlabel('X-axis')
plt.ylabel('Y-axis')
plt.show()

실습 문제

# 실습 데이터 셋
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random

# Number of synthetic purchase records to generate.
num_samples = 1000

# Fix the NumPy seed so the generated data set is reproducible.
np.random.seed(42)

# Generate the raw columns. The values drawn depend on the order of the
# np.random calls, so the order below must stay as it is.
user_ids = np.arange(1, num_samples + 1)
purchase_dates = [datetime(2023, 1, 1) + timedelta(days=np.random.randint(0, 60)) for _ in range(num_samples)]
product_ids = np.random.randint(100, 200, size=num_samples)
categories = np.random.choice(['Electronics', 'Books', 'Clothing', 'Home', 'Toys'], size=num_samples)
prices = np.round(np.random.uniform(5, 300, size=num_samples), 2)
quantities = np.random.randint(1, 6, size=num_samples)
total_spent = prices * quantities
ages = np.random.randint(18, 65, size=num_samples)
genders = np.random.choice(['M', 'F'], size=num_samples)
locations = np.random.choice(['New York', 'Los Angeles', 'Chicago', 'San Francisco', 'Houston', 'Dallas', 'Seattle', 'Austin', 'Miami', 'Boston'], size=num_samples)
membership_levels = np.random.choice(['Bronze', 'Silver', 'Gold', 'Platinum'], size=num_samples)
ad_spends = np.round(np.random.uniform(5, 50, size=num_samples), 2)
visit_durations = np.random.randint(10, 120, size=num_samples)

# Map each column name to its generated values.
data = {
    'user_id': user_ids,
    'purchase_date': purchase_dates,
    'product_id': product_ids,
    'category': categories,
    'price': prices,
    'quantity': quantities,
    'total_spent': total_spent,
    'age': ages,
    'gender': genders,
    'location': locations,
    'membership_level': membership_levels,
    'ad_spend': ad_spends,
    'visit_duration': visit_durations,
}

# Assemble the DataFrame.
df = pd.DataFrame(data)

# Inject missing values: 50 random rows lose 'price', and the first 25 of
# those rows also lose 'quantity'.
nan_indices = np.random.choice(df.index, size=50, replace=False)
df.loc[nan_indices, 'price'] = np.nan
# 'quantity' is generated as integers, which cannot hold NaN. Cast it to float
# explicitly instead of relying on an implicit upcast during assignment
# (deprecated by recent pandas releases); the stored values are unchanged.
df['quantity'] = df['quantity'].astype(float)
df.loc[nan_indices[:25], 'quantity'] = np.nan

# Inject duplicates: append copies of 20 randomly chosen rows.
duplicate_indices = np.random.choice(df.index, size=20, replace=False)
duplicates = df.loc[duplicate_indices]
df = pd.concat([df, duplicates], ignore_index=True)

# Inject outliers: multiply 'price' and 'total_spent' by 10 on 10 random rows.
outlier_indices = np.random.choice(df.index, size=10, replace=False)
df.loc[outlier_indices, 'price'] = df.loc[outlier_indices, 'price'] * 10
df.loc[outlier_indices, 'total_spent'] = df.loc[outlier_indices, 'total_spent'] * 10

# Save the result as a CSV file in the current directory (no index column).
df.to_csv('./user_purchase_data.csv', index=False)

Q1. user_purchase_data.csv 파일에는 결측치가 포함되어 있습니다.
모든 결측치를 확인하고, 결측치가 있는 행을 제거하세요.

# Load the purchase data from the CSV file.
data=pd.read_csv('/Users/t2023-m0092/Desktop/J/Python/user_purchase_data.csv')

# Count the missing values in each column.
data.isnull().sum()

# Drop every row that contains at least one missing value. The explicit copy
# makes data_cleaned an independent DataFrame, so assigning columns on it
# afterwards does not raise pandas' SettingWithCopyWarning.
data_cleaned = data.dropna().copy()

# Confirm that no missing values remain (every count should be 0).
data_cleaned.isnull().sum()

Q2. purchase_date 컬럼의 데이터 타입을 문자열에서 datetime으로 변환하고, total_spent 컬럼의 데이터 타입을 정수로 변환하세요.

# Compute the converted columns: 'purchase_date' parsed to datetime and
# 'total_spent' cast to int (the cast discards the fractional part).
purchase_dates_parsed = pd.to_datetime(data_cleaned['purchase_date'])
total_spent_as_int = data_cleaned['total_spent'].astype(int)

# Write the converted columns back.
data_cleaned['purchase_date'] = purchase_dates_parsed
data_cleaned['total_spent'] = total_spent_as_int

# Inspect the resulting column dtypes.
data_cleaned.info()

Q3. 중복된 구매 데이터를 확인하고 제거하세요.
중복의 기준은 user_id, purchase_date, product_id가 동일한 행으로 합니다.

# Rows count as duplicates when these three columns all match.
duplicate_keys = ['user_id', 'purchase_date', 'product_id']

# Flag the duplicated rows.
data_cleaned.duplicated(subset=duplicate_keys)

# Remove the duplicated rows.
data_no_duplicated = data_cleaned.drop_duplicates(subset=duplicate_keys)

# Check the flags again on the de-duplicated data.
data_no_duplicated.duplicated(subset=duplicate_keys)

Q4. price 컬럼에 이상치가 존재합니다.
IQR (Interquartile Range) 방법을 사용하여 이상치를 찾아 제거하세요.

# Quartiles and interquartile range (IQR) of 'price'.
q3 = data_no_duplicated['price'].quantile(0.75)
q1 = data_no_duplicated['price'].quantile(0.25)
iqr = q3 - q1

# Prices further than 1.5 * IQR below Q1 or above Q3 are treated as outliers.
boundary = 1.5*iqr
lower_bound = q1 - boundary
upper_bound = q3 + boundary

# Keep only the rows whose price lies within [lower_bound, upper_bound].
# The explicit copy makes data_no_outliers an independent DataFrame, so adding
# columns to it afterwards does not raise pandas' SettingWithCopyWarning.
price = data_no_duplicated['price']
data_no_outliers = data_no_duplicated[(price >= lower_bound) & (price <= upper_bound)].copy()

Q5. total_spent 컬럼을 Min-Max 정규화를 사용하여 0과 1 사이의 값으로 변환하세요.

# Min-Max scaling of 'total_spent' with scikit-learn's MinMaxScaler.
from sklearn.preprocessing import MinMaxScaler

minmax_scaler = MinMaxScaler()
# The double brackets select 'total_spent' as a one-column DataFrame (2-D).
scaled_total_spent = minmax_scaler.fit_transform(data_no_outliers[['total_spent']])
data_no_outliers['total_spent_normalized'] = scaled_total_spent

# Show the original and scaled values side by side for the first rows.
data_no_outliers[['total_spent', 'total_spent_normalized']].head()

Q6. price 컬럼에 대해 제품 가격의 분포를 Box Plot으로 시각화하세요. 카테고리별로 그룹화하여 시각화하세요.

# 라이브러리 불러오기
import matplotlib.pyplot as plt
import seaborn as sns

# Box plot of 'price' with one box per 'category'.
sns.boxplot(data=data, x='category', y='price')

# Display the figure.
plt.show()

Q7. age와 total_spent 컬럼을 이용하여 사용자 나이와 총 지출 금액 간의 관계를 Scatter Plot으로 시각화하세요.

# Scatter plot of 'total_spent' against 'age'.
sns.scatterplot(data=data, x='age', y='total_spent')
plt.xlabel('Age')
plt.ylabel('Total Spent')
plt.title('Age vs Total Spent')

# Display the figure.
plt.show()

Q8. 모든 수치형 데이터 (price, quantity, total_spent, age, ad_spend, visit_duration) 간의 상관관계를 분석하고, heatmap을 사용하여 시각화하세요.

# Select the numeric columns and compute their pairwise correlations.
numeric_columns = ['price', 'quantity', 'total_spent', 'age', 'ad_spend', 'visit_duration']
correlation_matrix = data[numeric_columns].corr()

# Draw the correlation matrix as a heatmap, annotating each cell with its value.
sns.heatmap(correlation_matrix, annot=True)
plt.title('Correlation Matrix Heatmap')

# Display the figure.
plt.show()

Q9. age 컬럼에 대한 히스토그램을 작성하여 사용자 나이 분포를 시각화하세요.

# Histogram of the 'age' column using 20 bins.
age_values = data['age']
plt.hist(age_values, bins=20)
plt.xlabel('Age')
plt.ylabel('Frequency')
plt.title('Age Histogram')

# Display the figure.
plt.show()

Q10. membership_level 컬럼을 사용하여 각 회원 등급별 총 지출 금액을 바 차트로 시각화하세요.

# Sum 'total_spent' per membership level, then turn the grouped result back
# into a regular two-column DataFrame.
spent_by_level = data.groupby('membership_level')['total_spent'].sum()
membership_spent = spent_by_level.reset_index()

# Draw one bar per membership level.
levels = membership_spent['membership_level']
totals = membership_spent['total_spent']
plt.bar(levels, totals)
plt.title('Bar')
plt.xlabel('Membership Level')
plt.ylabel('Membership Total Spent')

# Display the figure.
plt.show()

yeji

👋🏻

이전 포스트

[세션] Python 테이블 결합 및 피벗

다음 포스트