"""
step3_analytics.py — 장기 시계열 인프라 데이터 시각화 및 20대 마스터 차트 드로잉 엔진
"""
import os
import pandas as pd
import numpy as np
from pathlib import Path
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import seaborn as sns
# --- Filesystem layout -------------------------------------------------------
# All paths are relative to the current working directory.
BASE_DATA_DIR = Path("./data")
MERGED_DIR = BASE_DATA_DIR / "merged"  # Parquet inputs read by main()
PLOT_DIR = BASE_DATA_DIR / "output" / "plots"  # PNG outputs written by main()
# Created at import time so every savefig() call has a destination directory.
PLOT_DIR.mkdir(parents=True, exist_ok=True)

# --- Global plot styling -----------------------------------------------------
# Overrides are applied after set_theme() so the theme does not overwrite them.
sns.set_theme(style="whitegrid")
plt.rcParams['font.family'] = 'sans-serif'
plt.rcParams['axes.unicode_minus'] = False

# Two chart titles in main() are written in Hangul. Matplotlib's bundled
# default sans-serif font has no Hangul glyphs, so those titles would render as
# empty boxes. Prefer any Hangul-capable font that is actually installed; when
# none is found the sans-serif list is left exactly as the theme configured it.
from matplotlib import font_manager  # noqa: E402  (kept local to this config section)

_HANGUL_FONT_CANDIDATES = (
    "NanumGothic",
    "NanumBarunGothic",
    "Malgun Gothic",
    "AppleGothic",
    "Apple SD Gothic Neo",
    "Noto Sans KR",
    "Noto Sans CJK KR",
    # NOTE(review): pan-CJK collections may be registered under their first
    # face name only; this entry is a best-effort match for that case.
    "Noto Sans CJK JP",
)
_installed_font_names = {entry.name for entry in font_manager.fontManager.ttflist}
_hangul_fonts = [name for name in _HANGUL_FONT_CANDIDATES if name in _installed_font_names]
if _hangul_fonts:
    plt.rcParams['font.sans-serif'] = _hangul_fonts + list(plt.rcParams['font.sans-serif'])
def check_empty_data(df, chart_name):
    """Tell the caller whether ``df`` has nothing to plot.

    Args:
        df: Object exposing a pandas-style ``.empty`` attribute.
        chart_name: Chart identifier interpolated into the warning message.

    Returns:
        ``True`` (after printing a warning) when ``df.empty`` is truthy,
        otherwise ``False`` with no output.
    """
    has_rows = not df.empty
    if has_rows:
        return False
    print(f" ⚠️ [데이터 공백] {chart_name}을(를) 그릴 로우가 없어 빈 차트로 대체합니다.")
    return True
def _save_chart(filename):
    """Finalize the current matplotlib figure and write it into PLOT_DIR.

    Applies ``tight_layout``, saves the current figure as ``filename`` at
    100 dpi, closes it, and prints a completion line.

    Args:
        filename: PNG file name (not a path) to create inside ``PLOT_DIR``.

    Returns:
        The full output path that was written.
    """
    out_path = PLOT_DIR / filename
    plt.tight_layout()
    plt.savefig(out_path, dpi=100)
    plt.close()
    print(f" -> 🎨 차트 렌더링 완료: {out_path.name}")
    return out_path


def main():
    """Load the merged Parquet data and render the 19 analytics charts.

    Reads ``enriched_fixed_7d.parquet`` (required) and
    ``pareto_fixed_ns.parquet`` (optional) from ``MERGED_DIR``, derives the
    ``cpu_util``, ``mem_util`` and ``lim_req_ratio`` columns, then writes one
    PNG per chart into ``PLOT_DIR``.

    Returns:
        None. Returns early, after printing an error, when the required
        Parquet file does not exist. When the optional file is missing, an
        empty DataFrame is used and the Pareto chart is saved as an empty
        placeholder.
    """
    print("🚀 [Step3 개시] FinOps 빅데이터 시계열 시각화 차트 분석 엔진 가동...")
    p1 = MERGED_DIR / "enriched_fixed_7d.parquet"
    p2 = MERGED_DIR / "pareto_fixed_ns.parquet"
    print(f"📖 가공 원부 스캔 좌표:\n - {p1}\n - {p2}")
    if not p1.exists():
        print("❌ 오류: step2 파이프라인의 가공 산출물(Parquet)이 없습니다. step2를 먼저 실행하세요.")
        return

    df_pod = pd.read_parquet(p1)
    df_ns = pd.read_parquet(p2) if p2.exists() else pd.DataFrame()
    print(f"✅ 데이터 레이크 로드 완료 -> 분석 대상 컨테이너 세트: {len(df_pod):,}행 스캔 성공.\n")

    # Derived ratios. Rows whose CPU/memory request is not > 0 get 0 instead of
    # the result of a division by zero.
    df_pod["cpu_util"] = np.where(df_pod["cpu_request_max"] > 0, (df_pod["cpu_usage_p95"] / df_pod["cpu_request_max"] * 100), 0)
    df_pod["mem_util"] = np.where(df_pod["mem_request_max"] > 0, (df_pod["mem_usage_p95"] / df_pod["mem_request_max"] * 100), 0)
    df_pod["lim_req_ratio"] = np.where(df_pod["cpu_request_max"] > 0, df_pod["cpu_limit_max"] / df_pod["cpu_request_max"], 0)

    # [1] Mean CPU request vs. P95 usage per workload type (grouped bars).
    print("⏳ [1/19] chart1_cpu_req_vs_usage_by_workload 시각화 연산 중...")
    df_wl_cpu = df_pod.groupby("workload_type")[["cpu_request_max", "cpu_usage_p95"]].mean().reset_index()
    plt.figure(figsize=(10, 5))
    df_melt_cpu = df_wl_cpu.melt(id_vars="workload_type", value_vars=["cpu_request_max", "cpu_usage_p95"])
    sns.barplot(data=df_melt_cpu, x="workload_type", y="value", hue="variable", palette="Blues_r")
    plt.xticks(rotation=30, ha='right')
    plt.title("Average CPU Request vs P95 Peak Usage by Workload")
    _save_chart("chart1_cpu_req_vs_usage_by_workload.png")

    # [2] Same comparison for memory.
    print("⏳ [2/19] chart2_mem_req_vs_usage_by_workload 시각화 연산 중...")
    df_wl_mem = df_pod.groupby("workload_type")[["mem_request_max", "mem_usage_p95"]].mean().reset_index()
    plt.figure(figsize=(10, 5))
    df_melt_mem = df_wl_mem.melt(id_vars="workload_type", value_vars=["mem_request_max", "mem_usage_p95"])
    sns.barplot(data=df_melt_mem, x="workload_type", y="value", hue="variable", palette="Purples_r")
    plt.xticks(rotation=30, ha='right')
    plt.title("Average Memory Request vs P95 Peak Usage (GB) by Workload")
    _save_chart("chart2_mem_req_vs_usage_by_workload.png")

    # [3] Daily CPU waste stacked by workload type. The date x workload pivot
    # is computed once here and reused by chart12 below.
    print("⏳ [3/19] chart3_daily_waste_stack 누적 시계열 연산 중...")
    df_daily_waste = df_pod.groupby(["date", "workload_type"])["cpu_waste_core_hours"].sum().unstack().fillna(0)
    if check_empty_data(df_daily_waste, "chart3_daily_waste_stack"):
        # Nothing to plot: open a blank figure so a titled placeholder image
        # is still written.
        plt.figure()
    else:
        df_daily_waste.plot(kind='bar', stacked=True, figsize=(11, 5), cmap="tab20")
    plt.title("Daily Total CPU Waste Core-Hours Stacked by Workload (KST)")
    plt.ylabel("Waste Core-Hours")
    plt.xticks(rotation=45)
    _save_chart("chart3_daily_waste_stack.png")

    # [4] Mean CPU utilization heatmap (workload x date).
    print("⏳ [4/19] chart4_cpu_efficiency_heatmap 격자 분석 중...")
    df_heat_cpu = df_pod.groupby(["workload_type", "date"])["cpu_util"].mean().unstack().fillna(0)
    plt.figure(figsize=(10, 5))
    sns.heatmap(df_heat_cpu, annot=True, fmt=".1f", cmap="RdYlGn", cbar=True)
    plt.title("Mean CPU Utilization Heatmap (%) (Workload x KST Date)")
    _save_chart("chart4_cpu_efficiency_heatmap.png")

    # [5] Total memory waste heatmap (workload x date).
    print("⏳ [5/19] chart18_mem_waste_heatmap 격자 분석 중...")
    df_heat_mem = df_pod.groupby(["workload_type", "date"])["mem_waste_gb_hours"].sum().unstack().fillna(0)
    plt.figure(figsize=(10, 5))
    sns.heatmap(df_heat_mem, annot=False, cmap="BuPu", cbar=True)
    plt.title("Total Memory Waste Volume Heatmap (GB-Hours)")
    _save_chart("chart18_mem_waste_heatmap.png")

    # [6] Namespace Pareto: waste bars plus cumulative-percentage line on a
    # secondary y axis. Uses the first 15 rows of the namespace table as-is.
    print("⏳ [6/19] chart5_pareto_ns_waste 비용 거점 추적 중...")
    if check_empty_data(df_ns, "chart5_pareto_ns_waste"):
        # Optional input missing or empty: save a titled placeholder instead.
        plt.figure()
    else:
        df_ns_top = df_ns.head(15)
        fig, ax1 = plt.subplots(figsize=(11, 5))
        sns.barplot(data=df_ns_top, x="namespace", y="total_waste_core_hours", ax=ax1, color="steelblue")
        ax1.set_xticklabels(ax1.get_xticklabels(), rotation=45, ha="right")
        ax2 = ax1.twinx()
        ax2.plot(df_ns_top["namespace"], df_ns_top["waste_cumsum_pct"], color="crimson", marker="o", linewidth=2)
        ax2.set_ylim(0, 110)
    plt.title("Namespace Top 15 Cost-Waste Pareto Chart")
    _save_chart("chart5_pareto_ns_waste.png")

    # [7] Total CPU waste per workload type, largest first.
    print("⏳ [7/19] chart11_pareto_workload_waste 기술 도메인 서열화 중...")
    df_wl_waste = df_pod.groupby("workload_type")["cpu_waste_core_hours"].sum().reset_index().sort_values("cpu_waste_core_hours", ascending=False)
    plt.figure(figsize=(10, 5))
    sns.barplot(data=df_wl_waste, x="workload_type", y="cpu_waste_core_hours", palette="Oranges_r")
    plt.xticks(rotation=30, ha="right")
    plt.title("Total CPU Waste Volume by Workload Type")
    _save_chart("chart11_pareto_workload_waste.png")

    # [8] Donut chart of row counts per status value.
    print("⏳ [8/19] chart6_status_donut 자원 건전성 지표 요약 중...")
    status_summary = df_pod["status"].value_counts()
    plt.figure(figsize=(6, 5))
    colors = ["#70AD47", "#1F4E79", "#FFC000", "#C00000"]
    # width=0.4 hollows the pie into a ring.
    plt.pie(status_summary, labels=status_summary.index, autopct='%1.1f%%', startangle=90, colors=colors[:len(status_summary)], wedgeprops=dict(width=0.4, edgecolor='w'))
    plt.title("Infrastructure Resource Governance Status Ratio")
    _save_chart("chart6_status_donut.png")

    # [9] Bubble chart: allocation (x) vs. utilization (y), bubble size = waste.
    print("⏳ [9/19] chart7_waste_footprint_bubble 3차원 입체 버블 맵 매핑 중...")
    df_bubble = df_pod.groupby("workload_type").agg(
        x_alloc=("cpu_allocated_core_hours", "sum"),
        y_util=("cpu_util", "mean"),
        z_waste=("cpu_waste_core_hours", "sum")
    ).reset_index()
    plt.figure(figsize=(9, 6))
    sns.scatterplot(data=df_bubble, x="x_alloc", y="y_util", size="z_waste", hue="workload_type", sizes=(100, 2000), alpha=0.7, legend="brief")
    plt.title("Resource Footprint Bubble Chart (Allocated x Utilization x Waste Size)")
    plt.xlabel("Total Allocated Core-Hours")
    plt.ylabel("Average Utilization (%)")
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    _save_chart("chart7_waste_footprint_bubble.png")

    # [10] CPU waste vs. memory waste scatter on at most 5000 rows. The title
    # says "Sampled", so draw a real random sample instead of the first 5000
    # rows, which would only show whatever happens to come first in the file.
    # A fixed seed keeps the image reproducible between runs.
    print("⏳ [10/19] chart14_cpu_mem_waste_scatter 자원 교차 손실 스캔 중...")
    df_scatter = df_pod.sample(n=min(len(df_pod), 5000), random_state=42)
    plt.figure(figsize=(8, 6))
    sns.scatterplot(data=df_scatter, x="cpu_waste_core_hours", y="mem_waste_gb_hours", hue="workload_type", alpha=0.5)
    plt.title("Co-relation Scatter Plot: CPU Waste vs Memory Waste (Sampled)")
    _save_chart("chart14_cpu_mem_waste_scatter.png")

    # [11] CPU shortage heatmap (workload x date).
    print("⏳ [11/19] chart8_shortfall_footprint 리소스 고갈 병목 분석 중...")
    df_short = df_pod.groupby(["workload_type", "date"])["cpu_shortage_cores"].sum().unstack().fillna(0)
    plt.figure(figsize=(10, 4))
    sns.heatmap(df_short, annot=False, cmap="YlOrRd", cbar=True)
    plt.title("Total CPU Shortfall (Deficit) Cores Footprint")
    _save_chart("chart8_shortfall_footprint.png")

    # [12] CPU utilization distribution per workload; y axis fixed to -5..105.
    print("⏳ [12/19] chart9_boxplot_cpu_util_by_workload 하중 변동성 밀도 유추 중...")
    plt.figure(figsize=(11, 5))
    sns.boxplot(data=df_pod, x="workload_type", y="cpu_util", palette="Set3")
    plt.xticks(rotation=30, ha="right")
    plt.ylim(-5, 105)
    plt.title("CPU Utilization P95 Boxplot Distribution by Workload Type")
    _save_chart("chart9_boxplot_cpu_util_by_workload.png")

    # [13] Memory utilization distribution per workload; same fixed y range.
    print("⏳ [13/19] chart10_boxplot_mem_util_by_workload 하중 변동성 밀도 유추 중...")
    plt.figure(figsize=(11, 5))
    sns.boxplot(data=df_pod, x="workload_type", y="mem_util", palette="Pastel1")
    plt.xticks(rotation=30, ha="right")
    plt.ylim(-5, 105)
    plt.title("Memory Utilization P95 Boxplot Distribution by Workload Type")
    _save_chart("chart10_boxplot_mem_util_by_workload.png")

    # [14] Daily CPU waste trend lines; same pivot as chart3, so reuse it
    # rather than repeating the identical groupby.
    print("⏳ [14/19] chart12_daily_waste_trend_by_workload 꺾은선 추이 추적 중...")
    df_trend_wl = df_daily_waste
    plt.figure(figsize=(11, 5))
    sns.lineplot(data=df_trend_wl, markers=True, dashes=False, linewidth=2)
    plt.title("Daily CPU Waste Timeline Trend Line by Workload")
    plt.xticks(rotation=30)
    _save_chart("chart12_daily_waste_trend_by_workload.png")

    # [15] Row counts per workload type, split by status.
    print("⏳ [15/19] chart15_oom_status_by_workload 워크로드 자원 상태 분석 중...")
    plt.figure(figsize=(11, 5))
    sns.countplot(data=df_pod, x="workload_type", hue="status", palette="muted")
    plt.xticks(rotation=30, ha="right")
    plt.title("Governance Status Distribution Count per Workload Type")
    plt.legend(loc="upper right")
    _save_chart("chart15_oom_status_by_workload.png")

    # [16] CPU utilization violin plot per workload type.
    print("⏳ [16/19] chart13_violin_cpu_util 고밀도 커널 파동 연산 중...")
    plt.figure(figsize=(10, 5))
    sns.violinplot(data=df_pod, x="workload_type", y="cpu_util", inner="quartile", palette="pastel")
    plt.xticks(rotation=30, ha="right")
    plt.title("CPU Utilization Kernel Density Violin Plot")
    _save_chart("chart13_violin_cpu_util.png")

    # [17] CPU limit / request ratio distribution per workload type.
    print("⏳ [17/19] chart17_cpu_limit_request_ratio QoS 안정성 등급 비율 측정 중...")
    plt.figure(figsize=(10, 5))
    sns.boxplot(data=df_pod, x="workload_type", y="lim_req_ratio", palette="vlag")
    plt.xticks(rotation=30, ha="right")
    plt.title("Kubernetes Pod CPU Limit / Request Overcommit Ratio")
    _save_chart("chart17_cpu_limit_request_ratio.png")

    # [18] Daily summed CPU request (filled area) vs. summed P95 usage (line).
    print("⏳ [18/19] chart19_daily_cpu_per_workload 일간 CPU 총괄 볼륨 스캔 중...")
    df_daily_cpu_req = df_pod.groupby("date")["cpu_request_max"].sum()
    df_daily_cpu_use = df_pod.groupby("date")["cpu_usage_p95"].sum()
    plt.figure(figsize=(11, 5))
    plt.fill_between(df_daily_cpu_req.index, df_daily_cpu_req.values, label="Total CPU Request Cores", color="skyblue", alpha=0.4)
    plt.plot(df_daily_cpu_use.index, df_daily_cpu_use.values, label="Total CPU P95 Actual Core Vol", color="navy", linewidth=2.5, marker="o")
    plt.title(" 전사 일별 CPU 공급 용량 vs 실효 피크 연산 소모량 추이 (KST)")
    plt.xticks(rotation=30)
    plt.legend(loc="upper left")
    _save_chart("chart19_daily_cpu_per_workload.png")

    # [19] Daily summed memory request (filled area) vs. summed P95 usage (line).
    print("⏳ [19/19] chart20_daily_mem_per_workload 일간 메모리 총괄 볼륨 스캔 중...")
    df_daily_mem_req = df_pod.groupby("date")["mem_request_max"].sum()
    df_daily_mem_use = df_pod.groupby("date")["mem_usage_p95"].sum()
    plt.figure(figsize=(11, 5))
    plt.fill_between(df_daily_mem_req.index, df_daily_mem_req.values, label="Total Memory Request (GB)", color="plum", alpha=0.4)
    plt.plot(df_daily_mem_use.index, df_daily_mem_use.values, label="Total Memory P95 Actual Vol (GB)", color="purple", linewidth=2.5, marker="o")
    plt.title(" 전사 일별 메모리 공급 용량 vs 실효 피크 소모량 추이 (KST)")
    plt.xticks(rotation=30)
    plt.legend(loc="upper left")
    _save_chart("chart20_daily_mem_per_workload.png")

    print("\n🏁 === [Step3 차트 드로잉 성공 마감] 총 19대 마스터 이미지 자산이 ./data/output/plots/ 에 동기화되었습니다. ===")
# Run the chart pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()