"""
step3_analytics.py β Advanced Infrastructure Resource Visualization Engine (English Standard)
"""
import os
import re
import pandas as pd
import numpy as np
from pathlib import Path
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import seaborn as sns
BASE_DATA_DIR = Path("./data")
MERGED_DIR = BASE_DATA_DIR / "merged"
PLOT_DIR = BASE_DATA_DIR / "output" / "plots"
PLOT_DIR.mkdir(parents=True, exist_ok=True)
plt.rcParams['font.family'] = 'sans-serif'
plt.rcParams['axes.unicode_minus'] = False
sns.set_theme(style="whitegrid")
def check_empty_data(df, chart_name):
if df.empty:
print(f" β οΈ [Empty Data] Skipping {chart_name} due to empty rows.")
return True
return False
def main():
print("π [Step3 κ°μ] FinOps λ€μ°¨μ μκ³μ΄ μλ¬Έ μκ°ν λΆμ μμ§ κ°λ...")
p1 = MERGED_DIR / "enriched_fixed_7d.parquet"
p2 = MERGED_DIR / "pareto_fixed_ns.parquet"
print(f"π κ°κ³΅ μλΆ μ€μΊ κ²½λ‘:\n - {p1}\n - {p2}")
if not p1.exists():
print("β μ€λ₯: step2 νμ΄νλΌμΈμ κ°κ³΅ μ°μΆλ¬Ό(Parquet)μ΄ μμ΅λλ€. step2λ₯Ό λ¨Όμ μ€ννμΈμ.")
return
df_pod = pd.read_parquet(p1)
df_ns = pd.read_parquet(p2) if p2.exists() else pd.DataFrame()
print(f"β
λ°μ΄ν° λ μ΄ν¬ λ‘λ μ±λ£ -> λΆμ λμ λͺ
μΈ μΈνΈ: {len(df_pod):,}ν μ€μΊ μ±κ³΅.")
df_pod["cpu_util"] = np.where(df_pod["cpu_request_max"] > 0, (df_pod["cpu_usage_p95"] / df_pod["cpu_request_max"] * 100), 0)
df_pod["mem_util"] = np.where(df_pod["mem_request_max"] > 0, (df_pod["mem_usage_p95"] / df_pod["mem_request_max"] * 100), 0)
df_pod["lim_req_ratio"] = np.where(df_pod["cpu_request_max"] > 0, df_pod["cpu_limit_max"] / df_pod["cpu_request_max"], 0)
df_pod["status_en"] = df_pod["status"].map({
"π₯ OOMμ₯μ λ°μ": "OOM Killed",
"β οΈ RequestλΆμ‘±": "Request Shortage",
"π κ³Όλ€ν λΉ": "Over-allocated",
"β
μ΅μ νμλ£": "Optimized"
}).fillna("Unknown")
print("\nβ³ [1/18] chart1_cpu_req_vs_usage_by_workload μκ°ν μ°μ° μ€...")
df_wl_cpu = df_pod.groupby("workload_type")[["cpu_request_max", "cpu_usage_p95"]].mean().reset_index()
plt.figure(figsize=(10, 5))
df_melt_cpu = df_wl_cpu.melt(id_vars="workload_type", value_vars=["cpu_request_max", "cpu_usage_p95"])
sns.barplot(data=df_melt_cpu, x="workload_type", y="value", hue="variable", palette="Blues_r")
plt.xticks(rotation=30, ha='right')
plt.title("Average CPU Request vs P95 Peak Usage by Workload Type")
plt.xlabel("Workload Type")
plt.ylabel("CPU Cores")
plt.tight_layout()
out1 = PLOT_DIR / "chart1_cpu_req_vs_usage_by_workload.png"
plt.savefig(out1, dpi=100)
plt.close()
print(f" -> π¨ μ°¨νΈ λ λλ§ μλ£: {out1.name}")
print("β³ [2/18] chart2_mem_req_vs_usage_by_workload κ³ λν(Memory RSS λ μ΄μ΄ νΈμ
) μ°μ° μ€...")
df_wl_mem = df_pod.groupby("workload_type")[["mem_request_max", "mem_usage_p95", "mem_rss_p95"]].mean().reset_index()
plt.figure(figsize=(10, 5))
df_melt_mem = df_wl_mem.melt(id_vars="workload_type", value_vars=["mem_request_max", "mem_usage_p95", "mem_rss_p95"])
sns.barplot(data=df_melt_mem, x="workload_type", y="value", hue="variable", palette="Purples_r")
plt.xticks(rotation=30, ha='right')
plt.title("Average Memory Specification vs WorkingSet vs RSS Peak (GB)")
plt.xlabel("Workload Type")
plt.ylabel("Memory Capacity (GB)")
plt.tight_layout()
out2 = PLOT_DIR / "chart2_mem_req_vs_usage_by_workload.png"
plt.savefig(out2, dpi=100)
plt.close()
print(f" -> π¨ μ°¨νΈ λ λλ§ μλ£: {out2.name}")
print("β³ [3/18] chart3_daily_waste_stack λμ μκ³μ΄ μ°μ° μ€...")
df_daily_waste = df_pod.groupby(["date", "workload_type"])["cpu_waste_core_hours"].sum().unstack().fillna(0)
if not check_empty_data(df_daily_waste, "chart3_daily_waste_stack"):
df_daily_waste.plot(kind='bar', stacked=True, figsize=(11, 5), cmap="tab20")
plt.title("Daily Total CPU Waste Core-Hours Stacked by Workload Type (KST)")
plt.xlabel("Date (KST)")
plt.ylabel("Waste Volume (Core-Hours)")
plt.xticks(rotation=45)
plt.tight_layout()
out3 = PLOT_DIR / "chart3_daily_waste_stack.png"
plt.savefig(out3, dpi=100)
plt.close()
print(f" -> π¨ μ°¨νΈ λ λλ§ μλ£: {out3.name}")
print("β³ [4/18] chart4_cpu_efficiency_heatmap ν¨μ¨ 격μ μ§λ μμ± μ€...")
df_heat_cpu = df_pod.groupby(["workload_type", "date"])["cpu_util"].mean().unstack().fillna(0)
plt.figure(figsize=(10, 5))
sns.heatmap(df_heat_cpu, annot=True, fmt=".1f", cmap="RdYlGn", cbar=True)
plt.title("Mean CPU Utilization Heatmap (%) (Workload Type x Date)")
plt.xlabel("Date (KST)")
plt.ylabel("Workload Type")
plt.tight_layout()
out4 = PLOT_DIR / "chart4_cpu_efficiency_heatmap.png"
plt.savefig(out4, dpi=100)
plt.close()
print(f" -> π¨ μ°¨νΈ λ λλ§ μλ£: {out4.name}")
print("β³ [5/18] chart18_mem_waste_heatmap λ©λͺ¨λ¦¬ μμ€ λ§€νΈλ¦μ€ μ°μ° μ€...")
df_heat_mem = df_pod.groupby(["workload_type", "date"])["mem_waste_gb_hours"].sum().unstack().fillna(0)
plt.figure(figsize=(10, 5))
sns.heatmap(df_heat_mem, annot=False, cmap="BuPu", cbar=True)
plt.title("Total Memory Waste Volume Heatmap (GB-Hours)")
plt.xlabel("Date (KST)")
plt.ylabel("Workload Type")
plt.tight_layout()
out18 = PLOT_DIR / "chart18_mem_waste_heatmap.png"
plt.savefig(out18, dpi=100)
plt.close()
print(f" -> π¨ μ°¨νΈ λ λλ§ μλ£: {out18.name}")
print("β³ [6/18] chart5_pareto_ns_waste λΉμ© λλΉ λΆμ μμ΄ν μΆμ μ€...")
if not check_empty_data(df_ns, "chart5_pareto_ns_waste"):
df_ns_top = df_ns.head(15)
fig, ax1 = plt.subplots(figsize=(11, 5))
sns.barplot(data=df_ns_top, x="namespace", y="total_waste_core_hours", ax=ax1, color="steelblue")
ax1.set_xticklabels(ax1.get_xticklabels(), rotation=45, ha="right")
ax2 = ax1.twinx()
ax2.plot(df_ns_top["namespace"], df_ns_top["waste_cumsum_pct"], color="crimson", marker="o", linewidth=2)
ax2.set_ylim(0, 110)
plt.title("Top 15 Namespace Cost-Waste Pareto Chart (Cumulative Share %)")
ax1.set_xlabel("Tenant Namespace")
ax1.set_ylabel("Waste Volume (Core-Hours)")
plt.tight_layout()
out5 = PLOT_DIR / "chart5_pareto_ns_waste.png"
plt.savefig(out5, dpi=100)
plt.close()
print(f" -> π¨ μ°¨νΈ λ λλ§ μλ£: {out5.name}")
print("β³ [7/18] chart11_pareto_workload_waste κΈ°μ λλ©μΈ μ§λΆμ¨ μ§κ³ μ€...")
df_wl_waste = df_pod.groupby("workload_type")["cpu_waste_core_hours"].sum().reset_index().sort_values("cpu_waste_core_hours", ascending=False)
plt.figure(figsize=(10, 5))
sns.barplot(data=df_wl_waste, x="workload_type", y="cpu_waste_core_hours", palette="Oranges_r")
plt.xticks(rotation=30, ha="right")
plt.title("Total CPU Waste Volume by Workload Type")
plt.xlabel("Workload Type")
plt.ylabel("Total Waste (Core-Hours)")
plt.tight_layout()
out11 = PLOT_DIR / "chart11_pareto_workload_waste.png"
plt.savefig(out11, dpi=100)
plt.close()
print(f" -> π¨ μ°¨νΈ λ λλ§ μλ£: {out11.name}")
print("β³ [8/18] chart6_status_donut κ±°λ²λμ€ κ±΄μ μ± λΉμ¨ λμΆ μ€...")
status_summary = df_pod["status_en"].value_counts()
plt.figure(figsize=(6, 5))
colors = ["#70AD47", "#1F4E79", "#FFC000", "#C00000"]
plt.pie(status_summary, labels=status_summary.index, autopct='%1.1f%%', startangle=90, colors=colors[:len(status_summary)], wedgeprops=dict(width=0.4, edgecolor='w'))
plt.title("Infrastructure Resource Governance Status Ratio")
plt.tight_layout()
out6 = PLOT_DIR / "chart6_status_donut.png"
plt.savefig(out6, dpi=100)
plt.close()
print(f" -> π¨ μ°¨νΈ λ λλ§ μλ£: {out6.name}")
print("β³ [9/18] chart7_waste_footprint_bubble 3μ°¨μ μ
체 κ΄κ³ λΆν¬λ λ§€ν μ€...")
df_bubble = df_pod.groupby("workload_type").agg(
x_alloc=("cpu_allocated_core_hours", "sum"),
y_util=("cpu_util", "mean"),
z_waste=("cpu_waste_core_hours", "sum")
).reset_index()
plt.figure(figsize=(9, 6))
sns.scatterplot(data=df_bubble, x="x_alloc", y="y_util", size="z_waste", hue="workload_type", sizes=(100, 2000), alpha=0.7, legend="brief")
plt.title("Resource Footprint Bubble Chart (Allocated x Utilization x Waste Size)")
plt.xlabel("Total Allocated Core-Hours")
plt.ylabel("Average Utilization (%)")
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
out7 = PLOT_DIR / "chart7_waste_footprint_bubble.png"
plt.savefig(out7, dpi=100)
plt.close()
print(f" -> π¨ μ°¨νΈ λ λλ§ μλ£: {out7.name}")
print("β³ [10/18] chart14_cpu_mem_waste_scatter μμ μμ€ κ΅μ°¨ κ΅μ νΈλνΉ μ€...")
plt.figure(figsize=(9, 6))
sns.scatterplot(data=df_pod.head(5000), x="cpu_waste_core_hours", y="mem_waste_gb_hours", size="pv_waste_gb_hours", hue="workload_type", alpha=0.5, sizes=(20, 400))
plt.title("Multi-Resource Loss Co-relation: CPU vs Memory vs PV Storage (Size)")
plt.xlabel("CPU Waste (Core-Hours)")
plt.ylabel("Memory Waste (GB-Hours)")
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
out14 = PLOT_DIR / "chart14_cpu_mem_waste_scatter.png"
plt.savefig(out14, dpi=100)
plt.close()
print(f" -> π¨ μ°¨νΈ λ λλ§ μλ£: {out14.name}")
print("β³ [11/18] chart8_shortfall_footprint 리μμ€ λΆμ‘± λ° μ€λ‘νλ§ λ³λͺ© νμ§ μ€...")
df_short = df_pod.groupby(["workload_type", "date"])["cpu_throttled_max"].max().unstack().fillna(0)
plt.figure(figsize=(10, 4))
sns.heatmap(df_short, annot=False, cmap="YlOrRd", cbar=True)
plt.title("Container CPU CFS Throttled Periods Peak Footprint (Risk Matrix)")
plt.xlabel("Date (KST)")
plt.ylabel("Workload Type")
plt.tight_layout()
out8 = PLOT_DIR / "chart8_shortfall_footprint.png"
plt.savefig(out8, dpi=100)
plt.close()
print(f" -> π¨ μ°¨νΈ λ λλ§ μλ£: {out8.name}")
print("β³ [12/18] chart9_boxplot_cpu_util_by_workload CPU νμ€ λΆν¬λ μ°μ° μ€...")
plt.figure(figsize=(11, 5))
sns.boxplot(data=df_pod, x="workload_type", y="cpu_util", palette="Set3")
plt.xticks(rotation=30, ha="right")
plt.ylim(-5, 105)
plt.title("CPU Utilization P95 Boxplot Distribution by Workload Type")
plt.xlabel("Workload Type")
plt.ylabel("P95 Actual Utilization (%)")
plt.tight_layout()
out9 = PLOT_DIR / "chart9_boxplot_cpu_util_by_workload.png"
plt.savefig(out9, dpi=100)
plt.close()
print(f" -> π¨ μ°¨νΈ λ λλ§ μλ£: {out9.name}")
print("β³ [13/18] chart10_boxplot_mem_util_by_workload λ©λͺ¨λ¦¬ νμ€ λΆν¬λ μ°μ° μ€...")
plt.figure(figsize=(11, 5))
sns.boxplot(data=df_pod, x="workload_type", y="mem_util", palette="Pastel1")
plt.xticks(rotation=30, ha="right")
plt.ylim(-5, 105)
plt.title("Memory Utilization P95 Boxplot Distribution by Workload Type")
plt.xlabel("Workload Type")
plt.ylabel("P95 Actual Utilization (%)")
plt.tight_layout()
out10 = PLOT_DIR / "chart10_boxplot_mem_util_by_workload.png"
plt.savefig(out10, dpi=100)
plt.close()
print(f" -> π¨ μ°¨νΈ λ λλ§ μλ£: {out10.name}")
print("β³ [14/18] chart12_daily_waste_trend_by_workload μ₯κΈ° νμλΌμΈ λ³λ μΆμ μ€...")
df_trend_wl = df_pod.groupby(["date", "workload_type"])["cpu_waste_core_hours"].sum().unstack().fillna(0)
plt.figure(figsize=(11, 5))
sns.lineplot(data=df_trend_wl, markers=True, dashes=False, linewidth=2)
plt.title("Daily CPU Waste Timeline Trend Line by Workload Type")
plt.xlabel("Date (KST)")
plt.ylabel("Waste Volume (Core-Hours)")
plt.xticks(rotation=30)
plt.tight_layout()
out12 = PLOT_DIR / "chart12_daily_waste_trend_by_workload.png"
plt.savefig(out12, dpi=100)
plt.close()
print(f" -> π¨ μ°¨νΈ λ λλ§ μλ£: {out12.name}")
print("β³ [15/18] chart15_oom_status_by_workload μ₯μ κ΅° μΉ΄μ΄νΈ λΆλ₯ μ€...")
plt.figure(figsize=(11, 5))
sns.countplot(data=df_pod, x="workload_type", hue="status_en", palette="muted")
plt.xticks(rotation=30, ha="right")
plt.title("Governance Status Distribution Count per Workload Type")
plt.xlabel("Workload Type")
plt.ylabel("Pod Count")
plt.legend(loc="upper right")
plt.tight_layout()
out15 = PLOT_DIR / "chart15_oom_status_by_workload.png"
plt.savefig(out15, dpi=100)
plt.close()
print(f" -> π¨ μ°¨νΈ λ λλ§ μλ£: {out15.name}")
print("β³ [16/18] chart13_violin_cpu_util κ³ λ°λ 컀λ λ°λ λΆν¬ μΆμ μ€...")
plt.figure(figsize=(10, 5))
sns.violinplot(data=df_pod, x="workload_type", y="cpu_util", inner="quartile", palette="pastel")
plt.xticks(rotation=30, ha="right")
plt.title("CPU Utilization Kernel Density Violin Plot")
plt.xlabel("Workload Type")
plt.ylabel("CPU Utilization (%)")
plt.tight_layout()
out13 = PLOT_DIR / "chart13_violin_cpu_util.png"
plt.savefig(out13, dpi=100)
plt.close()
print(f" -> π¨ μ°¨νΈ λ λλ§ μλ£: {out13.name}")
print("β³ [17/18] chart17_cpu_limit_request_ratio μ€λ²μ»€λ° QoS λΉμ¨ μΈ‘μ μ€...")
plt.figure(figsize=(10, 5))
sns.boxplot(data=df_pod, x="workload_type", y="lim_req_ratio", palette="vlag")
plt.xticks(rotation=30, ha="right")
plt.title("Kubernetes Pod CPU Limit / Request Overcommit Ratio")
plt.xlabel("Workload Type")
plt.ylabel("Limit / Request Ratio (Factor)")
plt.tight_layout()
out17 = PLOT_DIR / "chart17_cpu_limit_request_ratio.png"
plt.savefig(out17, dpi=100)
plt.close()
print(f" -> π¨ μ°¨νΈ λ λλ§ μλ£: {out17.name}")
print("β³ [17/18] chart19_daily_cpu_per_workload μΌκ° CPU μ΄λ μΆμ΄ λ‘€μ
μ€...")
df_daily_cpu_req = df_pod.groupby("date")["cpu_request_max"].sum()
df_daily_cpu_use = df_pod.groupby("date")["cpu_usage_p95"].sum()
plt.figure(figsize=(11, 5))
plt.fill_between(df_daily_cpu_req.index, df_daily_cpu_req.values, label="Total CPU Request Cores (Capacity)", color="skyblue", alpha=0.4)
plt.plot(df_daily_cpu_use.index, df_daily_cpu_use.values, label="Total CPU P95 Actual Consumption", color="navy", linewidth=2.5, marker="o")
plt.title("Daily Total CPU Capacity Allocation vs Actual Peak Consumption Trend (KST)")
plt.xlabel("Date (KST)")
plt.ylabel("Total CPU Cores")
plt.xticks(rotation=30)
plt.legend(loc="upper left")
plt.tight_layout()
out19 = PLOT_DIR / "chart19_daily_cpu_per_workload.png"
plt.savefig(out19, dpi=100)
plt.close()
print(f" -> π¨ μ°¨νΈ λ λλ§ μλ£: {out19.name}")
print("β³ [18/18] chart20_daily_mem_per_workload μΌκ° λ©λͺ¨λ¦¬/μ€ν λ¦¬μ§ λ©΄μ μ μ¬ μ€...")
df_daily_mem_req = df_pod.groupby("date")["mem_request_max"].sum()
df_daily_mem_use = df_pod.groupby("date")["mem_usage_p95"].sum()
df_daily_pv_req = df_pod.groupby("date")["pv_capacity_max"].sum()
fig, ax1 = plt.subplots(figsize=(11, 5))
ax1.fill_between(df_daily_mem_req.index, df_daily_mem_req.values, label="Total Memory Request (GB)", color="plum", alpha=0.4)
ax1.plot(df_daily_mem_use.index, df_daily_mem_use.values, label="Total Memory P95 Actual Usage", color="purple", linewidth=2.5, marker="o")
ax1.set_ylabel("Memory Volume (GB)")
ax1.set_xlabel("Date (KST)")
ax2 = ax1.twinx()
ax2.plot(df_daily_pv_req.index, df_daily_pv_req.values, label="Total PV Provisions (GB)", color="teal", linewidth=2, linestyle="--", marker="x")
ax2.set_ylabel("PV Storage Volume (GB)")
plt.title("Daily Memory & PV Storage Space Provisions vs Active Peak Consumption")
fig.legend(loc="upper left", bbox_to_anchor=(0.1, 0.9))
plt.xticks(rotation=30)
plt.tight_layout()
out20 = PLOT_DIR / "chart20_daily_mem_per_workload.png"
plt.savefig(out20, dpi=100)
plt.close()
print(f" -> π¨ μ°¨νΈ λ λλ§ μλ£: {out20.name}")
print("\nπ === [Step3 μκ²°] κΈμ κΉ¨μ§μ΄ μμ² λ°λ©Έλ 19λ λ§μ€ν° μλ¬Έ μ°¨νΈκ° ./data/output/plots/ μ λκΈ°νλμμ΅λλ€. ===")
if __name__ == "__main__":
main()