"""
step3_analytics.py — Long-term Time-series Infrastructure Data Visualization Engine (English Standard)
"""
import os
import re
import pandas as pd
import numpy as np
from pathlib import Path
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import seaborn as sns
BASE_DATA_DIR = Path("./data")
MERGED_DIR = BASE_DATA_DIR / "merged"
PLOT_DIR = BASE_DATA_DIR / "output" / "plots"
PLOT_DIR.mkdir(parents=True, exist_ok=True)
plt.rcParams['font.family'] = 'sans-serif'
plt.rcParams['axes.unicode_minus'] = False
sns.set_theme(style="whitegrid")
def check_empty_data(df, chart_name):
if df.empty:
print(f" ⚠️ [Empty Data] Skipping {chart_name} due to empty rows.")
return True
return False
def main():
print("🚀 [Step3] Starting FinOps Time-series English Visualization Engine...")
p1 = MERGED_DIR / "enriched_fixed_7d.parquet"
p2 = MERGED_DIR / "pareto_fixed_ns.parquet"
if not p1.exists():
print("❌ Error: Processed Parquet from step2 not found. Please run step2 first.")
return
df_pod = pd.read_parquet(p1)
df_ns = pd.read_parquet(p2) if p2.exists() else pd.DataFrame()
print(f"✅ Data lake loaded successfully -> Container rows: {len(df_pod):,}\n")
df_pod["cpu_util"] = np.where(df_pod["cpu_request_max"] > 0, (df_pod["cpu_usage_p95"] / df_pod["cpu_request_max"] * 100), 0)
df_pod["mem_util"] = np.where(df_pod["mem_request_max"] > 0, (df_pod["mem_usage_p95"] / df_pod["mem_request_max"] * 100), 0)
df_pod["lim_req_ratio"] = np.where(df_pod["cpu_request_max"] > 0, df_pod["cpu_limit_max"] / df_pod["cpu_request_max"], 0)
print("⏳ [1/19] Rendering chart1_cpu_req_vs_usage_by_workload...")
df_wl_cpu = df_pod.groupby("workload_type")[["cpu_request_max", "cpu_usage_p95"]].mean().reset_index()
plt.figure(figsize=(10, 5))
df_melt_cpu = df_wl_cpu.melt(id_vars="workload_type", value_vars=["cpu_request_max", "cpu_usage_p95"])
sns.barplot(data=df_melt_cpu, x="workload_type", y="value", hue="variable", palette="Blues_r")
plt.xticks(rotation=30, ha='right')
plt.title("Average CPU Request vs P95 Peak Usage by Workload Type")
plt.xlabel("Workload Type")
plt.ylabel("CPU Cores")
plt.tight_layout()
out1 = PLOT_DIR / "chart1_cpu_req_vs_usage_by_workload.png"
plt.savefig(out1, dpi=100)
plt.close()
print(f" -> 🎨 Chart generated: {out1.name}")
print("⏳ [2/19] Rendering chart2_mem_req_vs_usage_by_workload...")
df_wl_mem = df_pod.groupby("workload_type")[["mem_request_max", "mem_usage_p95"]].mean().reset_index()
plt.figure(figsize=(10, 5))
df_melt_mem = df_wl_mem.melt(id_vars="workload_type", value_vars=["mem_request_max", "mem_usage_p95"])
sns.barplot(data=df_melt_mem, x="workload_type", y="value", hue="variable", palette="Purples_r")
plt.xticks(rotation=30, ha='right')
plt.title("Average Memory Request vs P95 Peak Usage (GB) by Workload Type")
plt.xlabel("Workload Type")
plt.ylabel("Memory Capacity (GB)")
plt.tight_layout()
out2 = PLOT_DIR / "chart2_mem_req_vs_usage_by_workload.png"
plt.savefig(out2, dpi=100)
plt.close()
print(f" -> 🎨 Chart generated: {out2.name}")
print("⏳ [3/19] Rendering chart3_daily_waste_stack...")
df_daily_waste = df_pod.groupby(["date", "workload_type"])["cpu_waste_core_hours"].sum().unstack().fillna(0)
if not check_empty_data(df_daily_waste, "chart3_daily_waste_stack"):
df_daily_waste.plot(kind='bar', stacked=True, figsize=(11, 5), cmap="tab20")
plt.title("Daily Total CPU Waste Core-Hours Stacked by Workload (KST)")
plt.xlabel("Date (KST)")
plt.ylabel("Waste Volume (Core-Hours)")
plt.xticks(rotation=45)
plt.tight_layout()
out3 = PLOT_DIR / "chart3_daily_waste_stack.png"
plt.savefig(out3, dpi=100)
plt.close()
print(f" -> 🎨 Chart generated: {out3.name}")
print("⏳ [4/19] Rendering chart4_cpu_efficiency_heatmap...")
df_heat_cpu = df_pod.groupby(["workload_type", "date"])["cpu_util"].mean().unstack().fillna(0)
plt.figure(figsize=(10, 5))
sns.heatmap(df_heat_cpu, annot=True, fmt=".1f", cmap="RdYlGn", cbar=True)
plt.title("Mean CPU Utilization Heatmap (%) (Workload Type x Date)")
plt.xlabel("Date (KST)")
plt.ylabel("Workload Type")
plt.tight_layout()
out4 = PLOT_DIR / "chart4_cpu_efficiency_heatmap.png"
plt.savefig(out4, dpi=100)
plt.close()
print(f" -> 🎨 Chart generated: {out4.name}")
print("⏳ [5/19] Rendering chart18_mem_waste_heatmap...")
df_heat_mem = df_pod.groupby(["workload_type", "date"])["mem_waste_gb_hours"].sum().unstack().fillna(0)
plt.figure(figsize=(10, 5))
sns.heatmap(df_heat_mem, annot=False, cmap="BuPu", cbar=True)
plt.title("Total Memory Waste Volume Heatmap (GB-Hours)")
plt.xlabel("Date (KST)")
plt.ylabel("Workload Type")
plt.tight_layout()
out18 = PLOT_DIR / "chart18_mem_waste_heatmap.png"
plt.savefig(out18, dpi=100)
plt.close()
print(f" -> 🎨 Chart generated: {out18.name}")
print("⏳ [6/19] Rendering chart5_pareto_ns_waste...")
if not check_empty_data(df_ns, "chart5_pareto_ns_waste"):
df_ns_top = df_ns.head(15)
fig, ax1 = plt.subplots(figsize=(11, 5))
sns.barplot(data=df_ns_top, x="namespace", y="total_waste_core_hours", ax=ax1, color="styleblue" if "styleblue" in plt.colormaps() else "steelblue")
ax1.set_xticklabels(ax1.get_xticklabels(), rotation=45, ha="right")
ax2 = ax1.twinx()
ax2.plot(df_ns_top["namespace"], df_ns_top["waste_cumsum_pct"], color="crimson", marker="o", linewidth=2)
ax2.set_ylim(0, 110)
plt.title("Top 15 Namespace Cost-Waste Pareto Chart (Cumulative Share %)")
ax1.set_xlabel("Tenant Namespace")
ax1.set_ylabel("Waste Volume (Core-Hours)")
plt.tight_layout()
out5 = PLOT_DIR / "chart5_pareto_ns_waste.png"
plt.savefig(out5, dpi=100)
plt.close()
print(f" -> 🎨 Chart generated: {out5.name}")
print("⏳ [7/19] Rendering chart11_pareto_workload_waste...")
df_wl_waste = df_pod.groupby("workload_type")["cpu_waste_core_hours"].sum().reset_index().sort_values("cpu_waste_core_hours", ascending=False)
plt.figure(figsize=(10, 5))
sns.barplot(data=df_wl_waste, x="workload_type", y="cpu_waste_core_hours", palette="Oranges_r")
plt.xticks(rotation=30, ha="right")
plt.title("Total CPU Waste Volume by Workload Type")
plt.xlabel("Workload Type")
plt.ylabel("Total Waste (Core-Hours)")
plt.tight_layout()
out11 = PLOT_DIR / "chart11_pareto_workload_waste.png"
plt.savefig(out11, dpi=100)
plt.close()
print(f" -> 🎨 Chart generated: {out11.name}")
print("⏳ [8/19] Rendering chart6_status_donut...")
status_summary = df_pod["status"].value_counts()
plt.figure(figsize=(6, 5))
colors = ["#70AD47", "#1F4E79", "#FFC000", "#C00000"]
plt.pie(status_summary, labels=status_summary.index, autopct='%1.1f%%', startangle=90, colors=colors[:len(status_summary)], wedgeprops=dict(width=0.4, edgecolor='w'))
plt.title("Infrastructure Resource Governance Status Ratio")
plt.tight_layout()
out6 = PLOT_DIR / "chart6_status_donut.png"
plt.savefig(out6, dpi=100)
plt.close()
print(f" -> 🎨 Chart generated: {out6.name}")
print("⏳ [9/19] Rendering chart7_waste_footprint_bubble...")
df_bubble = df_pod.groupby("workload_type").agg(
x_alloc=("cpu_allocated_core_hours", "sum"),
y_util=("cpu_util", "mean"),
z_waste=("cpu_waste_core_hours", "sum")
).reset_index()
plt.figure(figsize=(9, 6))
sns.scatterplot(data=df_bubble, x="x_alloc", y="y_util", size="z_waste", hue="workload_type", sizes=(100, 2000), alpha=0.7, legend="brief")
plt.title("Resource Footprint Bubble Chart (Allocated x Utilization x Waste Size)")
plt.xlabel("Total Allocated Core-Hours")
plt.ylabel("Average Utilization (%)")
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
out7 = PLOT_DIR / "chart7_waste_footprint_bubble.png"
plt.savefig(out7, dpi=100)
plt.close()
print(f" -> 🎨 Chart generated: {out7.name}")
print("⏳ [10/19] Rendering chart14_cpu_mem_waste_scatter...")
plt.figure(figsize=(8, 6))
sns.scatterplot(data=df_pod.head(5000), x="cpu_waste_core_hours", y="mem_waste_gb_hours", hue="workload_type", alpha=0.5)
plt.title("Co-relation Scatter Plot: CPU Waste vs Memory Waste (Sampled)")
plt.xlabel("CPU Waste (Core-Hours)")
plt.ylabel("Memory Waste (GB-Hours)")
plt.tight_layout()
out14 = PLOT_DIR / "chart14_cpu_mem_waste_scatter.png"
plt.savefig(out14, dpi=100)
plt.close()
print(f" -> 🎨 Chart generated: {out14.name}")
print("⏳ [11/19] Rendering chart8_shortfall_footprint...")
df_short = df_pod.groupby(["workload_type", "date"])["cpu_shortage_cores"].sum().unstack().fillna(0)
plt.figure(figsize=(10, 4))
sns.heatmap(df_short, annot=False, cmap="YlOrRd", cbar=True)
plt.title("Total CPU Shortfall (Deficit) Cores Footprint")
plt.xlabel("Date (KST)")
plt.ylabel("Workload Type")
plt.tight_layout()
out8 = PLOT_DIR / "chart8_shortfall_footprint.png"
plt.savefig(out8, dpi=100)
plt.close()
print(f" -> 🎨 Chart generated: {out8.name}")
print("⏳ [12/19] Rendering chart9_boxplot_cpu_util_by_workload...")
plt.figure(figsize=(11, 5))
sns.boxplot(data=df_pod, x="workload_type", y="cpu_util", palette="Set3")
plt.xticks(rotation=30, ha="right")
plt.ylim(-5, 105)
plt.title("CPU Utilization P95 Boxplot Distribution by Workload Type")
plt.xlabel("Workload Type")
plt.ylabel("P95 Actual Utilization (%)")
plt.tight_layout()
out9 = PLOT_DIR / "chart9_boxplot_cpu_util_by_workload.png"
plt.savefig(out9, dpi=100)
plt.close()
print(f" -> 🎨 Chart generated: {out9.name}")
print("⏳ [13/19] Rendering chart10_boxplot_mem_util_by_workload...")
plt.figure(figsize=(11, 5))
sns.boxplot(data=df_pod, x="workload_type", y="mem_util", palette="Pastel1")
plt.xticks(rotation=30, ha="right")
plt.ylim(-5, 105)
plt.title("Memory Utilization P95 Boxplot Distribution by Workload Type")
plt.xlabel("Workload Type")
plt.ylabel("P95 Actual Utilization (%)")
plt.tight_layout()
out10 = PLOT_DIR / "chart10_boxplot_mem_util_by_workload.png"
plt.savefig(out10, dpi=100)
plt.close()
print(f" -> 🎨 Chart generated: {out10.name}")
print("⏳ [14/19] Rendering chart12_daily_waste_trend_by_workload...")
df_trend_wl = df_pod.groupby(["date", "workload_type"])["cpu_waste_core_hours"].sum().unstack().fillna(0)
plt.figure(figsize=(11, 5))
sns.lineplot(data=df_trend_wl, markers=True, dashes=False, linewidth=2)
plt.title("Daily CPU Waste Timeline Trend Line by Workload Type")
plt.xlabel("Date (KST)")
plt.ylabel("Waste Volume (Core-Hours)")
plt.xticks(rotation=30)
plt.tight_layout()
out12 = PLOT_DIR / "chart12_daily_waste_trend_by_workload.png"
plt.savefig(out12, dpi=100)
plt.close()
print(f" -> 🎨 Chart generated: {out12.name}")
print("⏳ [15/19] Rendering chart15_oom_status_by_workload...")
plt.figure(figsize=(11, 5))
sns.countplot(data=df_pod, x="workload_type", hue="status", palette="muted")
plt.xticks(rotation=30, ha="right")
plt.title("Governance Status Distribution Count per Workload Type")
plt.xlabel("Workload Type")
plt.ylabel("Pod Count")
plt.legend(loc="upper right")
plt.tight_layout()
out15 = PLOT_DIR / "chart15_oom_status_by_workload.png"
plt.savefig(out15, dpi=100)
plt.close()
print(f" -> 🎨 Chart generated: {out15.name}")
print("⏳ [16/19] Rendering chart13_violin_cpu_util...")
plt.figure(figsize=(10, 5))
sns.violinplot(data=df_pod, x="workload_type", y="cpu_util", inner="quartile", palette="pastel")
plt.xticks(rotation=30, ha="right")
plt.title("CPU Utilization Kernel Density Violin Plot")
plt.xlabel("Workload Type")
plt.ylabel("CPU Utilization (%)")
plt.tight_layout()
out13 = PLOT_DIR / "chart13_violin_cpu_util.png"
plt.savefig(out13, dpi=100)
plt.close()
print(f" -> 🎨 Chart generated: {out13.name}")
print("⏳ [17/19] Rendering chart17_cpu_limit_request_ratio...")
plt.figure(figsize=(10, 5))
sns.boxplot(data=df_pod, x="workload_type", y="lim_req_ratio", palette="vlag")
plt.xticks(rotation=30, ha="right")
plt.title("Kubernetes Pod CPU Limit / Request Overcommit Ratio")
plt.xlabel("Workload Type")
plt.ylabel("Limit / Request Ratio")
plt.tight_layout()
out17 = PLOT_DIR / "chart17_cpu_limit_request_ratio.png"
plt.savefig(out17, dpi=100)
plt.close()
print(f" -> 🎨 Chart generated: {out17.name}")
print("⏳ [18/19] Rendering chart19_daily_cpu_per_workload...")
df_daily_cpu_req = df_pod.groupby("date")["cpu_request_max"].sum()
df_daily_cpu_use = df_pod.groupby("date")["cpu_usage_p95"].sum()
plt.figure(figsize=(11, 5))
plt.fill_between(df_daily_cpu_req.index, df_daily_cpu_req.values, label="Total CPU Request Cores", color="skyblue", alpha=0.4)
plt.plot(df_daily_cpu_use.index, df_daily_cpu_use.values, label="Total CPU P95 Actual Cores", color="navy", linewidth=2.5, marker="o")
plt.title("Daily Total CPU Capacity Allocation vs Actual Peak Consumption (KST)")
plt.xlabel("Date (KST)")
plt.ylabel("Total CPU Cores")
plt.xticks(rotation=30)
plt.legend(loc="upper left")
plt.tight_layout()
out19 = PLOT_DIR / "chart19_daily_cpu_per_workload.png"
plt.savefig(out19, dpi=100)
plt.close()
print(f" -> 🎨 Chart generated: {out19.name}")
print("⏳ [19/19] Rendering chart20_daily_mem_per_workload...")
df_daily_mem_req = df_pod.groupby("date")["mem_request_max"].sum()
df_daily_mem_use = df_pod.groupby("date")["mem_usage_p95"].sum()
plt.figure(figsize=(11, 5))
plt.fill_between(df_daily_mem_req.index, df_daily_mem_req.values, label="Total Memory Request (GB)", color="plum", alpha=0.4)
plt.plot(df_daily_mem_use.index, df_daily_mem_use.values, label="Total Memory P95 Actual (GB)", color="purple", linewidth=2.5, marker="o")
plt.title("Daily Total Memory Capacity Allocation vs Actual Peak Consumption (KST)")
plt.xlabel("Date (KST)")
plt.ylabel("Total Memory (GB)")
plt.xticks(rotation=30)
plt.legend(loc="upper left")
plt.tight_layout()
out20 = PLOT_DIR / "chart20_daily_mem_per_workload.png"
plt.savefig(out20, dpi=100)
plt.close()
print(f" -> 🎨 Chart generated: {out20.name}")
print("\n🏁 === [Step3 Success] All 19 charts generated cleanly in English under ./data/output/plots/ ===")
if __name__ == "__main__":
main()