poc0806a

Young-Kyoo Kim·2025년 8월 5일
0

{
"dashboard": {
"id": null,
"title": "Cloud Native Data Lake House - Cluster Overview",
"tags": ["kubernetes", "minio", "aistor", "overview"],
"style": "dark",
"timezone": "browser",
"refresh": "30s",
"time": {
"from": "now-1h",
"to": "now"
},
"panels": [
{
"id": 1,
"title": "Cluster Status Overview",
"type": "stat",
"gridPos": {"h": 4, "w": 24, "x": 0, "y": 0},
"targets": [
{
"expr": "up{job=~\"kubernetes.*\"}",
"legendFormat": "{{job}} - {{instance}}"
}
],
"fieldConfig": {
"defaults": {
"color": {"mode": "palette-classic"},
"custom": {"displayMode": "list", "orientation": "horizontal"}
}
}
},
{
"id": 2,
"title": "MinIO Cluster Health",
"type": "stat",
"gridPos": {"h": 6, "w": 8, "x": 0, "y": 4},
"targets": [
{
"expr": "minio_cluster_nodes_online_total",
"legendFormat": "Online Nodes"
},
{
"expr": "minio_cluster_nodes_offline_total",
"legendFormat": "Offline Nodes"
},
{
"expr": "minio_cluster_drive_online_total",
"legendFormat": "Online Drives"
},
{
"expr": "minio_cluster_drive_offline_total",
"legendFormat": "Offline Drives"
}
],
"fieldConfig": {
"defaults": {
"color": {"mode": "thresholds"},
"thresholds": {
"steps": [
{"color": "red", "value": null},
{"color": "yellow", "value": 0.8},
{"color": "green", "value": 0.9}
]
}
}
}
},
{
"id": 3,
"title": "MinIO Storage Capacity",
"type": "gauge",
"gridPos": {"h": 6, "w": 8, "x": 8, "y": 4},
"targets": [
{
"expr": "(minio_cluster_capacity_usable_total_bytes - minio_cluster_capacity_usable_free_bytes) / minio_cluster_capacity_usable_total_bytes * 100",
"legendFormat": "Storage Usage %"
}
],
"fieldConfig": {
"defaults": {
"max": 100,
"min": 0,
"unit": "percent",
"thresholds": {
"steps": [
{"color": "green", "value": null},
{"color": "yellow", "value": 70},
{"color": "red", "value": 85}
]
}
}
}
},
{
"id": 4,
"title": "MinIO Request Rate",
"type": "stat",
"gridPos": {"h": 6, "w": 8, "x": 16, "y": 4},
"targets": [
{
"expr": "sum(rate(minio_http_requests_total[5m]))",
"legendFormat": "Requests/sec"
},
{
"expr": "sum(rate(minio_s3_requests_total[5m]))",
"legendFormat": "S3 Requests/sec"
}
]
},
{
"id": 5,
"title": "Kubernetes API Server",
"type": "timeseries",
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 10},
"targets": [
{
"expr": "rate(apiserver_request_total[5m])",
"legendFormat": "{{verb}} {{resource}}"
},
{
"expr": "histogram_quantile(0.99, rate(apiserver_request_duration_seconds_bucket[5m]))",
"legendFormat": "99th percentile latency"
},
{
"expr": "apiserver_current_inflight_requests",
"legendFormat": "Inflight requests"
},
{
"expr": "rate(apiserver_request_total{code=~\"5..\"}[5m])",
"legendFormat": "5xx Error Rate"
}
]
},
{
"id": 6,
"title": "ETCD Performance",
"type": "timeseries",
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 10},
"targets": [
{
"expr": "histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m]))",
"legendFormat": "WAL fsync 99th percentile"
},
{
"expr": "histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[5m]))",
"legendFormat": "Backend commit 99th percentile"
},
{
"expr": "etcd_server_has_leader",
"legendFormat": "Has Leader"
},
{
"expr": "etcd_server_leader_changes_seen_total",
"legendFormat": "Leader changes"
}
]
},
{
"id": 7,
"title": "CoreDNS Performance",
"type": "timeseries",
"gridPos": {"h": 8, "w": 8, "x": 0, "y": 18},
"targets": [
{
"expr": "sum(rate(coredns_dns_requests_total[5m])) by (zone)",
"legendFormat": "DNS Requests - {{zone}}"
},
{
"expr": "sum(rate(coredns_dns_responses_total{rcode=\"NOERROR\"}[5m])) / sum(rate(coredns_dns_responses_total[5m]))",
"legendFormat": "Success Rate"
},
{
"expr": "histogram_quantile(0.99, sum(rate(coredns_dns_request_duration_seconds_bucket[5m])) by (le))",
"legendFormat": "99th percentile latency"
}
]
},
{
"id": 8,
"title": "NodeLocalDNS Performance",
"type": "timeseries",
"gridPos": {"h": 8, "w": 8, "x": 8, "y": 18},
"targets": [
{
"expr": "sum(rate(coredns_dns_requests_total{job=\"node-local-dns\"}[5m]))",
"legendFormat": "Local DNS Requests"
},
{
"expr": "sum(rate(coredns_forward_requests_total{job=\"node-local-dns\"}[5m]))",
"legendFormat": "Forwarded Requests"
},
{
"expr": "sum(rate(coredns_cache_hits_total{job=\"node-local-dns\"}[5m])) / sum(rate(coredns_dns_requests_total{job=\"node-local-dns\"}[5m]))",
"legendFormat": "Cache Hit Rate"
}
]
},
{
"id": 9,
"title": "Cilium Network Health",
"type": "timeseries",
"gridPos": {"h": 8, "w": 8, "x": 16, "y": 18},
"targets": [
{
"expr": "cilium_datapath_conntrack_gc_runs_total",
"legendFormat": "Conntrack GC runs"
},
{
"expr": "cilium_drop_count_total",
"legendFormat": "Dropped packets"
},
{
"expr": "cilium_forward_count_total",
"legendFormat": "Forwarded packets"
},
{
"expr": "cilium_policy_verdict_total",
"legendFormat": "Policy verdicts"
}
]
},
{
"id": 10,
"title": "Node Resource Usage - CPU",
"type": "timeseries",
"gridPos": {"h": 8, "w": 6, "x": 0, "y": 26},
"targets": [
{
"expr": "100 - (avg by (instance) (irate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100)",
"legendFormat": "{{instance}} CPU Usage"
},
{
"expr": "avg(100 - (avg by (instance) (irate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100))",
"legendFormat": "Cluster Average CPU"
}
]
},
{
"id": 11,
"title": "Node Resource Usage - Memory",
"type": "timeseries",
"gridPos": {"h": 8, "w": 6, "x": 6, "y": 26},
"targets": [
{
"expr": "(1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100",
"legendFormat": "{{instance}} Memory Usage"
},
{
"expr": "avg((1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100)",
"legendFormat": "Cluster Average Memory"
}
]
},
{
"id": 12,
"title": "Node Resource Usage - Disk",
"type": "timeseries",
"gridPos": {"h": 8, "w": 6, "x": 12, "y": 26},
"targets": [
{
"expr": "(1 - (node_filesystem_avail_bytes{fstype!=\"tmpfs\"} / node_filesystem_size_bytes{fstype!=\"tmpfs\"})) * 100",
"legendFormat": "{{instance}}:{{mountpoint}} Disk Usage"
},
{
"expr": "rate(node_disk_read_bytes_total[5m])",
"legendFormat": "{{instance}} Disk Read"
},
{
"expr": "rate(node_disk_written_bytes_total[5m])",
"legendFormat": "{{instance}} Disk Write"
}
]
},
{
"id": 13,
"title": "Node Resource Usage - Network",
"type": "timeseries",
"gridPos": {"h": 8, "w": 6, "x": 18, "y": 26},
"targets": [
{
"expr": "rate(node_network_receive_bytes_total{device!=\"lo\"}[5m])",
"legendFormat": "{{instance}}:{{device}} RX"
},
{
"expr": "rate(node_network_transmit_bytes_total{device!=\"lo\"}[5m])",
"legendFormat": "{{instance}}:{{device}} TX"
},
{
"expr": "rate(node_network_receive_errs_total{device!=\"lo\"}[5m])",
"legendFormat": "{{instance}}:{{device}} RX Errors"
}
]
}
],
"templating": {
"list": [
{
"name": "cluster",
"type": "query",
"query": "label_values(up, cluster)",
"refresh": 1
}
]
}
}
}

{
"dashboard": {
"id": null,
"title": "Cloud Native Data Lake House - Namespace Details",
"tags": ["kubernetes", "namespace", "minio", "aistor"],
"style": "dark",
"timezone": "browser",
"refresh": "30s",
"time": {
"from": "now-1h",
"to": "now"
},
"panels": [
{
"id": 1,
"title": "Namespace Overview",
"type": "stat",
"gridPos": {"h": 4, "w": 24, "x": 0, "y": 0},
"targets": [
{
"expr": "count(kube_pod_info{namespace=\"$namespace\"})",
"legendFormat": "Total Pods"
},
{
"expr": "count(kube_pod_status_phase{namespace=\"$namespace\", phase=\"Running\"})",
"legendFormat": "Running Pods"
},
{
"expr": "count(kube_deployment_status_replicas{namespace=\"$namespace\"})",
"legendFormat": "Deployments"
},
{
"expr": "count(kube_service_info{namespace=\"$namespace\"})",
"legendFormat": "Services"
}
]
},
{
"id": 2,
"title": "MinIO Namespace Performance",
"type": "timeseries",
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 4},
"targets": [
{
"expr": "sum(rate(minio_http_requests_total{job=~\".*$namespace.*\"}[5m])) by (api)",
"legendFormat": "{{api}} Requests/sec"
},
{
"expr": "sum(rate(minio_s3_requests_total{job=~\".*$namespace.*\"}[5m])) by (api)",
"legendFormat": "S3 {{api}} Requests/sec"
},
{
"expr": "histogram_quantile(0.95, sum(rate(minio_s3_requests_duration_seconds_bucket{job=~\".*$namespace.*\"}[5m])) by (le, api))",
"legendFormat": "{{api}} 95th Latency"
},
{
"expr": "sum(rate(minio_s3_requests_errors_total{job=~\".*$namespace.*\"}[5m])) by (api)",
"legendFormat": "{{api}} Errors/sec"
}
]
},
{
"id": 3,
"title": "MinIO Storage Metrics",
"type": "timeseries",
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 4},
"targets": [
{
"expr": "sum(minio_bucket_usage_total_bytes{job=~\".*$namespace.*\"}) by (bucket)",
"legendFormat": "{{bucket}} Usage"
},
{
"expr": "sum(minio_bucket_objects_count{job=~\".*$namespace.*\"}) by (bucket)",
"legendFormat": "{{bucket}} Objects"
},
{
"expr": "sum(rate(minio_s3_requests_ttfb_seconds_bucket{job=~\".*$namespace.*\"}[5m]))",
"legendFormat": "Time to First Byte"
},
{
"expr": "sum(rate(minio_inter_node_traffic_sent_bytes{job=~\".*$namespace.*\"}[5m]))",
"legendFormat": "Inter-node Traffic Sent"
}
]
},
{
"id": 4,
"title": "Pod CPU Usage",
"type": "timeseries",
"gridPos": {"h": 8, "w": 8, "x": 0, "y": 12},
"targets": [
{
"expr": "sum(rate(container_cpu_usage_seconds_total{namespace=\"$namespace\", container!=\"POD\", container!=\"\"}[5m])) by (pod)",
"legendFormat": "{{pod}}"
},
{
"expr": "sum(kube_pod_container_resource_requests{namespace=\"$namespace\", resource=\"cpu\"}) by (pod)",
"legendFormat": "{{pod}} Request"
},
{
"expr": "sum(kube_pod_container_resource_limits{namespace=\"$namespace\", resource=\"cpu\"}) by (pod)",
"legendFormat": "{{pod}} Limit"
}
]
},
{
"id": 5,
"title": "Pod Memory Usage",
"type": "timeseries",
"gridPos": {"h": 8, "w": 8, "x": 8, "y": 12},
"targets": [
{
"expr": "sum(container_memory_working_set_bytes{namespace=\"$namespace\", container!=\"POD\", container!=\"\"}) by (pod)",
"legendFormat": "{{pod}} Memory"
},
{
"expr": "sum(kube_pod_container_resource_requests{namespace=\"$namespace\", resource=\"memory\"}) by (pod)",
"legendFormat": "{{pod}} Request"
},
{
"expr": "sum(kube_pod_container_resource_limits{namespace=\"$namespace\", resource=\"memory\"}) by (pod)",
"legendFormat": "{{pod}} Limit"
},
{
"expr": "sum(container_memory_cache{namespace=\"$namespace\", container!=\"POD\", container!=\"\"}) by (pod)",
"legendFormat": "{{pod}} Cache"
}
]
},
{
"id": 6,
"title": "Pod Network I/O",
"type": "timeseries",
"gridPos": {"h": 8, "w": 8, "x": 16, "y": 12},
"targets": [
{
"expr": "sum(rate(container_network_receive_bytes_total{namespace=\"$namespace\"}[5m])) by (pod)",
"legendFormat": "{{pod}} RX"
},
{
"expr": "sum(rate(container_network_transmit_bytes_total{namespace=\"$namespace\"}[5m])) by (pod)",
"legendFormat": "{{pod}} TX"
},
{
"expr": "sum(rate(container_network_receive_packets_dropped_total{namespace=\"$namespace\"}[5m])) by (pod)",
"legendFormat": "{{pod}} RX Drops"
},
{
"expr": "sum(rate(container_network_transmit_packets_dropped_total{namespace=\"$namespace\"}[5m])) by (pod)",
"legendFormat": "{{pod}} TX Drops"
}
]
},
{
"id": 7,
"title": "Cilium Network Policies",
"type": "timeseries",
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 20},
"targets": [
{
"expr": "sum(cilium_policy_verdict_total{k8s_namespace=\"$namespace\"}) by (verdict)",
"legendFormat": "{{verdict}}"
},
{
"expr": "sum(rate(cilium_drop_count_total{k8s_namespace=\"$namespace\"}[5m])) by (reason)",
"legendFormat": "Drops: {{reason}}"
},
{
"expr": "sum(rate(cilium_forward_count_total{k8s_namespace=\"$namespace\"}[5m]))",
"legendFormat": "Forwarded Packets"
},
{
"expr": "cilium_identity_count{k8s_namespace=\"$namespace\"}",
"legendFormat": "Identities"
}
]
},
{
"id": 8,
"title": "API Server Namespace Requests",
"type": "timeseries",
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 20},
"targets": [
{
"expr": "sum(rate(apiserver_request_total{namespace=\"$namespace\"}[5m])) by (verb, resource)",
"legendFormat": "{{verb}} {{resource}}"
},
{
"expr": "histogram_quantile(0.95, sum(rate(apiserver_request_duration_seconds_bucket{namespace=\"$namespace\"}[5m])) by (le, verb))",
"legendFormat": "{{verb}} 95th Latency"
},
{
"expr": "sum(rate(apiserver_request_total{namespace=\"$namespace\", code=~\"4..\"}[5m]))",
"legendFormat": "4xx Errors"
},
{
"expr": "sum(rate(apiserver_request_total{namespace=\"$namespace\", code=~\"5..\"}[5m]))",
"legendFormat": "5xx Errors"
}
]
},
{
"id": 9,
"title": "Pod Storage I/O",
"type": "timeseries",
"gridPos": {"h": 8, "w": 8, "x": 0, "y": 28},
"targets": [
{
"expr": "sum(rate(container_fs_reads_bytes_total{namespace=\"$namespace\", container!=\"POD\", container!=\"\"}[5m])) by (pod)",
"legendFormat": "{{pod}} Read"
},
{
"expr": "sum(rate(container_fs_writes_bytes_total{namespace=\"$namespace\", container!=\"POD\", container!=\"\"}[5m])) by (pod)",
"legendFormat": "{{pod}} Write"
},
{
"expr": "sum(container_fs_usage_bytes{namespace=\"$namespace\", container!=\"POD\", container!=\"\"}) by (pod)",
"legendFormat": "{{pod}} Usage"
}
]
},
{
"id": 10,
"title": "DNS Resolution Performance",
"type": "timeseries",
"gridPos": {"h": 8, "w": 8, "x": 8, "y": 28},
"targets": [
{
"expr": "sum(rate(coredns_dns_requests_total{k8s_namespace=\"$namespace\"}[5m])) by (zone, type)",
"legendFormat": "{{zone}} {{type}}"
},
{
"expr": "sum(rate(coredns_dns_responses_total{k8s_namespace=\"$namespace\", rcode!=\"NOERROR\"}[5m])) by (rcode)",
"legendFormat": "Error: {{rcode}}"
},
{
"expr": "histogram_quantile(0.95, sum(rate(coredns_dns_request_duration_seconds_bucket{k8s_namespace=\"$namespace\"}[5m])) by (le))",
"legendFormat": "95th Latency"
}
]
},
{
"id": 11,
"title": "Pod Restart and Status",
"type": "timeseries",
"gridPos": {"h": 8, "w": 8, "x": 16, "y": 28},
"targets": [
{
"expr": "increase(kube_pod_container_status_restarts_total{namespace=\"$namespace\"}[5m])",
"legendFormat": "{{pod}}/{{container}} Restarts"
},
{
"expr": "kube_pod_status_phase{namespace=\"$namespace\"}",
"legendFormat": "{{pod}} {{phase}}"
},
{
"expr": "kube_pod_container_status_ready{namespace=\"$namespace\"}",
"legendFormat": "{{pod}}/{{container}} Ready"
},
{
"expr": "time() - kube_pod_created{namespace=\"$namespace\"}",
"legendFormat": "{{pod}} Age"
}
]
}
],
"templating": {
"list": [
{
"name": "namespace",
"type": "query",
"query": "label_values(kube_namespace_status_phase, namespace)",
"refresh": 1,
"includeAll": false,
"multi": false
},
{
"name": "pod",
"type": "query",
"query": "label_values(kube_pod_info{namespace=\"$namespace\"}, pod)",
"refresh": 1,
"includeAll": true,
"multi": true
}
]
}
}
}

{
"dashboard": {
"id": null,
"title": "Cloud Native Data Lake House - Node Details",
"tags": ["kubernetes", "node", "minio", "aistor", "infrastructure"],
"style": "dark",
"timezone": "browser",
"refresh": "30s",
"time": {
"from": "now-1h",
"to": "now"
},
"panels": [
{
"id": 1,
"title": "Node Status Overview",
"type": "stat",
"gridPos": {"h": 4, "w": 24, "x": 0, "y": 0},
"targets": [
{
"expr": "kube_node_status_condition{node=\"$node\", condition=\"Ready\", status=\"true\"}",
"legendFormat": "Node Ready"
},
{
"expr": "count(kube_pod_info{node=\"$node\"})",
"legendFormat": "Total Pods"
},
{
"expr": "count(kube_pod_status_phase{node=\"$node\", phase=\"Running\"})",
"legendFormat": "Running Pods"
},
{
"expr": "up{instance=~\".*$node.*\"}",
"legendFormat": "Node Up"
}
],
"fieldConfig": {
"defaults": {
"color": {"mode": "thresholds"},
"thresholds": {
"steps": [
{"color": "red", "value": null},
{"color": "green", "value": 1}
]
}
}
}
},
{
"id": 2,
"title": "Node CPU Metrics",
"type": "timeseries",
"gridPos": {"h": 8, "w": 8, "x": 0, "y": 4},
"targets": [
{
"expr": "100 - (avg(irate(node_cpu_seconds_total{instance=~\".*$node.*\", mode=\"idle\"}[5m])) * 100)",
"legendFormat": "CPU Usage %"
},
{
"expr": "avg(irate(node_cpu_seconds_total{instance=~\".*$node.*\", mode=\"system\"}[5m])) * 100",
"legendFormat": "System CPU %"
},
{
"expr": "avg(irate(node_cpu_seconds_total{instance=~\".*$node.*\", mode=\"user\"}[5m])) * 100",
"legendFormat": "User CPU %"
},
{
"expr": "avg(irate(node_cpu_seconds_total{instance=~\".*$node.*\", mode=\"iowait\"}[5m])) * 100",
"legendFormat": "IO Wait %"
},
{
"expr": "node_load1{instance=~\".*$node.*\"}",
"legendFormat": "Load 1m"
}
]
},
{
"id": 3,
"title": "Node Memory Metrics",
"type": "timeseries",
"gridPos": {"h": 8, "w": 8, "x": 8, "y": 4},
"targets": [
{
"expr": "node_memory_MemTotal_bytes{instance=~\".*$node.*\"}",
"legendFormat": "Total Memory"
},
{
"expr": "node_memory_MemAvailable_bytes{instance=~\".*$node.*\"}",
"legendFormat": "Available Memory"
},
{
"expr": "node_memory_MemTotal_bytes{instance=~\".*$node.*\"} - node_memory_MemAvailable_bytes{instance=~\".*$node.*\"}",
"legendFormat": "Used Memory"
},
{
"expr": "node_memory_Cached_bytes{instance=~\".*$node.*\"} + node_memory_Buffers_bytes{instance=~\".*$node.*\"}",
"legendFormat": "Cache + Buffers"
},
{
"expr": "node_memory_SwapTotal_bytes{instance=~\".*$node.*\"} - node_memory_SwapFree_bytes{instance=~\".*$node.*\"}",
"legendFormat": "Swap Used"
}
]
},
{
"id": 4,
"title": "Node Network Metrics",
"type": "timeseries",
"gridPos": {"h": 8, "w": 8, "x": 16, "y": 4},
"targets": [
{
"expr": "rate(node_network_receive_bytes_total{instance=~\".*$node.*\", device!=\"lo\"}[5m])",
"legendFormat": "{{device}} RX"
},
{
"expr": "rate(node_network_transmit_bytes_total{instance=~\".*$node.*\", device!=\"lo\"}[5m])",
"legendFormat": "{{device}} TX"
},
{
"expr": "rate(node_network_receive_packets_total{instance=~\".*$node.*\", device!=\"lo\"}[5m])",
"legendFormat": "{{device}} RX Packets"
},
{
"expr": "rate(node_network_transmit_packets_total{instance=~\".*$node.*\", device!=\"lo\"}[5m])",
"legendFormat": "{{device}} TX Packets"
},
{
"expr": "rate(node_network_receive_errs_total{instance=~\".*$node.*\", device!=\"lo\"}[5m])",
"legendFormat": "{{device}} RX Errors"
}
]
},
{
"id": 5,
"title": "Node Disk I/O Metrics",
"type": "timeseries",
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 12},
"targets": [
{
"expr": "rate(node_disk_read_bytes_total{instance=~\".*$node.*\"}[5m])",
"legendFormat": "{{device}} Read"
},
{
"expr": "rate(node_disk_written_bytes_total{instance=~\".*$node.*\"}[5m])",
"legendFormat": "{{device}} Write"
},
{
"expr": "rate(node_disk_reads_completed_total{instance=~\".*$node.*\"}[5m])",
"legendFormat": "{{device}} Read IOPS"
},
{
"expr": "rate(node_disk_writes_completed_total{instance=~\".*$node.*\"}[5m])",
"legendFormat": "{{device}} Write IOPS"
},
{
"expr": "rate(node_disk_read_time_seconds_total{instance=~\".*$node.*\"}[5m]) / rate(node_disk_reads_completed_total{instance=~\".*$node.*\"}[5m])",
"legendFormat": "{{device}} Read Latency"
}
]
},
{
"id": 6,
"title": "Node Filesystem Usage",
"type": "timeseries",
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 12},
"targets": [
{
"expr": "node_filesystem_size_bytes{instance=~\".*$node.*\", fstype!=\"tmpfs\"}",
"legendFormat": "{{mountpoint}} Total"
},
{
"expr": "node_filesystem_avail_bytes{instance=~\".*$node.*\", fstype!=\"tmpfs\"}",
"legendFormat": "{{mountpoint}} Available"
},
{
"expr": "node_filesystem_size_bytes{instance=~\".*$node.*\", fstype!=\"tmpfs\"} - node_filesystem_avail_bytes{instance=~\".*$node.*\", fstype!=\"tmpfs\"}",
"legendFormat": "{{mountpoint}} Used"
},
{
"expr": "(1 - (node_filesystem_avail_bytes{instance=~\".*$node.*\", fstype!=\"tmpfs\"} / node_filesystem_size_bytes{instance=~\".*$node.*\", fstype!=\"tmpfs\"})) * 100",
"legendFormat": "{{mountpoint}} Usage %"
}
]
},
{
"id": 7,
"title": "MinIO Node Performance",
"type": "timeseries",
"gridPos": {"h": 8, "w": 8, "x": 0, "y": 20},
"targets": [
{
"expr": "minio_node_drive_online_total{server=~\".*$node.*\"}",
"legendFormat": "Online Drives"
},
{
"expr": "minio_node_drive_offline_total{server=~\".*$node.*\"}",
"legendFormat": "Offline Drives"
},
{
"expr": "rate(minio_node_drive_total_bytes{server=~\".*$node.*\"}[5m])",
"legendFormat": "Drive Throughput"
},
{
"expr": "minio_node_drive_used_bytes{server=~\".*$node.*\"}",
"legendFormat": "Drive Used Space"
}
]
},
{
"id": 8,
"title": "Cilium Node Metrics",
"type": "timeseries",
"gridPos": {"h": 8, "w": 8, "x": 8, "y": 20},
"targets": [
{
"expr": "cilium_datapath_conntrack_gc_runs_total{k8s_node=\"$node\"}",
"legendFormat": "Conntrack GC Runs"
},
{
"expr": "cilium_endpoint_count{k8s_node=\"$node\"}",
"legendFormat": "Endpoints"
},
{
"expr": "cilium_identity_count{k8s_node=\"$node\"}",
"legendFormat": "Identities"
},
{
"expr": "rate(cilium_drop_count_total{k8s_node=\"$node\"}[5m])",
"legendFormat": "Packet Drops"
},
{
"expr": "cilium_bpf_map_ops_total{k8s_node=\"$node\"}",
"legendFormat": "BPF Map Operations"
}
]
},
{
"id": 9,
"title": "Container Runtime Metrics",
"type": "timeseries",
"gridPos": {"h": 8, "w": 8, "x": 16, "y": 20},
"targets": [
{
"expr": "rate(container_cpu_usage_seconds_total{id=\"/\", instance=~\".*$node.*\"}[5m])",
"legendFormat": "Container CPU Usage"
},
{
"expr": "container_memory_working_set_bytes{id=\"/\", instance=~\".*$node.*\"}",
"legendFormat": "Container Memory"
},
{
"expr": "container_fs_usage_bytes{id=\"/\", instance=~\".*$node.*\"}",
"legendFormat": "Container FS Usage"
},
{
"expr": "rate(container_network_receive_bytes_total{id=\"/\", instance=~\".*$node.*\"}[5m])",
"legendFormat": "Container Network RX"
}
]
},
{
"id": 10,
"title": "Node Pod Resource Usage",
"type": "timeseries",
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 28},
"targets": [
{
"expr": "sum(rate(container_cpu_usage_seconds_total{node=\"$node\", container!=\"POD\", container!=\"\"}[5m])) by (pod)",
"legendFormat": "{{pod}} CPU"
},
{
"expr": "sum(container_memory_working_set_bytes{node=\"$node\", container!=\"POD\", container!=\"\"}) by (pod)",
"legendFormat": "{{pod}} Memory"
},
{
"expr": "sum(rate(container_network_receive_bytes_total{node=\"$node\"}[5m])) by (pod)",
"legendFormat": "{{pod}} Net RX"
},
{
"expr": "sum(rate(container_network_transmit_bytes_total{node=\"$node\"}[5m])) by (pod)",
"legendFormat": "{{pod}} Net TX"
}
]
},
{
"id": 11,
"title": "Node System Metrics",
"type": "timeseries",
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 28},
"targets": [
{
"expr": "rate(node_context_switches_total{instance=~\".*$node.*\"}[5m])",
"legendFormat": "Context Switches/sec"
},
{
"expr": "rate(node_interrupts_total{instance=~\".*$node.*\"}[5m])",
"legendFormat": "Interrupts/sec"
},
{
"expr": "node_procs_running{instance=~\".*$node.*\"}",
"legendFormat": "Running Processes"
},
{
"expr": "node_procs_blocked{instance=~\".*$node.*\"}",
"legendFormat": "Blocked Processes"
},
{
"expr": "rate(node_forks_total{instance=~\".*$node.*\"}[5m])",
"legendFormat": "Process Forks/sec"
}
]
},
{
"id": 12,
"title": "DNS Performance on Node",
"type": "timeseries",
"gridPos": {"h": 8, "w": 8, "x": 0, "y": 36},
"targets": [
{
"expr": "sum(rate(coredns_dns_requests_total{k8s_node=\"$node\"}[5m])) by (zone, type)",
"legendFormat": "{{zone}} {{type}} Requests"
},
{
"expr": "sum(rate(coredns_dns_responses_total{k8s_node=\"$node\", rcode=\"NOERROR\"}[5m])) / sum(rate(coredns_dns_responses_total{k8s_node=\"$node\"}[5m]))",
"legendFormat": "Success Rate"
},
{
"expr": "histogram_quantile(0.95, sum(rate(coredns_dns_request_duration_seconds_bucket{k8s_node=\"$node\"}[5m])) by (le))",
"legendFormat": "95th Latency"
}
]
},
{
"id": 13,
"title": "NodeLocalDNS on Node",
"type": "timeseries",
"gridPos": {"h": 8, "w": 8, "x": 8, "y": 36},
"targets": [
{
"expr": "sum(rate(coredns_dns_requests_total{job=\"node-local-dns\", k8s_node=\"$node\"}[5m]))",
"legendFormat": "Local DNS Requests"
},
{
"expr": "sum(rate(coredns_forward_requests_total{job=\"node-local-dns\", k8s_node=\"$node\"}[5m]))",
"legendFormat": "Forwarded Requests"
},
{
"expr": "sum(rate(coredns_cache_hits_total{job=\"node-local-dns\", k8s_node=\"$node\"}[5m])) / sum(rate(coredns_dns_requests_total{job=\"node-local-dns\", k8s_node=\"$node\"}[5m]))",
"legendFormat": "Cache Hit Rate"
},
{
"expr": "coredns_cache_entries{job=\"node-local-dns\", k8s_node=\"$node\"}",
"legendFormat": "Cache Entries"
}
]
},
{
"id": 14,
"title": "ETCD Client Metrics on Node",
"type": "timeseries",
"gridPos": {"h": 8, "w": 8, "x": 16, "y": 36},
"targets": [
{
"expr": "sum(rate(etcd_client_requests_total{k8s_node=\"$node\"}[5m])) by (operation)",
"legendFormat": "{{operation}} Requests"
},
{
"expr": "histogram_quantile(0.95, sum(rate(etcd_request_duration_seconds_bucket{k8s_node=\"$node\"}[5m])) by (le, operation))",
"legendFormat": "{{operation}} 95th Latency"
},
{
"expr": "sum(rate(etcd_client_requests_total{k8s_node=\"$node\", code!=\"200\"}[5m]))",
"legendFormat": "Failed Requests"
}
]
},
{
"id": 15,
"title": "Hardware Health",
"type": "stat",
"gridPos": {"h": 6, "w": 24, "x": 0, "y": 44},
"targets": [
{
"expr": "node_hwmon_temp_celsius{instance=~\".*$node.*\"}",
"legendFormat": "{{chip}} {{sensor}} Temperature"
},
{
"expr": "node_cooling_device_cur_state{instance=~\".*$node.*\"}",
"legendFormat": "{{name}} Cooling State"
},
{
"expr": "node_power_supply_online{instance=~\".*$node.*\"}",
"legendFormat": "{{supply}} Power Supply"
},
{
"expr": "node_entropy_available_bits{instance=~\".*$node.*\"}",
"legendFormat": "Entropy Available"
}
],
"fieldConfig": {
"defaults": {
"color": {"mode": "thresholds"},
"thresholds": {
"steps": [
{"color": "green", "value": null},
{"color": "yellow", "value": 70},
{"color": "red", "value": 85}
]
}
}
}
}
],
"templating": {
"list": [
{
"name": "node",
"type": "query",
"query": "label_values(kube_node_info, node)",
"refresh": 1,
"includeAll": false,
"multi": false
},
{
"name": "device",
"type": "query",
"query": "label_values(node_disk_reads_completed_total{instance=~\".*$node.*\"}, device)",
"refresh": 1,
"includeAll": true,
"multi": true
},
{
"name": "mountpoint",
"type": "query",
"query": "label_values(node_filesystem_size_bytes{instance=~\".*$node.*\", fstype!=\"tmpfs\"}, mountpoint)",
"refresh": 1,
"includeAll": true,
"multi": true
}
]
}
}
}

0개의 댓글