This post is a write-up of what I studied and practiced in week 5 of the AEWS (Amazon EKS Workshop Study) 3rd cohort, run by gasida. Week 5 covers hands-on practice with the various EKS autoscaling technologies.
Source: excerpt from Kubernetes Patterns, 2nd edition

# Download the YAML template
curl -O https://s3.ap-northeast-2.amazonaws.com/cloudformation.cloudneta.net/K8S/myeks-5week.yaml
# Set variables
CLUSTER_NAME=myeks-sejkim
SSHKEYNAME=kp-sejkim
MYACCESSKEY=<IAM User access key>
MYSECRETKEY=<IAM User secret key>
# Deploy the CloudFormation stack
aws cloudformation deploy --template-file myeks-5week.yaml --stack-name $CLUSTER_NAME --parameter-overrides KeyName=$SSHKEYNAME SgIngressSshCidr=$(curl -s ipinfo.io/ip)/32 MyIamUserAccessKeyID=$MYACCESSKEY MyIamUserSecretAccessKey=$MYSECRETKEY ClusterBaseName=$CLUSTER_NAME --region ap-northeast-2
# After the stack is deployed, print the public IP of the operator EC2 instance
aws cloudformation describe-stacks --stack-name $CLUSTER_NAME --query 'Stacks[*].Outputs[0].OutputValue' --output text
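If these steps are run from separate shells or scripts, you can block until the stack has actually finished before reading its outputs; a small sketch using the standard CloudFormation waiter (optional, since `aws cloudformation deploy` itself waits for completion):

# optional: wait for the stack to finish before querying outputs
aws cloudformation wait stack-create-complete --stack-name $CLUSTER_NAME --region ap-northeast-2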




# AWS LoadBalancerController
helm repo add eks https://aws.github.io/eks-charts
helm install aws-load-balancer-controller eks/aws-load-balancer-controller -n kube-system --set clusterName=$CLUSTER_NAME \
--set serviceAccount.create=false --set serviceAccount.name=aws-load-balancer-controller
# ExternalDNS
export MyDomain=ksj7279.click
export MyDnzHostedZoneId=$(aws route53 list-hosted-zones-by-name --dns-name "$MyDomain." --query "HostedZones[0].Id" --output text)
export CERT_ARN=$(aws acm list-certificates --query "CertificateSummaryList[?contains(DomainName, '${MyDomain}')].CertificateArn" --output text)
echo $MyDomain $MyDnzHostedZoneId $CERT_ARN
curl -s https://raw.githubusercontent.com/gasida/PKOS/main/aews/externaldns.yaml | MyDomain=$MyDomain MyDnzHostedZoneId=$MyDnzHostedZoneId envsubst | kubectl apply -f -
# Create the gp3 StorageClass
cat <<EOF | kubectl apply -f -
kind: StorageClass
apiVersion: storage.k8s.io/v1
metadata:
  name: gp3
  annotations:
    storageclass.kubernetes.io/is-default-class: "true"
allowVolumeExpansion: true
provisioner: ebs.csi.aws.com
volumeBindingMode: WaitForFirstConsumer
parameters:
  type: gp3
  allowAutoIOPSPerGBIncrease: 'true'
  encrypted: 'true'
  fsType: xfs # default is ext4
EOF
kubectl get sc
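To sanity-check the new default class, a throwaway PVC such as the one below can be applied; with WaitForFirstConsumer the EBS volume is only provisioned once a pod mounting the claim is scheduled (the claim name is illustrative, not part of the lab):

cat <<EOF | kubectl apply -f -
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: gp3-test-claim   # illustrative name for a quick test
spec:
  accessModes: ["ReadWriteOnce"]
  storageClassName: gp3
  resources:
    requests:
      storage: 4Gi
EOF
kubectl get pvc gp3-test-claim
kubectl delete pvc gp3-test-claim   # clean up; nothing was provisioned since no pod used the claim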
# kube-ops-view
helm repo add geek-cookbook https://geek-cookbook.github.io/charts/
helm install kube-ops-view geek-cookbook/kube-ops-view --version 1.2.2 --set service.main.type=ClusterIP --set env.TZ="Asia/Seoul" --namespace kube-system
# Ingress for kube-ops-view : the group annotation lets multiple Ingresses share a single ALB
echo $CERT_ARN
cat <<EOF | kubectl apply -f -
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  annotations:
    alb.ingress.kubernetes.io/certificate-arn: $CERT_ARN
    alb.ingress.kubernetes.io/group.name: study
    alb.ingress.kubernetes.io/listen-ports: '[{"HTTPS":443}, {"HTTP":80}]'
    alb.ingress.kubernetes.io/load-balancer-name: $CLUSTER_NAME-ingress-alb
    alb.ingress.kubernetes.io/scheme: internet-facing
    alb.ingress.kubernetes.io/ssl-redirect: "443"
    alb.ingress.kubernetes.io/success-codes: 200-399
    alb.ingress.kubernetes.io/target-type: ip
  labels:
    app.kubernetes.io/name: kubeopsview
  name: kubeopsview
  namespace: kube-system
spec:
  ingressClassName: alb
  rules:
  - host: kubeopsview.$MyDomain
    http:
      paths:
      - backend:
          service:
            name: kube-ops-view
            port:
              number: 8080 # name: http
        path: /
        pathType: Prefix
EOF
# Check the installed pods
kubectl get pods -n kube-system
NAME READY STATUS RESTARTS AGE
aws-load-balancer-controller-86ff7688d-7dgt9 1/1 Running 0 18m
aws-load-balancer-controller-86ff7688d-9g5gr 1/1 Running 0 18m
aws-node-d9jgf 2/2 Running 0 31m
aws-node-gghkv 2/2 Running 0 35m
aws-node-lgjn7 2/2 Running 0 33m
coredns-86f5954566-sk46f 1/1 Running 0 31m
coredns-86f5954566-slt5h 1/1 Running 0 30m
ebs-csi-controller-844b978c49-5s8pn 6/6 Running 0 34m
ebs-csi-controller-844b978c49-ftm76 6/6 Running 0 31m
ebs-csi-node-fd24v 3/3 Running 0 35m
ebs-csi-node-qx8hr 3/3 Running 0 33m
ebs-csi-node-rpdp6 3/3 Running 0 31m
external-dns-7dd89bd9bc-76642 1/1 Running 0 6m26s
kube-ops-view-657dbc6cd8-ttgns 1/1 Running 0 3m17s
kube-proxy-9qtd4 1/1 Running 0 33m
kube-proxy-bhwcd 1/1 Running 0 31m
kube-proxy-rsrxh 1/1 Running 0 35m
metrics-server-6bf5998d9c-7qczk 1/1 Running 0 31m
metrics-server-6bf5998d9c-n856v 1/1 Running 0 30m
# Check Services, Endpoints, and Ingress
kubectl get ingress,svc,ep -n kube-system
NAME CLASS HOSTS ADDRESS PORTS AGE
ingress.networking.k8s.io/kubeopsview alb kubeopsview.ksj7279.click myeks-sejkim-ingress-alb-1439144942.ap-northeast-2.elb.amazonaws.com 80 86s
NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE
service/aws-load-balancer-webhook-service ClusterIP 10.100.92.74 <none> 443/TCP 18m
service/eks-extension-metrics-api ClusterIP 10.100.228.41 <none> 443/TCP 55m
service/kube-dns ClusterIP 10.100.0.10 <none> 53/UDP,53/TCP,9153/TCP 52m
service/kube-ops-view ClusterIP 10.100.89.189 <none> 8080/TCP 3m41s
service/metrics-server ClusterIP 10.100.152.236 <none> 443/TCP 52m
NAME ENDPOINTS AGE
endpoints/aws-load-balancer-webhook-service 192.168.2.171:9443,192.168.3.53:9443 18m
endpoints/eks-extension-metrics-api 172.0.32.0:10443 55m
endpoints/kube-dns 192.168.1.42:53,192.168.2.50:53,192.168.1.42:53 + 3 more... 52m
endpoints/kube-ops-view 192.168.1.164:8080 3m41s
endpoints/metrics-server 192.168.1.178:10251,192.168.3.114:10251 52m
# Print the Kube Ops View URL : it may take a little while before the page responds
echo -e "Kube Ops View URL = https://kubeopsview.$MyDomain/#scale=1.5"
Kube Ops View URL = https://kubeopsview.ksj7279.click/#scale=1.5
open "https://kubeopsview.$MyDomain/#scale=1.5" # macOS

# Add the Helm repo
helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
# Create the values file : unlike the week-4 lab, no PV/PVC (AWS EBS) is used here, to avoid the hassle of cleaning them up
cat <<EOT > monitor-values.yaml
prometheus:
  prometheusSpec:
    scrapeInterval: "15s"
    evaluationInterval: "15s"
    podMonitorSelectorNilUsesHelmValues: false
    serviceMonitorSelectorNilUsesHelmValues: false
    retention: 5d
    retentionSize: "10GiB"

  # Enable vertical pod autoscaler support for prometheus-operator
  verticalPodAutoscaler:
    enabled: true

  ingress:
    enabled: true
    ingressClassName: alb
    hosts:
      - prometheus.$MyDomain
    paths:
      - /*
    annotations:
      alb.ingress.kubernetes.io/scheme: internet-facing
      alb.ingress.kubernetes.io/target-type: ip
      alb.ingress.kubernetes.io/listen-ports: '[{"HTTPS":443}, {"HTTP":80}]'
      alb.ingress.kubernetes.io/certificate-arn: $CERT_ARN
      alb.ingress.kubernetes.io/success-codes: 200-399
      alb.ingress.kubernetes.io/load-balancer-name: myeks-sejkim-ingress-alb
      alb.ingress.kubernetes.io/group.name: study
      alb.ingress.kubernetes.io/ssl-redirect: '443'

grafana:
  defaultDashboardsTimezone: Asia/Seoul
  adminPassword: prom-operator
  defaultDashboardsEnabled: false

  ingress:
    enabled: true
    ingressClassName: alb
    hosts:
      - grafana.$MyDomain
    paths:
      - /*
    annotations:
      alb.ingress.kubernetes.io/scheme: internet-facing
      alb.ingress.kubernetes.io/target-type: ip
      alb.ingress.kubernetes.io/listen-ports: '[{"HTTPS":443}, {"HTTP":80}]'
      alb.ingress.kubernetes.io/certificate-arn: $CERT_ARN
      alb.ingress.kubernetes.io/success-codes: 200-399
      alb.ingress.kubernetes.io/load-balancer-name: myeks-sejkim-ingress-alb
      alb.ingress.kubernetes.io/group.name: study
      alb.ingress.kubernetes.io/ssl-redirect: '443'

kube-state-metrics:
  rbac:
    extraRules:
      - apiGroups: ["autoscaling.k8s.io"]
        resources: ["verticalpodautoscalers"]
        verbs: ["list", "watch"]
  customResourceState:
    enabled: true
    config:
      kind: CustomResourceStateMetrics
      spec:
        resources:
          - groupVersionKind:
              group: autoscaling.k8s.io
              kind: "VerticalPodAutoscaler"
              version: "v1"
            labelsFromPath:
              verticalpodautoscaler: [metadata, name]
              namespace: [metadata, namespace]
              target_api_version: [apiVersion]
              target_kind: [spec, targetRef, kind]
              target_name: [spec, targetRef, name]
            metrics:
              - name: "vpa_containerrecommendations_target"
                help: "VPA container recommendations for memory."
                each:
                  type: Gauge
                  gauge:
                    path: [status, recommendation, containerRecommendations]
                    valueFrom: [target, memory]
                    labelsFromPath:
                      container: [containerName]
                commonLabels:
                  resource: "memory"
                  unit: "byte"
              - name: "vpa_containerrecommendations_target"
                help: "VPA container recommendations for cpu."
                each:
                  type: Gauge
                  gauge:
                    path: [status, recommendation, containerRecommendations]
                    valueFrom: [target, cpu]
                    labelsFromPath:
                      container: [containerName]
                commonLabels:
                  resource: "cpu"
                  unit: "core"
  selfMonitor:
    enabled: true

alertmanager:
  enabled: false
defaultRules:
  create: false
kubeControllerManager:
  enabled: false
kubeEtcd:
  enabled: false
kubeScheduler:
  enabled: false
prometheus-windows-exporter:
  prometheus:
    monitor:
      enabled: false
EOT
cat monitor-values.yaml
# Deploy with Helm
helm install kube-prometheus-stack prometheus-community/kube-prometheus-stack --version 69.3.1 \
-f monitor-values.yaml --create-namespace --namespace monitoring
# Check the release values
helm get values -n monitoring kube-prometheus-stack
# No PV/PVC is used
kubectl get pv,pvc -A
kubectl df-pv
# Access the Prometheus web UI
echo -e "https://prometheus.$MyDomain"
open "https://prometheus.$MyDomain" # macOS
# Access the Grafana web UI : admin / prom-operator
echo -e "https://grafana.$MyDomain"
open "https://grafana.$MyDomain" # macOS
#
kubectl get targetgroupbindings.elbv2.k8s.aws -A
# Check the details
kubectl get pod -n monitoring -l app.kubernetes.io/name=kube-state-metrics
NAME READY STATUS RESTARTS AGE
kube-prometheus-stack-kube-state-metrics-5674c7ddd8-ssvsc 1/1 Running 0 2m23s
kubectl describe pod -n monitoring -l app.kubernetes.io/name=kube-state-metrics
...
Service Account: kube-prometheus-stack-kube-state-metrics
...
Args:
--port=8080
--resources=certificatesigningrequests,configmaps,cronjobs,daemonsets,deployments,endpoints,horizontalpodautoscalers,ingresses,jobs,leases,limitranges,mutatingwebhookconfigurations,namespaces,networkpolicies,nodes,persistentvolumeclaims,persistentvolumes,poddisruptionbudgets,pods,replicasets,replicationcontrollers,resourcequotas,secrets,services,statefulsets,storageclasses,validatingwebhookconfigurations,volumeattachments
--custom-resource-state-config-file=/etc/customresourcestate/config.yaml
...
Volumes:
customresourcestate-config:
Type: ConfigMap (a volume populated by a ConfigMap)
Name: kube-prometheus-stack-kube-state-metrics-customresourcestate-config
Optional: false
...
kubectl describe cm -n monitoring kube-prometheus-stack-kube-state-metrics-customresourcestate-config
...
# ClusterRole
kubectl get clusterrole kube-prometheus-stack-kube-state-metrics
kubectl describe clusterrole kube-prometheus-stack-kube-state-metrics
kubectl describe clusterrole kube-prometheus-stack-kube-state-metrics | grep verticalpodautoscalers
verticalpodautoscalers.autoscaling.k8s.io [] [] [list watch]
# ELB
kubectl get ingress -A
NAMESPACE NAME CLASS HOSTS ADDRESS PORTS AGE
kube-system kubeopsview alb kubeopsview.ksj7279.click myeks-sejkim-ingress-alb-1439144942.ap-northeast-2.elb.amazonaws.com 80 38m
monitoring kube-prometheus-stack-grafana alb grafana.ksj7279.click myeks-sejkim-ingress-alb-1439144942.ap-northeast-2.elb.amazonaws.com 80 28m
monitoring kube-prometheus-stack-prometheus alb prometheus.ksj7279.click myeks-sejkim-ingress-alb-1439144942.ap-northeast-2.elb.amazonaws.com 80 28m
kubectl get targetgroupbindings.elbv2.k8s.aws -A
NAMESPACE NAME SERVICE-NAME SERVICE-PORT TARGET-TYPE AGE
kube-system k8s-kubesyst-kubeopsv-b711c62eab kube-ops-view 8080 ip 37m
monitoring k8s-monitori-kubeprom-be1ef0cc53 kube-prometheus-stack-prometheus 9090 ip 2m25s
monitoring k8s-monitori-kubeprom-e48cb48220 kube-prometheus-stack-grafana 80 ip 2m25s

It displays the scheduled pod resource requests vs the allocatable capacity on the node.
It does not look at the actual pod resource usage.
For each node it compares the allocatable capacity against the sum of the resource requests of the scheduled pods (containers), not their actual usage. In /pkg/model/pod.go you can see that it returns the sum of the containers' requests; init containers are not included.
https://github.com/awslabs/eks-node-viewer/blob/main/pkg/model/pod.go#L82
// Requested returns the sum of the resources requested by the pod. This doesn't include any init containers as we
// are interested in the steady state usage of the pod
func (p *Pod) Requested() v1.ResourceList {
	p.mu.RLock()
	defer p.mu.RUnlock()
	requested := v1.ResourceList{}
	for _, c := range p.pod.Spec.Containers {
		for rn, q := range c.Resources.Requests {
			existing := requested[rn]
			existing.Add(q)
			requested[rn] = existing
		}
	}
	requested[v1.ResourcePods] = resource.MustParse("1")
	return requested
}
# Install on macOS
brew tap aws/tap
brew install eks-node-viewer
# Install on the operator EC2 : already installed via userdata
yum install golang -y
go install github.com/awslabs/eks-node-viewer/cmd/eks-node-viewer@latest # install takes about 2-3 minutes
# On Windows, install in WSL2 (Ubuntu)
sudo apt install golang-go
go install github.com/awslabs/eks-node-viewer/cmd/eks-node-viewer@latest # install takes about 2-3 minutes
echo 'export PATH="$PATH:/root/go/bin"' >> /etc/profile
# Standard usage
eks-node-viewer
# Display both CPU and Memory Usage
eks-node-viewer --resources cpu,memory
eks-node-viewer --resources cpu,memory --extra-labels eks-node-viewer/node-age
# Display extra labels, e.g. AZ : any node label can be used
eks-node-viewer --extra-labels topology.kubernetes.io/zone
eks-node-viewer --extra-labels kubernetes.io/arch
# Sort by CPU usage in descending order
eks-node-viewer --node-sort=eks-node-viewer/node-cpu-usage=dsc
# Karpenter nodes only
eks-node-viewer --node-selector "karpenter.sh/provisioner-name"
# Specify a particular AWS profile and region
AWS_PROFILE=myprofile AWS_REGION=us-west-2 eks-node-viewer
# Computed labels, usable with --extra-labels
# eks-node-viewer/node-age - Age of the node
eks-node-viewer --extra-labels eks-node-viewer/node-age
eks-node-viewer --extra-labels topology.kubernetes.io/zone,eks-node-viewer/node-age
# eks-node-viewer/node-ephemeral-storage-usage - Ephemeral Storage usage (requests)
eks-node-viewer --extra-labels eks-node-viewer/node-ephemeral-storage-usage
# eks-node-viewer/node-cpu-usage - CPU usage (requests)
eks-node-viewer --extra-labels eks-node-viewer/node-cpu-usage
# eks-node-viewer/node-memory-usage - Memory usage (requests)
eks-node-viewer --extra-labels eks-node-viewer/node-memory-usage
# eks-node-viewer/node-pods-usage - Pod usage (requests)
eks-node-viewer --extra-labels eks-node-viewer/node-pods-usage

FROM php:5-apache
COPY index.php /var/www/html/index.php
RUN chmod a+rx index.php

<?php
$x = 0.0001;
for ($i = 0; $i <= 1000000; $i++) {
  $x += sqrt($x);
}
echo "OK!";
?>
cat << EOF > php-apache.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: php-apache
spec:
  selector:
    matchLabels:
      run: php-apache
  template:
    metadata:
      labels:
        run: php-apache
    spec:
      containers:
      - name: php-apache
        image: registry.k8s.io/hpa-example
        ports:
        - containerPort: 80
        resources:
          limits:
            cpu: 500m
          requests:
            cpu: 200m
---
apiVersion: v1
kind: Service
metadata:
  name: php-apache
  labels:
    run: php-apache
spec:
  ports:
  - port: 80
  selector:
    run: php-apache
EOF
kubectl apply -f php-apache.yaml
# Check
kubectl exec -it deploy/php-apache -- cat /var/www/html/index.php
...
# Monitoring : use two terminals
watch -d 'kubectl get hpa,pod;echo;kubectl top pod;echo;kubectl top node'
kubectl exec -it deploy/php-apache -- top
# [operator EC2] Access the pod IP directly
PODIP=$(kubectl get pod -l run=php-apache -o jsonpath="{.items[0].status.podIP}")
curl -s $PODIP; echo

# Create the HorizontalPodAutoscaler : requests.cpu=200m - scaling algorithm
# Since each pod requests 200 milli-cores by kubectl run, this means an average CPU usage of 100 milli-cores.
cat <<EOF | kubectl apply -f -
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  name: php-apache
spec:
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: php-apache
  minReplicas: 1
  maxReplicas: 10
  metrics:
  - type: Resource
    resource:
      name: cpu
      target:
        averageUtilization: 50
        type: Utilization
EOF
or
kubectl autoscale deployment php-apache --cpu-percent=50 --min=1 --max=10
# Check
kubectl describe hpa
...
Metrics: ( current / target )
resource cpu on pods (as a percentage of request): 0% (1m) / 50%
Min replicas: 1
Max replicas: 10
Deployment pods: 1 current / 1 desired
...
# Check the HPA spec
kubectl get hpa php-apache -o yaml | kubectl neat
spec:
  minReplicas: 1 # [4] and can shrink back down to a minimum of 1 pod
  maxReplicas: 10 # [3] scale out to at most 10 pods
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: php-apache # [1] based on the resource usage of php-apache
  metrics:
  - type: Resource
    resource:
      name: cpu
      target:
        type: Utilization
        averageUtilization: 50 # [2] when CPU utilization goes above 50%
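For reference, the HPA controller derives the desired replica count from the ratio of current to target utilization; a quick worked example with the numbers above (the figures are illustrative, not measured):

# desiredReplicas = ceil( currentReplicas * currentMetricValue / desiredMetricValue )
# e.g. 1 replica using 200m CPU against a 200m request = 100% utilization, target 50%
#      -> ceil(1 * 100 / 50) = 2, so the HPA scales the Deployment to 2 replicas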
# Load loop 1 (hit the pod IP directly) >> confirm scale-out, then stop
while true;do curl -s $PODIP; sleep 0.5; done
# Load loop 2 (hit the service name so requests are spread across pods) >> confirm scale-out (how far does it grow? 7 pods - after reaching 7 the CPU utilization settles at 42~47%), then stop
## >> [scale back down] about 5 minutes after stopping the load, confirm the pod count drops
# Run this in a separate terminal
# so that the load generation continues and you can carry on with the rest of the steps
kubectl run -i --tty load-generator --rm --image=busybox:1.28 --restart=Never -- /bin/sh -c "while sleep 0.01; do wget -q -O- http://php-apache; done"
# Horizontal Pod Autoscaler Status Conditions
kubectl describe hpa
...
Reference: Deployment/php-apache
Metrics: ( current / target )
resource cpu on pods (as a percentage of request): 39% (78m) / 50%
Min replicas: 1
Max replicas: 10
Deployment pods: 7 current / 7 desired
Conditions:
Type Status Reason Message
---- ------ ------ -------
AbleToScale True ScaleDownStabilized recent recommendations were higher than current one, applying the highest recent recommendation
ScalingActive True ValidMetricFound the HPA was able to successfully calculate a replica count from cpu resource utilization (percentage of request)
ScalingLimited False DesiredWithinRange the desired count is within the acceptable range
Events:
Type Reason Age From Message
---- ------ ---- ---- -------
Normal SuccessfulRescale 10m horizontal-pod-autoscaler New size: 2; reason: cpu resource utilization (percentage of request) above target
Normal SuccessfulRescale 10m (x4 over 169m) horizontal-pod-autoscaler New size: 4; reason: cpu resource utilization (percentage of request) above target
Normal SuccessfulRescale 9m53s horizontal-pod-autoscaler New size: 5; reason:
Normal SuccessfulRescale 9m23s horizontal-pod-autoscaler New size: 6; reason: cpu resource utilization (percentage of request) above target
Normal SuccessfulRescale 8m53s (x4 over 168m) horizontal-pod-autoscaler New size: 7; reason: cpu resource utilization (percentage of request) above target
Normal SuccessfulRescale 19m (x2 over 144m) horizontal-pod-autoscaler New size: 6; reason: All metrics below target
Normal SuccessfulRescale 19m (x5 over 153m) horizontal-pod-autoscaler New size: 1; reason: All metrics below target
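The roughly five-minute delay before scale-in comes from the HPA's downscale stabilization window; if you want to tune it per HPA, autoscaling/v2 exposes a behavior field. A minimal sketch (values are illustrative, not the lab's settings):

spec:
  behavior:
    scaleDown:
      stabilizationWindowSeconds: 300   # wait 5 minutes of consistently low metrics before scaling in
      policies:
      - type: Pods
        value: 2                        # remove at most 2 pods per 60s period
        periodSeconds: 60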


KEDA runs three components when installed (https://keda.sh/docs/2.10/concepts/): the keda-operator container, the keda-operator-metrics-apiserver container, and the keda-admission-webhooks container.
kubectl get pod -n keda
NAME READY STATUS RESTARTS AGE
keda-operator-6bdffdc78-5rqnp 1/1 Running 1 (11m ago) 11m
keda-operator-metrics-apiserver-74d844d769-2vrcq 1/1 Running 0 11m
keda-admission-webhooks-86cffccbf5-kmb7v 1/1 Running 0 11m
triggers:
- type: kafka
  metadata:
    bootstrapServers: kafka.svc:9092
    consumerGroup: my-group
    topic: test-topic
    lagThreshold: '5' # Average target value to trigger scaling actions. (Default: 5, Optional)
    activationLagThreshold: '3' # Target value for activating the scaler. Learn more about activation here.
    offsetResetPolicy: latest
    allowIdleConsumers: false
    scaleToZeroOnInvalidOffset: false
    excludePersistentLag: false
    limitToPartitionsWithLag: false
    version: 1.0.0
    partitionLimitation: '1,2,10-20,31'
    sasl: plaintext
    tls: enable
    unsafeSsl: 'false'
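A triggers block like this is embedded in a ScaledObject that points at the workload consuming the topic; a minimal sketch under assumed names (the my-consumer Deployment is hypothetical, the Kafka settings reuse the values above):

apiVersion: keda.sh/v1alpha1
kind: ScaledObject
metadata:
  name: kafka-scaledobject        # hypothetical example name
spec:
  scaleTargetRef:
    name: my-consumer             # hypothetical Deployment that consumes test-topic
  minReplicaCount: 0
  maxReplicaCount: 10
  triggers:
  - type: kafka
    metadata:
      bootstrapServers: kafka.svc:9092
      consumerGroup: my-group
      topic: test-topic
      lagThreshold: "5"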
# Before installing, check the Metrics API served by the existing metrics-server
kubectl get --raw "/apis/metrics.k8s.io" -v=6 | jq
kubectl get --raw "/apis/metrics.k8s.io" | jq
{
  "kind": "APIGroup",
  "apiVersion": "v1",
  "name": "metrics.k8s.io",
  "versions": [
    {
      "groupVersion": "metrics.k8s.io/v1beta1",
      "version": "v1beta1"
    }
  ],
  "preferredVersion": {
    "groupVersion": "metrics.k8s.io/v1beta1",
    "version": "v1beta1"
  }
}
# Install KEDA : the serviceMonitor alone would probably be enough
cat <<EOT > keda-values.yaml
metricsServer:
  useHostNetwork: true

prometheus:
  metricServer:
    enabled: true
    port: 9022
    portName: metrics
    path: /metrics
    serviceMonitor:
      # Enables ServiceMonitor creation for the Prometheus Operator
      enabled: true
    podMonitor:
      # Enables PodMonitor creation for the Prometheus Operator
      enabled: true
  operator:
    enabled: true
    port: 8080
    serviceMonitor:
      # Enables ServiceMonitor creation for the Prometheus Operator
      enabled: true
    podMonitor:
      # Enables PodMonitor creation for the Prometheus Operator
      enabled: true
  webhooks:
    enabled: true
    port: 8020
    serviceMonitor:
      # Enables ServiceMonitor creation for the Prometheus webhooks
      enabled: true
EOT
helm repo add kedacore https://kedacore.github.io/charts
helm repo update
helm install keda kedacore/keda --version 2.16.0 --namespace keda --create-namespace -f keda-values.yaml
NAME: keda
LAST DEPLOYED: Sat Mar 8 15:55:33 2025
NAMESPACE: keda
STATUS: deployed
REVISION: 1
TEST SUITE: None
NOTES:
:::^. .::::^: ::::::::::::::: .:::::::::. .^.
7???~ .^7????~. 7??????????????. :?????????77!^. .7?7.
7???~ ^7???7~. ~!!!!!!!!!!!!!!. :????!!!!7????7~. .7???7.
7???~^7????~. :????: :~7???7. :7?????7.
7???7????!. ::::::::::::. :????: .7???! :7??77???7.
7????????7: 7???????????~ :????: :????: :???7?5????7.
7????!~????^ !77777777777^ :????: :????: ^???7?#P7????7.
7???~ ^????~ :????: :7???! ^???7J#@J7?????7.
7???~ :7???!. :????: .:~7???!. ~???7Y&@#7777????7.
7???~ .7???7: !!!!!!!!!!!!!!! :????7!!77????7^ ~??775@@@GJJYJ?????7.
7???~ .!????^ 7?????????????7. :?????????7!~: !????G@@@@@@@@5??????7:
::::. ::::: ::::::::::::::: .::::::::.. .::::JGGGB@@@&7:::::::::
?@@#~
P@B^
:&G:
!5.
Kubernetes Event-driven Autoscaling (KEDA) - Application autoscaling made simple.
Get started by deploying Scaled Objects to your cluster:
- Information about Scaled Objects : https://keda.sh/docs/latest/concepts/
- Samples: https://github.com/kedacore/samples
Get information about the deployed ScaledObjects:
kubectl get scaledobject [--namespace <namespace>]
Get details about a deployed ScaledObject:
kubectl describe scaledobject <scaled-object-name> [--namespace <namespace>]
Get information about the deployed ScaledObjects:
kubectl get triggerauthentication [--namespace <namespace>]
Get details about a deployed ScaledObject:
kubectl describe triggerauthentication <trigger-authentication-name> [--namespace <namespace>]
Get an overview of the Horizontal Pod Autoscalers (HPA) that KEDA is using behind the scenes:
kubectl get hpa [--all-namespaces] [--namespace <namespace>]
Learn more about KEDA:
- Documentation: https://keda.sh/
- Support: https://keda.sh/support/
- File an issue: https://github.com/kedacore/keda/issues/new/choose
# Verify the KEDA installation
kubectl get crd | grep keda
cloudeventsources.eventing.keda.sh 2025-03-08T06:55:34Z
clustercloudeventsources.eventing.keda.sh 2025-03-08T06:55:34Z
clustertriggerauthentications.keda.sh 2025-03-08T06:55:34Z
scaledjobs.keda.sh 2025-03-08T06:55:34Z
scaledobjects.keda.sh 2025-03-08T06:55:34Z
triggerauthentications.keda.sh 2025-03-08T06:55:34Z
kubectl get all -n keda
NAME READY STATUS RESTARTS AGE
pod/keda-admission-webhooks-86cffccbf5-7cnqj 1/1 Running 0 77s
pod/keda-operator-6bdffdc78-9k582 1/1 Running 1 (67s ago) 77s
pod/keda-operator-metrics-apiserver-74d844d769-wclhh 1/1 Running 0 77s
NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE
service/keda-admission-webhooks ClusterIP 10.100.241.143 <none> 443/TCP,8020/TCP 78s
service/keda-operator ClusterIP 10.100.188.36 <none> 9666/TCP,8080/TCP 78s
service/keda-operator-metrics-apiserver ClusterIP 10.100.20.226 <none> 443/TCP,9022/TCP 78s
NAME READY UP-TO-DATE AVAILABLE AGE
deployment.apps/keda-admission-webhooks 1/1 1 1 78s
deployment.apps/keda-operator 1/1 1 1 78s
deployment.apps/keda-operator-metrics-apiserver 1/1 1 1 78s
NAME DESIRED CURRENT READY AGE
replicaset.apps/keda-admission-webhooks-86cffccbf5 1 1 1 78s
replicaset.apps/keda-operator-6bdffdc78 1 1 1 78s
replicaset.apps/keda-operator-metrics-apiserver-74d844d769 1 1 1 78s
kubectl get validatingwebhookconfigurations keda-admission -o yaml
kubectl get podmonitor,servicemonitors -n keda
NAME AGE
podmonitor.monitoring.coreos.com/keda-operator 116s
podmonitor.monitoring.coreos.com/keda-operator-metrics-apiserver 116s
NAME AGE
servicemonitor.monitoring.coreos.com/keda-admission-webhooks 116s
servicemonitor.monitoring.coreos.com/keda-operator 116s
servicemonitor.monitoring.coreos.com/keda-operator-metrics-apiserver 116s
kubectl get apiservice v1beta1.external.metrics.k8s.io -o yaml
apiVersion: apiregistration.k8s.io/v1
kind: APIService
metadata:
annotations:
meta.helm.sh/release-name: keda
meta.helm.sh/release-namespace: keda
labels:
app.kubernetes.io/component: operator
app.kubernetes.io/instance: keda
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/name: v1beta1.external.metrics.k8s.io
app.kubernetes.io/part-of: keda-operator
app.kubernetes.io/version: 2.16.0
helm.sh/chart: keda-2.16.0
name: v1beta1.external.metrics.k8s.io
spec:
caBundle: LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSURFRENDQWZpZ0F3SUJBZ0lCQURBTkJna3Foa2lHOXcwQkFRc0ZBREFoTVJBd0RnWURWUVFLRXdkTFJVUkIKVDFKSE1RMHdDd1lEVlFRREV3UkxSVVJCTUI0WERUSTFNRE13T0RBMU5UVTBObG9YRFRNMU1ETXdOakEyTlRVMApObG93SVRFUU1BNEdBMVVFQ2hNSFMwVkVRVTlTUnpFTk1Bc0dBMVVFQXhNRVMwVkVRVENDQVNJd0RRWUpLb1pJCmh2Y05BUUVCQlFBRGdnRVBBRENDQVFvQ2dnRUJBTXk5ZTBkL3laRWtCeElRL0F1YSs0cExPTUJFT2ZDazQyWnkKTDVWRVJ4dktiTzBWdmZvMHdYaEx1ZTE0dzVDSEYvZ1RCTTNPMHVnb0RYVE1QbnZQR3NhWXJxUm9RMDJXV1Y0Mgo1MEtKZldZTUZlR0VoTTFLMmExYWVQaFoyUjNIRGQwNjkrdC8rK0luZzBjU3Q3SGR3eUdwTzJmVXl0OHE1UGwvClM2dDdIaklXOTBPejkydHB0a0VoRkZ4bXF3d3VkZ1N0Q0tUdkxMQmhLVUpiK3hrQ0NhbUhaNHU2VjdPT1lQOVYKUXhKcXlPU0xxTkhzZmp2b0NBU3k5YzFHancvYStTcVRMbUMrVTRoSldxOFZBa1FqeG9NeUFwQThNNnJRRVJ5eAppUzFmelpOdk5kWCtPVFZQekRpMFN2eVpDWGdaRGNJcWt1NUdPNXg2N1dJek1HMWswbEVDQXdFQUFhTlRNRkV3CkRnWURWUjBQQVFIL0JBUURBZ0trTUE4R0ExVWRFd0VCL3dRRk1BTUJBZjh3SFFZRFZSME9CQllFRkJZd2JBdlYKbWcxMEFqQTR1SG5jTSsrZDNMU3BNQThHQTFVZEVRUUlNQWFDQkV0RlJFRXdEUVlKS29aSWh2Y05BUUVMQlFBRApnZ0VCQUZXdGZtZ1BjSmFQK0oyWEc0emxHcW43TlMyZUpSWldobGhkdHM2UVhMZWhCKzc2clZQZzhzeHUvaUdWCnRCTXNlbkRDdGF4dnpqdk52Ly9ZL2JOR2xFNUY2ME9VeXdVZEhkdHlENWEwQzZ1VERhenhhRUU5YTlDTVMwSlkKeGM1ZldrTGNHdEN4RnMzWjQ2Q2xId2lwWnJRdnRTbi90cC9OTEhCQ0ZDZGozekdBVUlCdkJ5empjR2FRdFFPOQo1ZzBEU0NRZjJtY2lCeEthbEx1d1duMDdoSDY1MUJhYWFSdTgrR3RGTHdYUlNEZEZjR1k4K2tWeUVWbnFPWWM0CjZ2K2lPdmx6VEhGeHg5WUNmNG5FR1dmZ1lSTzV4cmNTU3dIUTZ1VEVXWXJsTHhsSm5JS1dtTU92RytuamMvNkkKZ1dzRmJGRis4b0dqVmZaY1JOQUpDY1h0NzhNPQotLS0tLUVORCBDRVJUSUZJQ0FURS0tLS0tCg==
group: external.metrics.k8s.io
groupPriorityMinimum: 100
service:
name: keda-operator-metrics-apiserver
namespace: keda
port: 443
version: v1beta1
versionPriority: 100
# CPU/memory still rely on the existing metrics-server; the KEDA metrics server exposes metrics from external event sources (scalers)
## https://keda.sh/docs/2.16/operate/metrics-server/
kubectl get pod -n keda -l app=keda-operator-metrics-apiserver
NAME READY STATUS RESTARTS AGE
keda-operator-metrics-apiserver-74d844d769-wclhh 1/1 Running 0 4m6s
# Querying metrics exposed by KEDA Metrics Server
kubectl get --raw "/apis/external.metrics.k8s.io/v1beta1" | jq
{
  "kind": "APIResourceList",
  "apiVersion": "v1",
  "groupVersion": "external.metrics.k8s.io/v1beta1",
  "resources": [
    {
      "name": "externalmetrics",
      "singularName": "",
      "namespaced": true,
      "kind": "ExternalMetricValueList",
      "verbs": [
        "get"
      ]
    }
  ]
}
# Create the deployment in the keda namespace
kubectl apply -f php-apache.yaml -n keda
kubectl get pod -n keda
# Create a ScaledObject policy : cron
cat <<EOT > keda-cron.yaml
apiVersion: keda.sh/v1alpha1
kind: ScaledObject
metadata:
  name: php-apache-cron-scaled
spec:
  minReplicaCount: 0
  maxReplicaCount: 2  # Specifies the maximum number of replicas to scale up to (defaults to 100).
  pollingInterval: 30 # Specifies how often KEDA should check for scaling events
  cooldownPeriod: 300 # Specifies the cool-down period in seconds after a scaling event
  scaleTargetRef:     # Identifies the Kubernetes deployment or other resource that should be scaled.
    apiVersion: apps/v1
    kind: Deployment
    name: php-apache
  triggers:           # Defines the specific configuration for your chosen scaler, including any required parameters or settings
  - type: cron
    metadata:
      timezone: Asia/Seoul
      start: 00,15,30,45 * * * *
      end: 05,20,35,50 * * * *
      desiredReplicas: "1"
EOT
kubectl apply -f keda-cron.yaml -n keda
# Add the Grafana dashboard : change the namespace variable at the top of the dashboard to keda!
# Import the KEDA dashboard : https://github.com/kedacore/keda/blob/main/config/grafana/keda-dashboard.json
# Monitoring
watch -d 'kubectl get ScaledObject,hpa,pod -n keda'
kubectl get ScaledObject -w
# Check
kubectl get ScaledObject,hpa,pod -n keda
kubectl get hpa -o jsonpath="{.items[0].spec}" -n keda | jq
{
  "maxReplicas": 2,
  "metrics": [
    {
      "external": {
        "metric": {
          "name": "s0-cron-Asia-Seoul-00,15,30,45xxxx-05,20,35,50xxxx",
          "selector": {
            "matchLabels": {
              "scaledobject.keda.sh/name": "php-apache-cron-scaled"
            }
          }
        },
        "target": {
          "averageValue": "1",
          "type": "AverageValue"
        }
      },
      "type": "External"
    }
  ],
  "minReplicas": 1,
  "scaleTargetRef": {
    "apiVersion": "apps/v1",
    "kind": "Deployment",
    "name": "php-apache"
  }
}
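The external metric shown above can also be queried directly through the API that the KEDA metrics server registers, following the pattern from the KEDA metrics-server docs (the URL-encoded label selector picks the ScaledObject):

kubectl get --raw "/apis/external.metrics.k8s.io/v1beta1/namespaces/keda/s0-cron-Asia-Seoul-00,15,30,45xxxx-05,20,35,50xxxx?labelSelector=scaledobject.keda.sh%2Fname%3Dphp-apache-cron-scaled" | jq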
kubectl describe ScaledObject -n keda
Name: php-apache-cron-scaled
Namespace: keda
Labels: scaledobject.keda.sh/name=php-apache-cron-scaled
Annotations: <none>
API Version: keda.sh/v1alpha1
Kind: ScaledObject
Metadata:
Creation Timestamp: 2025-03-08T07:23:14Z
Finalizers:
finalizer.keda.sh
Generation: 1
Resource Version: 136482
UID: 2bd90742-a828-4649-90ef-d11368ebcd93
Spec:
Cooldown Period: 300
Max Replica Count: 2
Min Replica Count: 0
Polling Interval: 30
Scale Target Ref:
API Version: apps/v1
Kind: Deployment
Name: php-apache
Triggers:
Metadata:
Desired Replicas: 1
End: 05,20,35,50 * * * *
Start: 00,15,30,45 * * * *
Timezone: Asia/Seoul
Type: cron
Status:
Conditions:
Message: ScaledObject is defined correctly and is ready for scaling
Reason: ScaledObjectReady
Status: True
Type: Ready
Message: Scaling is not performed because triggers are not active
Reason: ScalerNotActive
Status: False
Type: Active
Message: No fallbacks are active on this scaled object
Reason: NoFallbackFound
Status: False
Type: Fallback
Status: Unknown
Type: Paused
External Metric Names:
s0-cron-Asia-Seoul-00,15,30,45xxxx-05,20,35,50xxxx
Hpa Name: keda-hpa-php-apache-cron-scaled
Last Active Time: 2025-03-08T07:49:45Z
Original Replica Count: 1
Scale Target GVKR:
Group: apps
Kind: Deployment
Resource: deployments
Version: v1
Scale Target Kind: apps/v1.Deployment
Events:
Type Reason Age From Message
---- ------ ---- ---- -------
Normal KEDAScalersStarted 32m keda-operator Scaler cron is built.
Normal KEDAScalersStarted 32m keda-operator Started scalers watch
Normal ScaledObjectReady 32m keda-operator ScaledObject is ready for scaling
Normal KEDAScaleTargetActivated 10m (x2 over 25m) keda-operator Scaled apps/v1.Deployment keda/php-apache from 0 to 1, triggered by cronScaler
Normal KEDAScaleTargetDeactivated 40s (x3 over 32m) keda-operator Deactivated apps/v1.Deployment keda/php-apache from 1 to 0
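As an aside, if you want to temporarily stop the cron scaler from touching the Deployment without deleting the ScaledObject, KEDA supports pausing via an annotation; a sketch with example values:

# pause autoscaling and pin the target at 0 replicas
kubectl annotate scaledobject php-apache-cron-scaled -n keda autoscaling.keda.sh/paused-replicas="0"
# remove the annotation to resume autoscaling
kubectl annotate scaledobject php-apache-cron-scaled -n keda autoscaling.keda.sh/paused-replicas-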
# Delete KEDA, the deployment, and related resources
kubectl delete ScaledObject -n keda php-apache-cron-scaled && kubectl delete deploy php-apache -n keda && helm uninstall keda -n keda
kubectl delete namespace keda




kube_customresource_vpa_containerrecommendations_target
kube_customresource_vpa_containerrecommendations_target{resource="cpu"}
kube_customresource_vpa_containerrecommendations_target{resource="memory"}
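These series come from the kube-state-metrics customResourceState config above; in Prometheus/Grafana they can be compared against actual usage. An illustrative pair of queries (label names may differ slightly in your setup):

# VPA CPU recommendation per container
kube_customresource_vpa_containerrecommendations_target{resource="cpu"}
# actual CPU usage per container, for comparison
sum by (container) (rate(container_cpu_usage_seconds_total{container!=""}[5m]))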
# [operator EC2] Download the code
git clone https://github.com/kubernetes/autoscaler.git # already cloned via userdata
cd ~/autoscaler/vertical-pod-autoscaler/
tree hack
hack
├── api-docs
│ └── config.yaml
├── boilerplate.go.txt
├── convert-alpha-objects.sh
├── deploy-for-e2e-locally.sh
├── deploy-for-e2e.sh
├── dev-deploy-locally.sh
├── e2e
│ ├── Dockerfile.externalmetrics-writer
│ ├── k8s-metrics-server.yaml
│ ├── metrics-pump.yaml
│ ├── prometheus-adapter.yaml
│ ├── prometheus.yaml
│ ├── recommender-externalmetrics-deployment.yaml
│ └── vpa-rbac.diff
├── emit-metrics.py
├── generate-api-docs.sh
├── generate-crd-yaml.sh
├── generate-flags.sh
├── lib
│ └── util.sh
├── local-cluster.md
├── run-e2e-locally.sh
├── run-e2e.sh
├── run-e2e-tests.sh
├── tools.go
├── update-codegen.sh
├── update-kubernetes-deps-in-e2e.sh
├── update-kubernetes-deps.sh
├── verify-codegen.sh
├── verify-vpa-flags.sh
├── vpa-apply-upgrade.sh
├── vpa-down.sh
├── vpa-process-yaml.sh
├── vpa-process-yamls.sh
├── vpa-up.sh
└── warn-obsolete-vpa-objects.sh
3 directories, 34 files
# Check the openssl version
openssl version
OpenSSL 1.0.2k-fips 26 Jan 2017
# Remove openssl 1.0
yum remove openssl -y
# Install openssl 1.1.1 or later and check
yum install openssl11 -y
openssl11 version
OpenSSL 1.1.1zb 11 Feb 2025
# Replace openssl with openssl11 in the cert-generation script
sed -i 's/openssl/openssl11/g' ~/autoscaler/vertical-pod-autoscaler/pkg/admission-controller/gencerts.sh
git status
git config --global user.email "you@example.com"
git config --global user.name "Your Name"
git add .
git commit -m "openssl version modify"
# Deploy the Vertical Pod Autoscaler to your cluster with the following command.
watch -d kubectl get pod -n kube-system
cat hack/vpa-up.sh
./hack/vpa-up.sh
# Run it again!
sed -i 's/openssl/openssl11/g' ~/autoscaler/vertical-pod-autoscaler/pkg/admission-controller/gencerts.sh
./hack/vpa-up.sh
kubectl get crd | grep autoscaling
verticalpodautoscalercheckpoints.autoscaling.k8s.io 2025-03-08T11:10:29Z
verticalpodautoscalers.autoscaling.k8s.io 2025-03-08T11:10:29Z
kubectl get mutatingwebhookconfigurations vpa-webhook-config
NAME WEBHOOKS AGE
vpa-webhook-config 1 4m17s
kubectl get mutatingwebhookconfigurations vpa-webhook-config -o json | jq
{
"apiVersion": "admissionregistration.k8s.io/v1",
"kind": "MutatingWebhookConfiguration",
"metadata": {
"creationTimestamp": "2025-03-08T11:11:23Z",
"generation": 1,
"name": "vpa-webhook-config",
"resourceVersion": "179736",
"uid": "3e7ea8b4-a2a9-4cf6-ba5e-5c99f89db1cd"
},
"webhooks": [
{
"admissionReviewVersions": [
"v1"
],
"clientConfig": {
"caBundle": "LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSURMVENDQWhXZ0F3SUJBZ0lVT1R5SkJMWWpTY2lpeWNtWHBLajNuRmxoYXVjd0RRWUpLb1pJaHZjTkFRRUwKQlFBd0dURVhNQlVHQTFVRUF3d09kbkJoWDNkbFltaHZiMnRmWTJFd0lCY05NalV3TXpBNE1URXhNRFU1V2hnUApNakk1T0RFeU1qSXhNVEV3TlRsYU1Ca3hGekFWQmdOVkJBTU1Eblp3WVY5M1pXSm9iMjlyWDJOaE1JSUJJakFOCkJna3Foa2lHOXcwQkFRRUZBQU9DQVE4QU1JSUJDZ0tDQVFFQXVHbnc5eTBCaER6RzJaUUlUaDUrMC9YNG96T0gKaDYyUkVCUThDcGptN0haWVIxTWhNNUc2UGZzN3cxVnlrM0dzaWloMkVOTTdhTzlwS1d1T0NZTm1rdWhqejZ0cgpUNkxlQjJxSUE5M0pGN291Z0d0Z3Y0T2FzRW9acmRDQytPa1NGOSswZWRZMzQ2VTNWZ1orNDVMZVJRN0lFMnFlClFLWGpsa240c0pVajhCcUJ3QzVzNEdTNVg3b3NKZzUyRDRPMEs1Q0pORzB5TTFXcVNuclBiWmorSVI3aFdjV2cKL3NQODRpalo1ZE9oRTUxdGtSYlBJTGxYa1VyREliV3RJa0UrYk5kYzc4d1Z1aXE3dnV3QzRXMzFPSFlnMXVEOQpqdVVzaktMaGs2TzdDNktMUS94dVQzTk1oRW1TZjVMZzlzWnRPcE9UODZzb0s1OHA2SFRGQ3U3Yk1RSURBUUFCCm8yc3dhVEFkQmdOVkhRNEVGZ1FVUC9KYk45TXlleDNvVWEyNFhiVDZ2SGI3NGI0d0h3WURWUjBqQkJnd0ZvQVUKUC9KYk45TXlleDNvVWEyNFhiVDZ2SGI3NGI0d0RBWURWUjBUQkFVd0F3RUIvekFaQmdOVkhSRUVFakFRZ2c1MgpjR0ZmZDJWaWFHOXZhMTlqWVRBTkJna3Foa2lHOXcwQkFRc0ZBQU9DQVFFQUtrY1llTFJxR2xreXJVYktjUUVDCjBSVFRtaGJ4bTg4MEg5YnYySDV0cVphQXFPK0lIM1dVcDJ0dnU5SEZUQytiSlhoWkRuSlZtczFMQWdTWUhnbk0KU01PQVo2ak1LY1dxYkJsQVNlK24zZXlOWjROUytTTEpNZ2hHNXFHQjVmdkdKMFhCaDFaOEd2SmNDWEFWd0xlWApSN2J0eDlJQkF0UUcrWlExRXJkbWUzZWpWd0pDNkJseXJNNmxjZ3pwUk5iVldKSVZ1ck9WSkZVTWk3TjVQcGpnCnBmOHVZWUo2anVPck9WYVladnFZTUJ1OURtWS9qOFR6clpwbFVuUnFaSFlVR2NtbGdyVjdrbm1Fb2g3aDdXRmEKRWtJRm9JQnZZUUZNVDEyUHNhbmdibHBNWXBWbTMrbkxNakFVL25sbDk2TU5rbm5CdThlVFZES25IVC9oWUg3Sgo4Zz09Ci0tLS0tRU5EIENFUlRJRklDQVRFLS0tLS0K",
"service": {
"name": "vpa-webhook",
"namespace": "kube-system",
"port": 443
}
},
"failurePolicy": "Ignore",
"matchPolicy": "Equivalent",
"name": "vpa.k8s.io",
"namespaceSelector": {
"matchExpressions": [
{
"key": "kubernetes.io/metadata.name",
"operator": "NotIn",
"values": [
""
]
}
]
},
"objectSelector": {},
"reinvocationPolicy": "Never",
"rules": [
{
"apiGroups": [
""
],
"apiVersions": [
"v1"
],
"operations": [
"CREATE"
],
"resources": [
"pods"
],
"scope": "*"
},
{
"apiGroups": [
"autoscaling.k8s.io"
],
"apiVersions": [
"*"
],
"operations": [
"CREATE",
"UPDATE"
],
"resources": [
"verticalpodautoscalers"
],
"scope": "*"
}
],
"sideEffects": "None",
"timeoutSeconds": 30
}
]
}
# Monitoring
watch -d 'kubectl top pod; echo "----------------------"; kubectl describe pod | grep Requests: -A2'
# Deploy the official example
cd ~/autoscaler/vertical-pod-autoscaler/
cat examples/hamster.yaml
kubectl apply -f examples/hamster.yaml && kubectl get vpa -w
# Check the pods' resource requests
kubectl describe pod | grep Requests: -A2
Requests:
cpu: 100m
memory: 50Mi
--
Requests:
cpu: 587m
memory: 262144k
--
Requests:
cpu: 587m
memory: 262144k
# The VPA evicts the existing pods and new pods are created with the recommended requests
kubectl get events --sort-by=".metadata.creationTimestamp" | grep VPA
34s Normal EvictedByVPA pod/hamster-598b78f579-hj445 Pod was evicted by VPA Updater to apply resource recommendation.
34s Normal EvictedPod verticalpodautoscaler/hamster-vpa VPA Updater evicted Pod hamster-598b78f579-hj445 to apply resource recommendation.
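For reference, if you want recommendations without the Updater evicting pods, the VPA object supports a recommendation-only mode; a minimal sketch against the hamster Deployment from the example:

apiVersion: autoscaling.k8s.io/v1
kind: VerticalPodAutoscaler
metadata:
  name: hamster-vpa
spec:
  targetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: hamster
  updatePolicy:
    updateMode: "Off"   # the recommender still fills status.recommendation, but no pods are evicted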

The Cluster Autoscaler runs as a Deployment in your cluster.
# The EKS nodes already carry the tags below
# k8s.io/cluster-autoscaler/enabled : true
# k8s.io/cluster-autoscaler/myeks-sejkim : owned
aws ec2 describe-instances --filters Name=tag:Name,Values=$CLUSTER_NAME-ng1-Node --query "Reservations[*].Instances[*].Tags[*]" --output json | jq
aws ec2 describe-instances --filters Name=tag:Name,Values=$CLUSTER_NAME-ng1-Node --query "Reservations[*].Instances[*].Tags[*]" --output yaml
...
- Key: k8s.io/cluster-autoscaler/myeks-sejkim
Value: owned
- Key: k8s.io/cluster-autoscaler/enabled
Value: 'true'
...

# Check the current Auto Scaling group (ASG) settings
# aws autoscaling describe-auto-scaling-groups --query "AutoScalingGroups[? Tags[? (Key=='eks:cluster-name') && Value=='<cluster-name>']].[AutoScalingGroupName, MinSize, MaxSize,DesiredCapacity]" --output table
aws autoscaling describe-auto-scaling-groups \
--query "AutoScalingGroups[? Tags[? (Key=='eks:cluster-name') && Value=='myeks-sejkim']].[AutoScalingGroupName, MinSize, MaxSize,DesiredCapacity]" \
--output table
-----------------------------------------------------------------
| DescribeAutoScalingGroups |
+------------------------------------------------+----+----+----+
| eks-ng1-8ecab988-dd1a-ff47-60cf-594bc559aa6d | 2 | 3 | 2 |
# Change MinSize to 3
export ASG_NAME=$(aws autoscaling describe-auto-scaling-groups --query "AutoScalingGroups[? Tags[? (Key=='eks:cluster-name') && Value=='myeks-sejkim']].AutoScalingGroupName" --output text)
aws autoscaling update-auto-scaling-group --auto-scaling-group-name ${ASG_NAME} --min-size 3 --desired-capacity 3 --max-size 3
# Check
aws autoscaling describe-auto-scaling-groups --query "AutoScalingGroups[? Tags[? (Key=='eks:cluster-name') && Value=='myeks-sejkim']].[AutoScalingGroupName, MinSize, MaxSize,DesiredCapacity]" --output table
-----------------------------------------------------------------
| DescribeAutoScalingGroups |
+------------------------------------------------+----+----+----+
| eks-ng1-8ecab988-dd1a-ff47-60cf-594bc559aa6d | 3 | 3 | 3 |
+------------------------------------------------+----+----+----+
# Deploy the Cluster Autoscaler (CAS)
curl -s -O https://raw.githubusercontent.com/kubernetes/autoscaler/master/cluster-autoscaler/cloudprovider/aws/examples/cluster-autoscaler-autodiscover.yaml
...
- ./cluster-autoscaler
- --v=4
- --stderrthreshold=info
- --cloud-provider=aws
- --skip-nodes-with-local-storage=false # whether the autoscaler may scale down nodes with local storage; false means it may
- --expander=least-waste # which node group to pick when scaling out; least-waste chooses the group that wastes the fewest resources
- --node-group-auto-discovery=asg:tag=k8s.io/cluster-autoscaler/enabled,k8s.io/cluster-autoscaler/<YOUR CLUSTER NAME>
...
sed -i -e "s|<YOUR CLUSTER NAME>|$CLUSTER_NAME|g" cluster-autoscaler-autodiscover.yaml
kubectl apply -f cluster-autoscaler-autodiscover.yaml
# Check
kubectl get pod -n kube-system | grep cluster-autoscaler
cluster-autoscaler-f675c56f7-xx78p 1/1 Running 0 9s
kubectl describe deployments.apps -n kube-system cluster-autoscaler
...
Events:
Type Reason Age From Message
---- ------ ---- ---- -------
Normal ScalingReplicaSet 36s deployment-controller Scaled up replica set cluster-autoscaler-f675c56f7 to 1
kubectl describe deployments.apps -n kube-system cluster-autoscaler | grep node-group-auto-discovery
--node-group-auto-discovery=asg:tag=k8s.io/cluster-autoscaler/enabled,k8s.io/cluster-autoscaler/myeks-sejkim
# (Optional) prevent the worker node running the cluster-autoscaler pod from being evicted
kubectl -n kube-system annotate deployment.apps/cluster-autoscaler cluster-autoscaler.kubernetes.io/safe-to-evict="false"

# Monitoring
kubectl get nodes -w
while true; do kubectl get node; echo "------------------------------" ; date ; sleep 1; done
while true; do aws ec2 describe-instances --query "Reservations[*].Instances[*].{PrivateIPAdd:PrivateIpAddress,InstanceName:Tags[?Key=='Name']|[0].Value,Status:State.Name}" --filters Name=instance-state-name,Values=running --output text ; echo "------------------------------"; date; sleep 1; done
# Deploy a Sample App
# We will deploy a sample nginx application as a ReplicaSet of 1 Pod
cat << EOF > nginx.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: nginx-to-scaleout
spec:
  replicas: 1
  selector:
    matchLabels:
      app: nginx
  template:
    metadata:
      labels:
        service: nginx
        app: nginx
    spec:
      containers:
      - image: nginx
        name: nginx-to-scaleout
        resources:
          limits:
            cpu: 500m
            memory: 512Mi
          requests:
            cpu: 500m
            memory: 512Mi
EOF
kubectl apply -f nginx.yaml
kubectl get deployment/nginx-to-scaleout
# Scale our ReplicaSet
# Let’s scale out the replicaset to 15
kubectl scale --replicas=15 deployment/nginx-to-scaleout && date
# Check
kubectl get pods -l app=nginx -o wide --watch
kubectl -n kube-system logs -f deployment/cluster-autoscaler
# Confirm nodes are added automatically
kubectl get nodes
aws autoscaling describe-auto-scaling-groups \
--query "AutoScalingGroups[? Tags[? (Key=='eks:cluster-name') && Value=='myeks-sejkim']].[AutoScalingGroupName, MinSize, MaxSize,DesiredCapacity]" \
--output table
eks-node-viewer --resources cpu,memory
or
eks-node-viewer
# [operator EC2] Check CreateFleet API calls from the last hour - Link
# https://ap-northeast-2.console.aws.amazon.com/cloudtrailv2/home?region=ap-northeast-2#/events?EventName=CreateFleet
aws cloudtrail lookup-events \
--lookup-attributes AttributeKey=EventName,AttributeValue=CreateFleet \
--start-time "$(date -d '1 hour ago' --utc +%Y-%m-%dT%H:%M:%SZ)" \
--end-time "$(date --utc +%Y-%m-%dT%H:%M:%SZ)"
# (Reference) Event name : UpdateAutoScalingGroup
# https://ap-northeast-2.console.aws.amazon.com/cloudtrailv2/home?region=ap-northeast-2#/events?EventName=UpdateAutoScalingGroup
# Delete the deployment
kubectl delete -f nginx.yaml && date
# [scale-down] Node count shrinks : scale-down happens 10 minutes later by default, adjustable with the flags below >> so delete the deployment and wait 10 minutes before checking!
# By default, cluster autoscaler will wait 10 minutes between scale down operations,
# you can adjust this using the --scale-down-delay-after-add, --scale-down-delay-after-delete,
# and --scale-down-delay-after-failure flag.
# E.g. --scale-down-delay-after-add=5m to decrease the scale down delay to 5 minutes after a node has been added.
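As an illustration of where such a flag goes, the container command in cluster-autoscaler-autodiscover.yaml could be extended like this (a sketch; only the last argument is added relative to the manifest above):

      command:
        - ./cluster-autoscaler
        - --v=4
        - --stderrthreshold=info
        - --cloud-provider=aws
        - --skip-nodes-with-local-storage=false
        - --expander=least-waste
        - --node-group-auto-discovery=asg:tag=k8s.io/cluster-autoscaler/enabled,k8s.io/cluster-autoscaler/myeks-sejkim
        - --scale-down-delay-after-add=5m   # shorten the cool-down after a scale-up from the default 10m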
# Terminal 1
watch -d kubectl get node
# Look up CreateFleet events in CloudTrail : covers the last 90 days
aws cloudtrail lookup-events --lookup-attributes AttributeKey=EventName,AttributeValue=CreateFleet

# After confirming the node count shrinks (about 10 minutes after deleting the deployment above), run the cleanup below! >> If you delete the CA immediately the worker nodes stay at 4, so change them back to 2 manually!
kubectl delete -f nginx.yaml
# Adjust the ASG size
aws autoscaling update-auto-scaling-group --auto-scaling-group-name ${ASG_NAME} --min-size 3 --desired-capacity 3 --max-size 3
aws autoscaling describe-auto-scaling-groups --query "AutoScalingGroups[? Tags[? (Key=='eks:cluster-name') && Value=='myeks-sejkim']].[AutoScalingGroupName, MinSize, MaxSize,DesiredCapacity]" --output table
# Delete the Cluster Autoscaler
kubectl delete -f cluster-autoscaler-autodiscover.yaml
A single resource ends up managed in two places (AWS ASG vs AWS EKS), each in its own way ⇒ the two sets of management data are not kept in sync, which leads to a variety of problems.
**[Reference video]** "Amazon EKS scaling and operations strategies with open-source Karpenter" (Jaehyun Shin, Musinsa) - [link](https://youtu.be/FPlCVVrCD64) , [original video](https://www.youtube.com/watch?v=Re0jZ4Umb80)

#
helm repo add cluster-proportional-autoscaler https://kubernetes-sigs.github.io/cluster-proportional-autoscaler
# Configure the CPA rules and release the Helm chart
helm upgrade --install cluster-proportional-autoscaler cluster-proportional-autoscaler/cluster-proportional-autoscaler
# Deploy an nginx deployment
cat <<EOT > cpa-nginx.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: nginx-deployment
spec:
  replicas: 1
  selector:
    matchLabels:
      app: nginx
  template:
    metadata:
      labels:
        app: nginx
    spec:
      containers:
      - name: nginx
        image: nginx:latest
        resources:
          limits:
            cpu: "100m"
            memory: "64Mi"
          requests:
            cpu: "100m"
            memory: "64Mi"
        ports:
        - containerPort: 80
EOT
kubectl apply -f cpa-nginx.yaml
# Configure the CPA rules
cat <<EOF > cpa-values.yaml
config:
  ladder:
    nodesToReplicas:
      - [1, 1]
      - [2, 2]
      - [3, 3]
      - [4, 3]
      - [5, 5]
options:
  namespace: default
  target: "deployment/nginx-deployment"
EOF
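The ladder mode above maps node counts to replica counts directly. For reference, CPA also has a linear mode that sizes the target proportionally to cluster cores or nodes; a hedged sketch of that config shape (the numbers are illustrative):

config:
  linear:
    coresPerReplica: 256          # one replica per 256 cores
    nodesPerReplica: 16           # or one replica per 16 nodes, whichever gives more replicas
    min: 1
    preventSinglePointFailure: true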
kubectl describe cm cluster-proportional-autoscaler
Name: cluster-proportional-autoscaler
Namespace: default
Labels: app.kubernetes.io/managed-by=Helm
Annotations: meta.helm.sh/release-name: cluster-proportional-autoscaler
meta.helm.sh/release-namespace: default
Data
====
ladder:
----
{"nodesToReplicas":[[1,1],[2,2],[3,3],[4,3],[5,5]]}
BinaryData
====
Events: <none>
# Monitoring
watch -d kubectl get pod
# Upgrade the Helm release
helm upgrade --install cluster-proportional-autoscaler -f cpa-values.yaml cluster-proportional-autoscaler/cluster-proportional-autoscaler
# Increase the nodes to 5
export ASG_NAME=$(aws autoscaling describe-auto-scaling-groups --query "AutoScalingGroups[? Tags[? (Key=='eks:cluster-name') && Value=='myeks-sejkim']].AutoScalingGroupName" --output text)
aws autoscaling update-auto-scaling-group --auto-scaling-group-name ${ASG_NAME} --min-size 5 --desired-capacity 5 --max-size 5
aws autoscaling describe-auto-scaling-groups --query "AutoScalingGroups[? Tags[? (Key=='eks:cluster-name') && Value=='myeks-sejkim']].[AutoScalingGroupName, MinSize, MaxSize,DesiredCapacity]" --output table
# Shrink the nodes to 4
aws autoscaling update-auto-scaling-group --auto-scaling-group-name ${ASG_NAME} --min-size 4 --desired-capacity 4 --max-size 4
aws autoscaling describe-auto-scaling-groups --query "AutoScalingGroups[? Tags[? (Key=='eks:cluster-name') && Value=='myeks-sejkim']].[AutoScalingGroupName, MinSize, MaxSize,DesiredCapacity]" --output table
# eksctl delete cluster --name $CLUSTER_NAME && aws cloudformation delete-stack --stack-name $CLUSTER_NAME
nohup sh -c "eksctl delete cluster --name $CLUSTER_NAME && aws cloudformation delete-stack --stack-name $CLUSTER_NAME" > /root/delete.log 2>&1 &
# (Optional) watch the deletion progress
tail -f delete.log
































# Set variables
export KARPENTER_NAMESPACE="kube-system"
export KARPENTER_VERSION="1.2.1"
export K8S_VERSION="1.32"
export AWS_PARTITION="aws" # if you are not using standard partitions, you may need to configure to aws-cn / aws-us-gov
export CLUSTER_NAME="sejkim-karpenter-demo" # ${USER}-karpenter-demo
export AWS_DEFAULT_REGION="ap-northeast-2"
export AWS_ACCOUNT_ID="$(aws sts get-caller-identity --profile aews --query Account --output text)"
export TEMPOUT="$(mktemp)"
export ALIAS_VERSION="$(aws ssm get-parameter --name "/aws/service/eks/optimized-ami/${K8S_VERSION}/amazon-linux-2023/x86_64/standard/recommended/image_id" --query Parameter.Value | xargs aws ec2 describe-images --query 'Images[0].Name' --image-ids | sed -r 's/^.*(v[[:digit:]]+).*$/\1/')"
# Check
echo "${KARPENTER_NAMESPACE}" "${KARPENTER_VERSION}" "${K8S_VERSION}" "${CLUSTER_NAME}" "${AWS_DEFAULT_REGION}" "${AWS_ACCOUNT_ID}" "${TEMPOUT}" "${ALIAS_VERSION}"
The cloudformation.yaml used below provisions what Karpenter needs; node access is granted through the aws-auth ConfigMap, and the controller role is referenced later via the KARPENTER_IAM_ROLE_ARN variable.
# Create the IAM Policy/Role, SQS queue, and Event rules with a CloudFormation stack : takes about 3 minutes
## IAM Policy : KarpenterControllerPolicy-${CLUSTER_NAME}
## IAM Role : KarpenterNodeRole-${CLUSTER_NAME}
curl -fsSL https://raw.githubusercontent.com/aws/karpenter-provider-aws/v"${KARPENTER_VERSION}"/website/content/en/preview/getting-started/getting-started-with-karpenter/cloudformation.yaml > "${TEMPOUT}" \
&& aws cloudformation deploy \
--stack-name "Karpenter-${CLUSTER_NAME}" \
--template-file "${TEMPOUT}" \
--capabilities CAPABILITY_NAMED_IAM \
--parameter-overrides "ClusterName=${CLUSTER_NAME}"
# Create the cluster : creating the EKS cluster takes about 15 minutes
eksctl create cluster -f - <<EOF
---
apiVersion: eksctl.io/v1alpha5
kind: ClusterConfig
metadata:
  name: ${CLUSTER_NAME}
  region: ${AWS_DEFAULT_REGION}
  version: "${K8S_VERSION}"
  tags:
    karpenter.sh/discovery: ${CLUSTER_NAME}

iam:
  withOIDC: true
  podIdentityAssociations:
  - namespace: "${KARPENTER_NAMESPACE}"
    serviceAccountName: karpenter
    roleName: ${CLUSTER_NAME}-karpenter
    permissionPolicyARNs:
    - arn:${AWS_PARTITION}:iam::${AWS_ACCOUNT_ID}:policy/KarpenterControllerPolicy-${CLUSTER_NAME}

iamIdentityMappings:
- arn: "arn:${AWS_PARTITION}:iam::${AWS_ACCOUNT_ID}:role/KarpenterNodeRole-${CLUSTER_NAME}"
  username: system:node:{{EC2PrivateDNSName}}
  groups:
  - system:bootstrappers
  - system:nodes
  ## If you intend to run Windows workloads, the kube-proxy group should be specified.
  # For more information, see https://github.com/aws/karpenter/issues/5099.
  # - eks:kube-proxy-windows

managedNodeGroups:
- instanceType: t3.medium
  amiFamily: AmazonLinux2023
  name: ${CLUSTER_NAME}-ng
  desiredCapacity: 2
  minSize: 1
  maxSize: 2
  iam:
    withAddonPolicies:
      externalDNS: true

addons:
- name: eks-pod-identity-agent
EOF
# Verify the EKS deployment
eksctl get cluster
NAME REGION EKSCTL CREATED
sejkim-karpenter-demo ap-northeast-2 True
eksctl get nodegroup --cluster $CLUSTER_NAME
CLUSTER NODEGROUP STATUS CREATED MIN SIZE MAX SIZE DESIRED CAPACITY INSTANCE TYPE IMAGE ID ASG NAME TYPE
sejkim-karpenter-demo sejkim-karpenter-demo-ng ACTIVE 2025-03-08T14:37:20Z 1 2 2 t3.medium AL2023_x86_64_STANDARD eks-sejkim-karpenter-demo-ng-18cabb18-3178-8ce5-1fe3-2928865def0d managed
eksctl get iamidentitymapping --cluster $CLUSTER_NAME
ARN USERNAME GROUPS ACCOUNT
arn:aws:iam::1**********3:role/KarpenterNodeRole-sejkim-karpenter-demo system:node:{{EC2PrivateDNSName}} system:bootstrappers,system:nodes
arn:aws:iam::1**********3:role/eksctl-sejkim-karpenter-demo-nodeg-NodeInstanceRole-LpShCA5JpXgD system:node:{{EC2PrivateDNSName}} system:bootstrappers,system:nodes
eksctl get iamserviceaccount --cluster $CLUSTER_NAME
No iamserviceaccounts found
eksctl get addon --cluster $CLUSTER_NAME
2025-03-08 23:44:38 [ℹ] Kubernetes version "1.32" in use by cluster "sejkim-karpenter-demo"
2025-03-08 23:44:38 [ℹ] getting all addons
2025-03-08 23:44:39 [ℹ] to see issues for an addon run `eksctl get addon --name <addon-name> --cluster <cluster-name>`
NAME VERSION STATUS ISSUES IAMROLE UPDATE AVAILABLE CONFIGURATION VALUES POD IDENTITY ASSOCIATION ROLES
coredns v1.11.4-eksbuild.2 DEGRADED 1
eks-pod-identity-agent v1.3.4-eksbuild.1 ACTIVE 0 v1.3.5-eksbuild.2
kube-proxy v1.32.0-eksbuild.2 ACTIVE 0
metrics-server v0.7.2-eksbuild.2 DEGRADED 1
vpc-cni v1.19.2-eksbuild.1 ACTIVE 0 arn:aws:iam::1**********3:role/eksctl-sejkim-karpenter-demo-addon-vpc-cni-Role1-72O5AIgJ21FE v1.19.3-eksbuild.1,v1.19.2-eksbuild.5
#
kubectl ctx
kubectl config rename-context "<your IAM user>@<your nickname>-karpenter-demo.ap-northeast-2.eksctl.io" "karpenter-demo"
kubectl config rename-context "admin@gasida-karpenter-demo.ap-northeast-2.eksctl.io" "karpenter-demo"
# Check the cluster
kubectl ns default
kubectl cluster-info
kubectl get node --label-columns=node.kubernetes.io/instance-type,eks.amazonaws.com/capacityType,topology.kubernetes.io/zone
NAME STATUS ROLES AGE VERSION INSTANCE-TYPE CAPACITYTYPE ZONE
ip-192-168-44-153.ap-northeast-2.compute.internal Ready <none> 10m v1.32.1-eks-5d632ec t3.medium ON_DEMAND ap-northeast-2b
ip-192-168-76-20.ap-northeast-2.compute.internal Ready <none> 10m v1.32.1-eks-5d632ec t3.medium ON_DEMAND ap-northeast-2c
kubectl get pod -n kube-system -owide
NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES
aws-node-525xb 2/2 Running 0 10m 192.168.76.20 ip-192-168-76-20.ap-northeast-2.compute.internal <none> <none>
aws-node-m6mk4 2/2 Running 0 10m 192.168.44.153 ip-192-168-44-153.ap-northeast-2.compute.internal <none> <none>
coredns-844d8f59bb-g24dz 1/1 Running 0 14m 192.168.49.206 ip-192-168-44-153.ap-northeast-2.compute.internal <none> <none>
coredns-844d8f59bb-zw6ct 1/1 Running 0 14m 192.168.56.61 ip-192-168-44-153.ap-northeast-2.compute.internal <none> <none>
eks-pod-identity-agent-5h8f9 1/1 Running 0 10m 192.168.44.153 ip-192-168-44-153.ap-northeast-2.compute.internal <none> <none>
eks-pod-identity-agent-75kfn 1/1 Running 0 10m 192.168.76.20 ip-192-168-76-20.ap-northeast-2.compute.internal <none> <none>
kube-proxy-csx7x 1/1 Running 0 10m 192.168.76.20 ip-192-168-76-20.ap-northeast-2.compute.internal <none> <none>
kube-proxy-mfr5j 1/1 Running 0 10m 192.168.44.153 ip-192-168-44-153.ap-northeast-2.compute.internal <none> <none>
metrics-server-74b6cb4f8f-kt6xf 1/1 Running 0 14m 192.168.51.108 ip-192-168-44-153.ap-northeast-2.compute.internal <none> <none>
metrics-server-74b6cb4f8f-tqqw5 1/1 Running 0 14m 192.168.53.140 ip-192-168-44-153.ap-northeast-2.compute.internal <none> <none>
kubectl get pdb -A
NAMESPACE NAME MIN AVAILABLE MAX UNAVAILABLE ALLOWED DISRUPTIONS AGE
kube-system coredns N/A 1 1 15m
kube-system metrics-server N/A 1 1 15m
kubectl describe cm -n kube-system aws-auth
# Create (or confirm) the service-linked role for EC2 Spot : it usually already exists, so the error output below is expected!
# If the role has already been successfully created, you will see:
# An error occurred (InvalidInput) when calling the CreateServiceLinkedRole operation: Service role name AWSServiceRoleForEC2Spot has been taken in this account, please try a different suffix.
aws iam create-service-linked-role --aws-service-name spot.amazonaws.com || true


# kube-ops-view
helm repo add geek-cookbook https://geek-cookbook.github.io/charts/
helm install kube-ops-view geek-cookbook/kube-ops-view --version 1.2.2 --set service.main.type=LoadBalancer --set env.TZ="Asia/Seoul" --namespace kube-system
echo -e "http://$(kubectl get svc -n kube-system kube-ops-view -o jsonpath="{.status.loadBalancer.ingress[0].hostname}"):8080/#scale=1.5"
open "http://$(kubectl get svc -n kube-system kube-ops-view -o jsonpath="{.status.loadBalancer.ingress[0].hostname}"):8080/#scale=1.5"
or
kubectl annotate service kube-ops-view -n kube-system "external-dns.alpha.kubernetes.io/hostname=kubeopsview.$MyDomain"
echo -e "Kube Ops View URL = http://kubeopsview.$MyDomain:8080/#scale=1.5"
open "http://kubeopsview.$MyDomain:8080/#scale=1.5"
# (Optional) ExternalDNS
MyDomain=<your domain>
MyDomain=gasida.link
MyDnzHostedZoneId=$(aws route53 list-hosted-zones-by-name --dns-name "${MyDomain}." --query "HostedZones[0].Id" --output text)
echo $MyDomain, $MyDnzHostedZoneId
curl -s https://raw.githubusercontent.com/gasida/PKOS/main/aews/externaldns.yaml | MyDomain=$MyDomain MyDnzHostedZoneId=$MyDnzHostedZoneId envsubst | kubectl apply -f -
# Logout of helm registry to perform an unauthenticated pull against the public ECR
helm registry logout public.ecr.aws
# Set and check the variables needed to install Karpenter
export CLUSTER_ENDPOINT="$(aws eks describe-cluster --name "${CLUSTER_NAME}" --query "cluster.endpoint" --output text)"
export KARPENTER_IAM_ROLE_ARN="arn:${AWS_PARTITION}:iam::${AWS_ACCOUNT_ID}:role/${CLUSTER_NAME}-karpenter"
echo "${CLUSTER_ENDPOINT} ${KARPENTER_IAM_ROLE_ARN}"
# Install Karpenter
helm upgrade --install karpenter oci://public.ecr.aws/karpenter/karpenter --version "${KARPENTER_VERSION}" --namespace "${KARPENTER_NAMESPACE}" --create-namespace \
--set "settings.clusterName=${CLUSTER_NAME}" \
--set "settings.interruptionQueue=${CLUSTER_NAME}" \
--set controller.resources.requests.cpu=1 \
--set controller.resources.requests.memory=1Gi \
--set controller.resources.limits.cpu=1 \
--set controller.resources.limits.memory=1Gi \
--wait
# Check
helm list -n kube-system
karpenter kube-system 1 2025-03-09 00:08:30.621088 +0900 KST deployed karpenter-1.2.1 1.2.1
kube-ops-view kube-system 1 2025-03-09 00:00:24.058658 +0900 KST deployed kube-ops-view-1.2.2 20.4.0
kubectl get all -n $KARPENTER_NAMESPACE
kubectl get crd | grep karpenter
ec2nodeclasses.karpenter.k8s.aws 2025-03-08T15:08:30Z
nodeclaims.karpenter.sh 2025-03-08T15:08:30Z
nodepools.karpenter.sh 2025-03-08T15:08:30Z
Karpenter uses the ClusterFirst pod DNS policy by default. If Karpenter itself needs to manage capacity for the DNS service pods, DNS will not be running when Karpenter starts; in that case set the pod DNS policy to Default with --set dnsPolicy=Default, so Karpenter uses the host's DNS resolution instead of the in-cluster DNS service and has no dependency on the DNS service pods. Karpenter applies tags such as karpenter.sh/managed-by, karpenter.sh/nodepool, and kubernetes.io/cluster/${CLUSTER_NAME} to the resources it manages.
#
helm repo add grafana-charts https://grafana.github.io/helm-charts
helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
helm repo update
kubectl create namespace monitoring
# Install Prometheus
curl -fsSL https://raw.githubusercontent.com/aws/karpenter-provider-aws/v"${KARPENTER_VERSION}"/website/content/en/preview/getting-started/getting-started-with-karpenter/prometheus-values.yaml | envsubst | tee prometheus-values.yaml
helm install --namespace monitoring prometheus prometheus-community/prometheus --values prometheus-values.yaml
extraScrapeConfigs: |
  - job_name: karpenter
    kubernetes_sd_configs:
    - role: endpoints
      namespaces:
        names:
        - kube-system
    relabel_configs:
    - source_labels:
      - __meta_kubernetes_endpoints_name
      - __meta_kubernetes_endpoint_port_name
      action: keep
      regex: karpenter;http-metrics
# 프로메테우스 얼럿매니저 미사용으로 삭제
kubectl delete sts -n monitoring prometheus-alertmanager
# 프로메테우스 접속 설정
export POD_NAME=$(kubectl get pods --namespace monitoring -l "app.kubernetes.io/name=prometheus,app.kubernetes.io/instance=prometheus" -o jsonpath="{.items[0].metadata.name}")
kubectl --namespace monitoring port-forward $POD_NAME 9090 &
open http://127.0.0.1:9090
# 그라파나 설치
curl -fsSL https://raw.githubusercontent.com/aws/karpenter-provider-aws/v"${KARPENTER_VERSION}"/website/content/en/preview/getting-started/getting-started-with-karpenter/grafana-values.yaml | tee grafana-values.yaml
helm install --namespace monitoring grafana grafana-charts/grafana --values grafana-values.yaml
datasources:
  datasources.yaml:
    apiVersion: 1
    datasources:
      - name: Prometheus
        type: prometheus
        version: 1
        url: http://prometheus-server:80
        access: proxy
dashboardProviders:
  dashboardproviders.yaml:
    apiVersion: 1
    providers:
      - name: 'default'
        orgId: 1
        folder: ''
        type: file
        disableDeletion: false
        editable: true
        options:
          path: /var/lib/grafana/dashboards/default
dashboards:
  default:
    capacity-dashboard:
      url: https://karpenter.sh/preview/getting-started/getting-started-with-karpenter/karpenter-capacity-dashboard.json
    performance-dashboard:
      url: https://karpenter.sh/preview/getting-started/getting-started-with-karpenter/karpenter-performance-dashboard.json
# admin 암호
kubectl get secret --namespace monitoring grafana -o jsonpath="{.data.admin-password}" | base64 --decode ; echo
MjTam3qgdglcOabPwid0oeNVI79X9migMwW1eKLO
# 그라파나 접속
kubectl port-forward --namespace monitoring svc/grafana 3000:80 &
open http://127.0.0.1:3000

Karpenter uses securityGroupSelectorTerms and subnetSelectorTerms to discover the resources used to launch nodes; the karpenter.sh/discovery tag was applied for this in the eksctl command above. Depending on how these resources are shared between clusters, you may need to use a different tagging scheme. consolidationPolicy set to WhenEmptyOrUnderutilized in the disruption block configures Karpenter to reduce cluster cost by removing and replacing nodes; as a result, consolidation will terminate any empty nodes on the cluster. This behavior can be disabled by setting consolidateAfter to Never, telling Karpenter that it should never consolidate nodes. Review the NodePool API docs for more information.
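As a reference, a minimal sketch of a NodePool spec fragment that disables consolidation in the way described above (karpenter.sh/v1 fields):
spec:
  disruption:
    consolidationPolicy: WhenEmptyOrUnderutilized
    consolidateAfter: Never # never consolidate nodes
#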
echo $ALIAS_VERSION
v20250228
#
cat <<EOF | envsubst | kubectl apply -f -
apiVersion: karpenter.sh/v1
kind: NodePool
metadata:
  name: default
spec:
  template:
    spec:
      requirements:
        - key: kubernetes.io/arch
          operator: In
          values: ["amd64"]
        - key: kubernetes.io/os
          operator: In
          values: ["linux"]
        - key: karpenter.sh/capacity-type
          operator: In
          values: ["on-demand"]
        - key: karpenter.k8s.aws/instance-category
          operator: In
          values: ["t", "c"]
        - key: karpenter.k8s.aws/instance-generation
          operator: Gt
          values: ["2"]
      nodeClassRef:
        group: karpenter.k8s.aws
        kind: EC2NodeClass
        name: default
      expireAfter: 720h # 30 * 24h = 720h
  limits:
    cpu: 1000
  disruption:
    consolidationPolicy: WhenEmptyOrUnderutilized
    consolidateAfter: 1m
---
apiVersion: karpenter.k8s.aws/v1
kind: EC2NodeClass
metadata:
  name: default
spec:
  role: "KarpenterNodeRole-${CLUSTER_NAME}" # replace with your cluster name
  amiSelectorTerms:
    - alias: "al2023@${ALIAS_VERSION}" # ex) al2023@latest
  subnetSelectorTerms:
    - tags:
        karpenter.sh/discovery: "${CLUSTER_NAME}" # replace with your cluster name
  securityGroupSelectorTerms:
    - tags:
        karpenter.sh/discovery: "${CLUSTER_NAME}" # replace with your cluster name
EOF
# 확인
kubectl get nodepool,ec2nodeclass,nodeclaims
NAME NODECLASS NODES READY AGE
nodepool.karpenter.sh/default default 0 True 46s
NAME READY AGE
ec2nodeclass.karpenter.k8s.aws/default True 46s
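Node launches depend on Karpenter discovering subnets and security groups through the karpenter.sh/discovery tag referenced in the EC2NodeClass; the tagged resources can be listed with the AWS CLI to confirm discovery will succeed:
# check resources discoverable via the karpenter.sh/discovery tag
aws ec2 describe-subnets --filters "Name=tag:karpenter.sh/discovery,Values=${CLUSTER_NAME}" --query 'Subnets[].SubnetId' --output text
aws ec2 describe-security-groups --filters "Name=tag:karpenter.sh/discovery,Values=${CLUSTER_NAME}" --query 'SecurityGroups[].GroupId' --output text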
# pause 파드 1개에 CPU 1개 최소 보장 할당할 수 있게 디플로이먼트 배포
cat <<EOF | kubectl apply -f -
apiVersion: apps/v1
kind: Deployment
metadata:
  name: inflate
spec:
  replicas: 0
  selector:
    matchLabels:
      app: inflate
  template:
    metadata:
      labels:
        app: inflate
    spec:
      terminationGracePeriodSeconds: 0
      securityContext:
        runAsUser: 1000
        runAsGroup: 3000
        fsGroup: 2000
      containers:
        - name: inflate
          image: public.ecr.aws/eks-distro/kubernetes/pause:3.7
          resources:
            requests:
              cpu: 1
          securityContext:
            allowPrivilegeEscalation: false
EOF
# [신규 터미널] 모니터링
eks-node-viewer --resources cpu,memory
eks-node-viewer --resources cpu,memory --node-selector "karpenter.sh/registered=true" --extra-labels eks-node-viewer/node-age
# Scale up
kubectl get pod
kubectl scale deployment inflate --replicas 5
# 출력 로그 분석해보자!
kubectl logs -f -n "${KARPENTER_NAMESPACE}" -l app.kubernetes.io/name=karpenter -c controller
kubectl logs -f -n "${KARPENTER_NAMESPACE}" -l app.kubernetes.io/name=karpenter -c controller | jq '.'
kubectl logs -n "${KARPENTER_NAMESPACE}" -l app.kubernetes.io/name=karpenter -c controller | grep 'launched nodeclaim' | jq '.'
{
"level": "INFO",
"time": "2025-03-08T15:48:46.926Z",
"logger": "controller",
"message": "launched nodeclaim",
"commit": "058c665",
"controller": "nodeclaim.lifecycle",
"controllerGroup": "karpenter.sh",
"controllerKind": "NodeClaim",
"NodeClaim": {
"name": "default-dmfp7"
},
"namespace": "",
"name": "default-dmfp7",
"reconcileID": "aaed44b9-e0a7-45e8-a682-1d848b6b9715",
"provider-id": "aws:///ap-northeast-2b/i-07fe24df563aefef5",
"instance-type": "c5a.2xlarge",
"zone": "ap-northeast-2b",
"capacity-type": "on-demand",
"allocatable": {
"cpu": "7910m",
"ephemeral-storage": "17Gi",
"memory": "14162Mi",
"pods": "58",
"vpc.amazonaws.com/pod-eni": "38"
}
}
# 확인
kubectl get nodeclaims
NAME TYPE CAPACITY ZONE NODE READY AGE
default-dmfp7 c5a.2xlarge on-demand ap-northeast-2b ip-192-168-138-151.ap-northeast-2.compute.internal True 4m53s
kubectl describe nodeclaims
Name: default-dmfp7
Namespace:
Labels: karpenter.k8s.aws/ec2nodeclass=default
karpenter.k8s.aws/instance-category=c
karpenter.k8s.aws/instance-cpu=8
karpenter.k8s.aws/instance-cpu-manufacturer=amd
karpenter.k8s.aws/instance-cpu-sustained-clock-speed-mhz=3300
karpenter.k8s.aws/instance-ebs-bandwidth=3170
karpenter.k8s.aws/instance-encryption-in-transit-supported=true
karpenter.k8s.aws/instance-family=c5a
karpenter.k8s.aws/instance-generation=5
karpenter.k8s.aws/instance-hypervisor=nitro
karpenter.k8s.aws/instance-memory=16384
karpenter.k8s.aws/instance-network-bandwidth=2500
karpenter.k8s.aws/instance-size=2xlarge
karpenter.sh/capacity-type=on-demand
karpenter.sh/nodepool=default
kubernetes.io/arch=amd64
kubernetes.io/os=linux
node.kubernetes.io/instance-type=c5a.2xlarge
topology.k8s.aws/zone-id=apne2-az2
topology.kubernetes.io/region=ap-northeast-2
topology.kubernetes.io/zone=ap-northeast-2b
Annotations: compatibility.karpenter.k8s.aws/cluster-name-tagged: true
karpenter.k8s.aws/ec2nodeclass-hash: 5102623682851387637
karpenter.k8s.aws/ec2nodeclass-hash-version: v4
karpenter.k8s.aws/tagged: true
karpenter.sh/nodepool-hash: 6821555240594823858
karpenter.sh/nodepool-hash-version: v3
API Version: karpenter.sh/v1
Kind: NodeClaim
Metadata:
Creation Timestamp: 2025-03-08T15:48:44Z
Finalizers:
karpenter.sh/termination
Generate Name: default-
Generation: 1
Owner References:
API Version: karpenter.sh/v1
Block Owner Deletion: true
Kind: NodePool
Name: default
UID: 8b637e0d-8f19-4e92-97ae-2968af2cef18
Resource Version: 16277
UID: 29651b05-3894-4c7a-84b3-c7c178df224e
Spec:
Expire After: 720h
Node Class Ref:
Group: karpenter.k8s.aws
Kind: EC2NodeClass
Name: default
Requirements:
Key: node.kubernetes.io/instance-type
Operator: In
Values:
c4.2xlarge
c4.4xlarge
c4.8xlarge
c5.12xlarge
c5.18xlarge
c5.24xlarge
c5.2xlarge
c5.4xlarge
c5.9xlarge
c5.metal
c5a.12xlarge
c5a.16xlarge
c5a.24xlarge
c5a.2xlarge
c5a.4xlarge
c5a.8xlarge
c5d.12xlarge
c5d.18xlarge
c5d.24xlarge
c5d.2xlarge
c5d.4xlarge
c5d.9xlarge
c5d.metal
c5n.18xlarge
c5n.2xlarge
c5n.4xlarge
c5n.9xlarge
c5n.metal
c6i.12xlarge
c6i.16xlarge
c6i.24xlarge
c6i.2xlarge
c6i.32xlarge
c6i.4xlarge
c6i.8xlarge
c6i.metal
c6id.12xlarge
c6id.16xlarge
c6id.24xlarge
c6id.2xlarge
c6id.4xlarge
c6id.8xlarge
c6in.12xlarge
c6in.16xlarge
c6in.24xlarge
c6in.2xlarge
c6in.4xlarge
c6in.8xlarge
c7i-flex.2xlarge
c7i-flex.4xlarge
c7i-flex.8xlarge
c7i.12xlarge
c7i.16xlarge
c7i.24xlarge
c7i.2xlarge
c7i.4xlarge
c7i.8xlarge
c7i.metal-24xl
t3.2xlarge
t3a.2xlarge
Key: karpenter.sh/nodepool
Operator: In
Values:
default
Key: karpenter.k8s.aws/ec2nodeclass
Operator: In
Values:
default
Key: kubernetes.io/os
Operator: In
Values:
linux
Key: karpenter.sh/capacity-type
Operator: In
Values:
on-demand
Key: karpenter.k8s.aws/instance-category
Operator: In
Values:
c
t
Key: karpenter.k8s.aws/instance-generation
Operator: Gt
Values:
2
Key: kubernetes.io/arch
Operator: In
Values:
amd64
Resources:
Requests:
Cpu: 5150m
Pods: 9
Status:
Allocatable:
Cpu: 7910m
Ephemeral - Storage: 17Gi
Memory: 14162Mi
Pods: 58
vpc.amazonaws.com/pod-eni: 38
Capacity:
Cpu: 8
Ephemeral - Storage: 20Gi
Memory: 15155Mi
Pods: 58
vpc.amazonaws.com/pod-eni: 38
Conditions:
Last Transition Time: 2025-03-08T15:49:15Z
Message:
Observed Generation: 1
Reason: Initialized
Status: True
Type: Initialized
Last Transition Time: 2025-03-08T15:48:46Z
Message:
Observed Generation: 1
Reason: Launched
Status: True
Type: Launched
Last Transition Time: 2025-03-08T15:49:15Z
Message:
Observed Generation: 1
Reason: Ready
Status: True
Type: Ready
Last Transition Time: 2025-03-08T15:49:05Z
Message:
Observed Generation: 1
Reason: Registered
Status: True
Type: Registered
Image ID: ami-089f1bf55c5291efd
Last Pod Event Time: 2025-03-08T15:54:10Z
Node Name: ip-192-168-138-151.ap-northeast-2.compute.internal
Provider ID: aws:///ap-northeast-2b/i-07fe24df563aefef5
Events:
Type Reason Age From Message
---- ------ ---- ---- -------
Normal Launched 5m44s karpenter Status condition transitioned, Type: Launched, Status: Unknown -> True, Reason: Launched
Normal DisruptionBlocked 5m41s karpenter Nodeclaim does not have an associated node
Normal Registered 5m25s karpenter Status condition transitioned, Type: Registered, Status: Unknown -> True, Reason: Registered
Normal Initialized 5m15s karpenter Status condition transitioned, Type: Initialized, Status: Unknown -> True, Reason: Initialized
Normal Ready 5m15s karpenter Status condition transitioned, Type: Ready, Status: Unknown -> True, Reason: Ready
#
kubectl get node -l karpenter.sh/registered=true -o jsonpath="{.items[0].metadata.labels}" | jq '.'
{
"beta.kubernetes.io/arch": "amd64",
"beta.kubernetes.io/instance-type": "c5a.2xlarge",
"beta.kubernetes.io/os": "linux",
"failure-domain.beta.kubernetes.io/region": "ap-northeast-2",
"failure-domain.beta.kubernetes.io/zone": "ap-northeast-2b",
"k8s.io/cloud-provider-aws": "99c8fa5cbf14bd3ac18cdf4809beea3e",
"karpenter.k8s.aws/ec2nodeclass": "default",
"karpenter.k8s.aws/instance-category": "c",
"karpenter.k8s.aws/instance-cpu": "8",
"karpenter.k8s.aws/instance-cpu-manufacturer": "amd",
"karpenter.k8s.aws/instance-cpu-sustained-clock-speed-mhz": "3300",
"karpenter.k8s.aws/instance-ebs-bandwidth": "3170",
"karpenter.k8s.aws/instance-encryption-in-transit-supported": "true",
"karpenter.k8s.aws/instance-family": "c5a",
"karpenter.k8s.aws/instance-generation": "5",
"karpenter.k8s.aws/instance-hypervisor": "nitro",
"karpenter.k8s.aws/instance-memory": "16384",
"karpenter.k8s.aws/instance-network-bandwidth": "2500",
"karpenter.k8s.aws/instance-size": "2xlarge",
"karpenter.sh/capacity-type": "on-demand",
"karpenter.sh/initialized": "true",
"karpenter.sh/nodepool": "default",
"karpenter.sh/registered": "true",
"kubernetes.io/arch": "amd64",
"kubernetes.io/hostname": "ip-192-168-138-151.ap-northeast-2.compute.internal",
"kubernetes.io/os": "linux",
"node.kubernetes.io/instance-type": "c5a.2xlarge",
"topology.k8s.aws/zone-id": "apne2-az2",
"topology.kubernetes.io/region": "ap-northeast-2",
"topology.kubernetes.io/zone": "ap-northeast-2b"
}
# (옵션) 더욱 더 Scale up!
kubectl scale deployment inflate --replicas 30
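While the deployment scales out, NodeClaims can be watched as Karpenter launches the extra capacity:
kubectl get nodeclaims -w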




# Now, delete the deployment. After a short amount of time, Karpenter should terminate the empty nodes due to consolidation.
kubectl delete deployment inflate && date
deployment.apps "inflate" deleted
Sun Mar 9 01:09:44 KST 2025
# 출력 로그 분석해보자!
kubectl logs -f -n "${KARPENTER_NAMESPACE}" -l app.kubernetes.io/name=karpenter -c controller | jq '.'
...
{
"level": "INFO",
"time": "2025-03-02T06:53:28.780Z",
"logger": "controller",
"message": "disrupting nodeclaim(s) via delete, terminating 1 nodes (1 pods) ip-192-168-131-97.ap-northeast-2.compute.internal/c5a.large/on-demand",
"commit": "058c665",
"controller": "disruption",
"namespace": "",
"name": "",
"reconcileID": "86a3a45c-2604-4a71-808a-21290301d096",
"command-id": "51914aee-4e09-436f-af6d-794163c3d1c2",
"reason": "underutilized"
}
{
"level": "INFO",
"time": "2025-03-02T06:53:29.532Z",
"logger": "controller",
"message": "tainted node",
"commit": "058c665",
"controller": "node.termination",
"controllerGroup": "",
"controllerKind": "Node",
"Node": {
"name": "ip-192-168-131-97.ap-northeast-2.compute.internal"
},
"namespace": "",
"name": "ip-192-168-131-97.ap-northeast-2.compute.internal",
"reconcileID": "617bcb4d-5498-44d9-ba1e-6c8b7d97c405",
"taint.Key": "karpenter.sh/disrupted",
"taint.Value": "",
"taint.Effect": "NoSchedule"
}
{
"level": "INFO",
"time": "2025-03-02T06:54:03.234Z",
"logger": "controller",
"message": "deleted node",
"commit": "058c665",
"controller": "node.termination",
"controllerGroup": "",
"controllerKind": "Node",
"Node": {
"name": "ip-192-168-131-97.ap-northeast-2.compute.internal"
},
"namespace": "",
"name": "ip-192-168-131-97.ap-northeast-2.compute.internal",
"reconcileID": "8c71fb19-b7ae-4037-afef-fbf1c7343f84"
}
{
"level": "INFO",
"time": "2025-03-02T06:54:03.488Z",
"logger": "controller",
"message": "deleted nodeclaim",
"commit": "058c665",
"controller": "nodeclaim.lifecycle",
"controllerGroup": "karpenter.sh",
"controllerKind": "NodeClaim",
"NodeClaim": {
"name": "default-mfkgp"
},
"namespace": "",
"name": "default-mfkgp",
"reconcileID": "757b4d88-2bf2-412c-bf83-3149f9517d85",
"Node": {
"name": "ip-192-168-131-97.ap-northeast-2.compute.internal"
},
"provider-id": "aws:///ap-northeast-2a/i-00f22c8bde3faf646"
}
{
"level": "INFO",
"time": "2025-03-02T07:25:55.661Z",
"logger": "controller",
"message": "disrupting nodeclaim(s) via delete, terminating 1 nodes (0 pods) ip-192-168-176-171.ap-northeast-2.compute.internal/c5a.2xlarge/on-demand",
"commit": "058c665",
"controller": "disruption",
"namespace": "",
"name": "",
"reconcileID": "0942417e-7ecb-437a-85db-adc553ccade9",
"command-id": "b2b7c689-91ca-43c5-ac1c-2052bf7418c1",
"reason": "empty"
}
{
"level": "INFO",
"time": "2025-03-02T07:25:56.783Z",
"logger": "controller",
"message": "tainted node",
"commit": "058c665",
"controller": "node.termination",
"controllerGroup": "",
"controllerKind": "Node",
"Node": {
"name": "ip-192-168-176-171.ap-northeast-2.compute.internal"
},
"namespace": "",
"name": "ip-192-168-176-171.ap-northeast-2.compute.internal",
"reconcileID": "6254e6be-2445-4402-b829-0bb75fa540e0",
"taint.Key": "karpenter.sh/disrupted",
"taint.Value": "",
"taint.Effect": "NoSchedule"
}
{
"level": "INFO",
"time": "2025-03-02T07:26:49.195Z",
"logger": "controller",
"message": "deleted node",
"commit": "058c665",
"controller": "node.termination",
"controllerGroup": "",
"controllerKind": "Node",
"Node": {
"name": "ip-192-168-176-171.ap-northeast-2.compute.internal"
},
"namespace": "",
"name": "ip-192-168-176-171.ap-northeast-2.compute.internal",
"reconcileID": "6c126a63-8bfa-4828-8ef6-5d22b8c1e7cc"
}
#
kubectl get nodeclaims


# 기존 nodepool 삭제
kubectl delete nodepool,ec2nodeclass default
# 모니터링
kubectl logs -f -n "${KARPENTER_NAMESPACE}" -l app.kubernetes.io/name=karpenter -c controller | jq '.'
eks-node-viewer --resources cpu,memory --node-selector "karpenter.sh/registered=true" --extra-labels eks-node-viewer/node-age
watch -d "kubectl get nodes -L karpenter.sh/nodepool -L node.kubernetes.io/instance-type -L karpenter.sh/capacity-type"
# Create a Karpenter NodePool and EC2NodeClass
cat <<EOF | envsubst | kubectl apply -f -
apiVersion: karpenter.sh/v1
kind: NodePool
metadata:
  name: default
spec:
  template:
    spec:
      nodeClassRef:
        group: karpenter.k8s.aws
        kind: EC2NodeClass
        name: default
      requirements:
        - key: kubernetes.io/os
          operator: In
          values: ["linux"]
        - key: karpenter.sh/capacity-type
          operator: In
          values: ["on-demand"]
        - key: karpenter.k8s.aws/instance-category
          operator: In
          values: ["c", "m", "r"]
        - key: karpenter.k8s.aws/instance-size
          operator: NotIn
          values: ["nano","micro","small","medium"]
        - key: karpenter.k8s.aws/instance-hypervisor
          operator: In
          values: ["nitro"]
      expireAfter: 1h # nodes are terminated automatically after 1 hour
  limits:
    cpu: "1000"
    memory: 1000Gi
  disruption:
    consolidationPolicy: WhenEmptyOrUnderutilized # policy enables Karpenter to replace nodes when they are either empty or underutilized
    consolidateAfter: 1m
---
apiVersion: karpenter.k8s.aws/v1
kind: EC2NodeClass
metadata:
  name: default
spec:
  role: "KarpenterNodeRole-${CLUSTER_NAME}" # replace with your cluster name
  amiSelectorTerms:
    - alias: "al2023@latest"
  subnetSelectorTerms:
    - tags:
        karpenter.sh/discovery: "${CLUSTER_NAME}" # replace with your cluster name
  securityGroupSelectorTerms:
    - tags:
        karpenter.sh/discovery: "${CLUSTER_NAME}" # replace with your cluster name
EOF
# 확인
kubectl get nodepool,ec2nodeclass
# Deploy a sample workload
cat <<EOF | kubectl apply -f -
apiVersion: apps/v1
kind: Deployment
metadata:
  name: inflate
spec:
  replicas: 5
  selector:
    matchLabels:
      app: inflate
  template:
    metadata:
      labels:
        app: inflate
    spec:
      terminationGracePeriodSeconds: 0
      securityContext:
        runAsUser: 1000
        runAsGroup: 3000
        fsGroup: 2000
      containers:
        - name: inflate
          image: public.ecr.aws/eks-distro/kubernetes/pause:3.7
          resources:
            requests:
              cpu: 1
              memory: 1.5Gi
          securityContext:
            allowPrivilegeEscalation: false
EOF
#
kubectl get nodes -L karpenter.sh/nodepool -L node.kubernetes.io/instance-type -L karpenter.sh/capacity-type
kubectl get nodeclaims
kubectl describe nodeclaims
kubectl logs -f -n "${KARPENTER_NAMESPACE}" -l app.kubernetes.io/name=karpenter -c controller | jq '.'
kubectl logs -n "${KARPENTER_NAMESPACE}" -l app.kubernetes.io/name=karpenter -c controller | grep 'launched nodeclaim' | jq '.'
# Scale the inflate workload from 5 to 12 replicas, triggering Karpenter to provision additional capacity
kubectl scale deployment/inflate --replicas 12
# This changes the total memory request for this deployment to around 18Gi (12 replicas x 1.5Gi),
# which, when adjusted to account for the roughly 600Mi reserved for the kubelet on each node, requires Karpenter to provision additional instances:
kubectl get nodeclaims
# Scale down the workload back down to 5 replicas
kubectl scale deployment/inflate --replicas 5
kubectl get nodeclaims
NAME TYPE CAPACITY ZONE NODE READY AGE
default-ffnzp c6g.2xlarge on-demand ap-northeast-2c ip-192-168-185-240.ap-northeast-2.compute.internal True 14m
# We can check the Karpenter logs to get an idea of what actions it took in response to our scaling in the deployment. Wait about 5-10 seconds before running the following command:
kubectl logs -f -n "${KARPENTER_NAMESPACE}" -l app.kubernetes.io/name=karpenter -c controller | jq '.'
{
"level": "INFO",
"time": "2025-03-02T08:19:13.969Z",
"logger": "controller",
"message": "disrupting nodeclaim(s) via delete, terminating 1 nodes (5 pods) ip-192-168-132-48.ap-northeast-2.compute.internal/c6g.2xlarge/on-demand",
"commit": "058c665",
"controller": "disruption",
"namespace": "",
"name": "",
"reconcileID": "a900df38-7189-42aa-a3b3-9fcaf944dcf4",
"command-id": "4b7ef3a5-6962-48a9-bd38-c9898580bb75",
"reason": "underutilized"
}
# Karpenter can also further consolidate if a node can be replaced with a cheaper variant in response to workload changes.
# This can be demonstrated by scaling the inflate deployment replicas down to 1, with a total memory request of around 1Gi:
kubectl scale deployment/inflate --replicas 1
kubectl logs -f -n "${KARPENTER_NAMESPACE}" -l app.kubernetes.io/name=karpenter -c controller | jq '.'
{
"level": "INFO",
"time": "2025-03-02T08:23:59.683Z",
"logger": "controller",
"message": "disrupting nodeclaim(s) via replace, terminating 1 nodes (1 pods) ip-192-168-185-240.ap-northeast-2.compute.internal/c6g.2xlarge/on-demand and replacing with on-demand node from types c6g.large, c7g.large, c5a.large, c6gd.large, m6g.large and 55 other(s)",
"commit": "058c665",
"controller": "disruption",
"namespace": "",
"name": "",
"reconcileID": "6669c544-e065-4c97-b594-ec1fb68b68b5",
"command-id": "b115c17f-3e29-48bc-8da8-d7073f189624",
"reason": "underutilized"
}
kubectl get nodeclaims
NAME TYPE CAPACITY ZONE NODE READY AGE
default-ff7xn c6g.large on-demand ap-northeast-2b ip-192-168-109-5.ap-northeast-2.compute.internal True 78s
default-ffnzp c6g.2xlarge on-demand ap-northeast-2c ip-192-168-185-240.ap-northeast-2.compute.internal True 16m
kubectl get nodeclaims
NAME TYPE CAPACITY ZONE NODE READY AGE
default-ff7xn c6g.large on-demand ap-northeast-2b ip-192-168-109-5.ap-northeast-2.compute.internal True 3m3s
# 삭제
kubectl delete deployment inflate
kubectl delete nodepool,ec2nodeclass default
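Before removing the chart, it is worth confirming that no Karpenter-provisioned NodeClaims or nodes remain; instances still owned by Karpenter may be left running if the controller is uninstalled first:
# confirm all Karpenter-managed capacity is gone
kubectl get nodeclaims
kubectl get nodes -l karpenter.sh/registered=true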
# Karpenter helm 삭제
helm uninstall karpenter --namespace "${KARPENTER_NAMESPACE}"
# Karpenter IAM Role 등 생성한 CloudFormation 삭제
aws cloudformation delete-stack --stack-name "Karpenter-${CLUSTER_NAME}"
# EC2 Launch Template 삭제
aws ec2 describe-launch-templates --filters "Name=tag:karpenter.k8s.aws/cluster,Values=${CLUSTER_NAME}" |
jq -r ".LaunchTemplates[].LaunchTemplateName" |
xargs -I{} aws ec2 delete-launch-template --launch-template-name {}
# 클러스터 삭제
eksctl delete cluster --name "${CLUSTER_NAME}"