
국내 최대 온라인 쇼핑몰의 주문 시스템을 운영하고 있습니다.
2021년 말, 글로벌 진출을 위해 일본 리전에 서비스를 확장 배포했으나,
다음과 같은 문제가 발생했습니다.
/**
 * Uploads product images to the configured S3 bucket.
 */
@Service
@Slf4j
public class ProductImageService {
    /** Key prefix under which every product image is stored. */
    private static final String KEY_PREFIX = "products/";

    @Value("${aws.s3.bucket.name}")
    private String bucketName;

    private final AmazonS3 s3Client;

    /**
     * @param s3Client S3 client used for all uploads
     */
    public ProductImageService(AmazonS3 s3Client) {
        this.s3Client = s3Client;
    }

    /**
     * Stores the uploaded file under a random key below {@code products/} and returns the
     * URL of the stored object.
     *
     * @param file multipart upload received from the client
     * @return URL of the stored object, as reported by the S3 client
     * @throws RuntimeException if reading the upload or writing to S3 fails
     */
    public String uploadImage(MultipartFile file) {
        String objectKey = KEY_PREFIX + UUID.randomUUID();

        // Without an explicit content length the SDK buffers the whole stream in memory
        // to compute it, so pass on what the multipart upload already knows.
        ObjectMetadata metadata = new ObjectMetadata();
        metadata.setContentLength(file.getSize());
        if (file.getContentType() != null) {
            metadata.setContentType(file.getContentType());
        }

        // Legacy note from the original code: this path reaches S3 over the internet.
        try (java.io.InputStream content = file.getInputStream()) {
            s3Client.putObject(bucketName, objectKey, content, metadata);
            // The version id of the PutObject result is null on unversioned buckets and is
            // never a URL; resolve the object's URL from bucket and key instead.
            String imageUrl = s3Client.getUrl(bucketName, objectKey).toString();
            log.info("Image upload completed: {}", imageUrl);
            return imageUrl;
        } catch (Exception e) {
            log.error("Image upload failed", e);
            throw new RuntimeException("Image upload failed", e);
        }
    }
}

/**
 * Spring configuration exposing the EC2 client and the VPC peering manager built on it.
 */
@Configuration
public class AwsConfig {

    /**
     * Creates the EC2 client for the {@code AP_NORTHEAST_1} region.
     *
     * @return a newly built EC2 client
     */
    @Bean
    public AmazonEC2 ec2Client() {
        AmazonEC2ClientBuilder regionalBuilder =
                AmazonEC2ClientBuilder.standard().withRegion(Regions.AP_NORTHEAST_1);
        return regionalBuilder.build();
    }

    /**
     * Creates the peering manager on top of the given EC2 client.
     *
     * @param ec2Client client the manager issues its peering calls through
     * @return a new peering manager
     */
    @Bean
    public VpcPeeringConnectionManager peeringManager(AmazonEC2 ec2Client) {
        VpcPeeringConnectionManager manager = new VpcPeeringConnectionManager(ec2Client);
        return manager;
    }
}
/**
 * Issues VPC peering requests through the EC2 API.
 *
 * NOTE(review): this class is annotated with {@code @Component} and is also constructed
 * by the {@code peeringManager} bean method in {@code AwsConfig}; if both registrations
 * are active there will be two beans of this type - confirm which one is intended.
 */
@Component
@Slf4j
public class VpcPeeringConnectionManager {
    private final AmazonEC2 ec2Client;

    /**
     * @param ec2Client EC2 client used to create peering connections
     */
    public VpcPeeringConnectionManager(AmazonEC2 ec2Client) {
        this.ec2Client = ec2Client;
    }

    /**
     * Requests a peering connection from {@code vpcId} to {@code peerVpcId} and logs the
     * identifier of the connection returned by EC2.
     *
     * @param vpcId     requester VPC id
     * @param peerVpcId VPC id to peer with
     */
    public void createPeeringConnection(String vpcId, String peerVpcId) {
        CreateVpcPeeringConnectionRequest request = new CreateVpcPeeringConnectionRequest()
                .withVpcId(vpcId)
                .withPeerVpcId(peerVpcId);
        CreateVpcPeeringConnectionResult result =
                ec2Client.createVpcPeeringConnection(request);
        log.info("VPC Peering connection created: {}",
                result.getVpcPeeringConnection().getVpcPeeringConnectionId());
    }
}
# application.yml
# Settings for the S3 VPC endpoint. The nesting matters: the application reads
# the value below through the property path
# cloud.aws.vpc.endpoint.s3.service-name.
cloud:
  aws:
    vpc:
      endpoint:
        s3:
          enabled: true
          # VPC endpoint service name (not a URL).
          service-name: com.amazonaws.ap-northeast-1.s3
          private-dns-enabled: true
/**
 * Spring configuration for the S3 client used in the {@code AP_NORTHEAST_1} region.
 */
@Configuration
public class S3Config {
    /** Prefix shared by VPC endpoint service names such as {@code com.amazonaws.<region>.s3}. */
    private static final String VPC_ENDPOINT_SERVICE_PREFIX = "com.amazonaws.";

    @Value("${cloud.aws.vpc.endpoint.s3.service-name}")
    private String serviceEndpoint;

    /**
     * Builds the S3 client with bounded connection pool and timeouts.
     *
     * <p>The configured value is only used as an endpoint override when it is not a VPC
     * endpoint service name: a service name identifies the endpoint service, it is not a
     * host the SDK can connect to. In that case the client is bound to the region and
     * resolves the regional S3 endpoint itself.
     *
     * @return the configured S3 client
     */
    @Bean
    public AmazonS3 s3Client() {
        ClientConfiguration clientConfiguration = new ClientConfiguration()
                .withMaxConnections(100)
                .withConnectionTimeout(5000)
                .withSocketTimeout(25000);

        AmazonS3ClientBuilder builder = AmazonS3ClientBuilder.standard()
                .withClientConfiguration(clientConfiguration)
                .withPathStyleAccessEnabled(true);

        if (serviceEndpoint == null || serviceEndpoint.startsWith(VPC_ENDPOINT_SERVICE_PREFIX)) {
            builder.withRegion(Regions.AP_NORTHEAST_1);
        } else {
            builder.withEndpointConfiguration(
                    new AwsClientBuilder.EndpointConfiguration(
                            serviceEndpoint,
                            Regions.AP_NORTHEAST_1.getName()));
        }
        return builder.build();
    }
}
/**
 * Periodically measures TCP connect latency to a target host and publishes it as metrics.
 */
@Component
@Slf4j
public class NetworkPerformanceMonitor {
    /** Connect timeout for the latency probe, in milliseconds. */
    private static final int CONNECT_TIMEOUT_MS = 1000;

    private final MeterRegistry meterRegistry;

    /**
     * Backing value of the {@code network.latency.ms} gauge. It is kept in a field and
     * registered once so the gauge always reads a live object rather than a temporary
     * boxed number created on each run.
     */
    private final java.util.concurrent.atomic.AtomicLong lastLatencyMs;

    /**
     * @param meterRegistry registry the timer and gauge are registered with
     */
    public NetworkPerformanceMonitor(MeterRegistry meterRegistry) {
        this.meterRegistry = meterRegistry;
        this.lastLatencyMs = meterRegistry.gauge(
                "network.latency.ms", new java.util.concurrent.atomic.AtomicLong());
    }

    /**
     * Runs one latency probe every 60 seconds, timed under {@code network.latency}
     * with tag {@code type=vpc_peering}.
     */
    @Scheduled(fixedRate = 60000)
    public void monitorNetworkPerformance() {
        Timer.builder("network.latency")
                .tag("type", "vpc_peering")
                .register(meterRegistry)
                .record(() -> {
                    // Measure VPC peering latency.
                    measurePeeringLatency();
                });
    }

    /**
     * Opens a TCP connection to the target and records the connect time in milliseconds.
     * Failures are logged and leave the previous gauge value in place.
     */
    private void measurePeeringLatency() {
        // NOTE(review): targetHost and targetPort are not declared in this class as shown;
        // confirm where they are defined.
        // try-with-resources closes the probe socket on every path.
        try (Socket socket = new Socket()) {
            // nanoTime is monotonic, so the measurement is immune to wall-clock changes.
            long startNanos = System.nanoTime();
            socket.connect(new InetSocketAddress(targetHost, targetPort), CONNECT_TIMEOUT_MS);
            long elapsedMs = (System.nanoTime() - startNanos) / 1_000_000L;
            lastLatencyMs.set(elapsedMs);
        } catch (Exception e) {
            log.error("Failed to measure network latency", e);
        }
    }
}

{
"Routes": [
{
"DestinationCidrBlock": "10.0.0.0/16",
"VpcPeeringConnectionId": "pcx-xxxxxx",
"State": "active"
}
]
}
# S3 VPC endpoint for the main VPC, associated with the private route table.
resource "aws_vpc_endpoint" "s3" {
  service_name    = "com.amazonaws.ap-northeast-1.s3"
  vpc_id          = aws_vpc.main.id
  route_table_ids = [aws_route_table.private.id]

  # Endpoint policy: any principal may read and write objects,
  # but only inside the main bucket.
  policy = jsonencode({
    Version = "2012-10-17"
    Statement = [
      {
        Effect    = "Allow"
        Principal = "*"
        Action    = ["s3:GetObject", "s3:PutObject"]
        Resource  = ["${aws_s3_bucket.main.arn}/*"]
      }
    ]
  })
}
/**
 * Spring configuration that declares the network dashboard bean.
 */
@Configuration
public class GrafanaConfig {
/**
 * Builds a dashboard with two panels: "VPC Peering Latency", bound to the metric
 * expression {@code network.latency{type='vpc_peering'}}, and "S3 Operation Latency",
 * bound to {@code aws.s3.operation.latency}.
 *
 * @return the assembled dashboard
 */
@Bean
public GrafanaDashboard networkDashboard() {
return GrafanaDashboard.builder()
// Panel 1: latency tagged as VPC peering.
.addPanel(
Panel.builder()
.withTitle("VPC Peering Latency")
.withMetric("network.latency{type='vpc_peering'}")
.build()
)
// Panel 2: S3 operation latency.
.addPanel(
Panel.builder()
.withTitle("S3 Operation Latency")
.withMetric("aws.s3.operation.latency")
.build()
)
.build();
}
}
/**
 * Attempts to re-establish a broken VPC peering connection, retrying on
 * {@code VpcPeeringException} and notifying operators when recovery is not possible.
 */
@Component
@Slf4j
public class VpcPeeringFailoverHandler {

    /**
     * Tries to re-establish the given peering connection.
     *
     * <p>A {@code VpcPeeringException} is rethrown so the retry configuration declared on
     * this method can actually take effect; catching it here would make every attempt look
     * successful and the retry would never run. Any other failure is logged and reported
     * to operators immediately.
     *
     * @param connectionId identifier of the peering connection to recover
     */
    @Retryable(
            value = { VpcPeeringException.class },
            maxAttempts = 3,
            backoff = @Backoff(delay = 1000))
    public void handlePeeringFailure(String connectionId) {
        try {
            // Re-establish the peering connection.
            reestablishPeeringConnection(connectionId);
        } catch (Exception e) {
            if (e instanceof VpcPeeringException) {
                // NOTE(review): assumes VpcPeeringException is unchecked - confirm.
                log.warn("Peering recovery attempt failed for {}; retrying", connectionId, e);
                throw (VpcPeeringException) e;
            }
            log.error("Failed to recover peering connection", e);
            notifyOperators(connectionId);
        }
    }

    /**
     * Fallback invoked once all retry attempts have failed: logs the last error and
     * notifies operators, so the caller still never sees an exception.
     *
     * @param e            failure of the final attempt
     * @param connectionId identifier of the peering connection that could not be recovered
     */
    @Recover
    public void recoverFromPeeringFailure(VpcPeeringException e, String connectionId) {
        log.error("Failed to recover peering connection", e);
        notifyOperators(connectionId);
    }
}
/**
 * Scans recent VPC flow log events for rejected connections and raises a security alert
 * for source addresses that are rejected too often.
 */
@Component
@Slf4j
public class VpcFlowLogAnalyzer {
    /** Number of rejections per source address above which an alert is sent. */
    private static final int REJECT_ALERT_THRESHOLD = 100;
    /** Position of the source address in a space-separated flow log record. */
    private static final int SOURCE_IP_FIELD = 3;

    private final AmazonCloudWatchLogs cloudWatchLogsClient;
    private final AlertService alertService;

    /**
     * @param cloudWatchLogsClient client used to read the flow log events
     * @param alertService         sink for security alerts
     */
    public VpcFlowLogAnalyzer(AmazonCloudWatchLogs cloudWatchLogsClient, AlertService alertService) {
        this.cloudWatchLogsClient = cloudWatchLogsClient;
        this.alertService = alertService;
    }

    /**
     * Runs every 5 minutes over the events of the last 5 minutes in
     * {@code /aws/vpc/flowlogs}, counting {@code REJECT} records per source address.
     * An alert is sent once per address and run, at the moment its count first exceeds
     * the threshold.
     */
    @Scheduled(fixedRate = 300000) // runs every 5 minutes
    public void analyzeFlowLogs() {
        // NOTE(review): the request names only a log group; confirm whether the logs API
        // in use also needs a log stream name for this call.
        GetLogEventsRequest request = new GetLogEventsRequest()
                .withLogGroupName("/aws/vpc/flowlogs")
                .withStartTime(System.currentTimeMillis() - 300000);
        GetLogEventsResult result = cloudWatchLogsClient.getLogEvents(request);

        Map<String, Integer> rejectedConnections = new HashMap<>();
        for (OutputLogEvent event : result.getEvents()) {
            String message = event.getMessage();
            if (message == null || !message.contains("REJECT")) {
                continue;
            }
            String[] parts = message.split(" ");
            if (parts.length <= SOURCE_IP_FIELD) {
                // Truncated or differently formatted record: skip it instead of failing the run.
                log.warn("Skipping malformed flow log record: {}", message);
                continue;
            }
            String sourceIP = parts[SOURCE_IP_FIELD];
            int rejections = rejectedConnections.merge(sourceIP, 1, Integer::sum);
            // Alert exactly when the threshold is crossed, not again for every later record.
            if (rejections == REJECT_ALERT_THRESHOLD + 1) {
                alertService.sendSecurityAlert(
                        String.format("다수의 연결 거부 감지: %s", sourceIP));
            }
        }
    }
}
/**
 * Audits security groups for ingress rules that are open to every IPv4 address.
 */
@Service
public class SecurityGroupAuditor {

    /**
     * Walks every IP range of every permission of every security group returned by EC2
     * and reports one violation per range equal to {@code 0.0.0.0/0}.
     *
     * @return violations in the order the groups, permissions and ranges were listed
     */
    public List<SecurityGroupViolation> auditSecurityGroups() {
        return ec2Client.describeSecurityGroups().getSecurityGroups().stream()
                .flatMap(group -> group.getIpPermissions().stream()
                        .flatMap(rule -> rule.getIpRanges().stream()
                                // Keep only ranges that are open to the whole internet.
                                .filter(range -> "0.0.0.0/0".equals(range.getCidrIp()))
                                .map(range -> new SecurityGroupViolation(
                                        group.getGroupId(),
                                        "전체 개방된 포트 발견",
                                        rule.getFromPort(),
                                        rule.getToPort()))))
                .collect(java.util.stream.Collectors.toCollection(ArrayList::new));
    }
}
#!/usr/bin/env python3
import boto3
import yaml
def setup_vpc_peering(config_file):
    """Create a VPC peering connection and route traffic for the peer CIDR through it.

    The YAML file at ``config_file`` must define ``source_vpc``, ``target_vpc``,
    ``route_table_id`` and ``target_cidr``.

    Args:
        config_file: Path to the YAML configuration file.

    Returns:
        The id of the created VPC peering connection.

    Raises:
        KeyError: If a required key is missing from the configuration. All keys
            are read before any AWS call, so nothing is created in that case.
    """
    with open(config_file, 'r') as f:
        config = yaml.safe_load(f)

    # Read every required setting up front so a missing key cannot leave a
    # half-configured peering connection behind.
    source_vpc = config['source_vpc']
    target_vpc = config['target_vpc']
    route_table_id = config['route_table_id']
    target_cidr = config['target_cidr']

    ec2 = boto3.client('ec2')

    # Create the VPC peering connection.
    response = ec2.create_vpc_peering_connection(
        VpcId=source_vpc,
        PeerVpcId=target_vpc
    )
    peering_id = response['VpcPeeringConnection']['VpcPeeringConnectionId']

    # Update the route table.
    ec2.create_route(
        RouteTableId=route_table_id,
        DestinationCidrBlock=target_cidr,
        VpcPeeringConnectionId=peering_id
    )
    print(f"VPC Peering setup completed: {peering_id}")
    return peering_id
/**
 * Probes a fixed set of service endpoints once a minute and raises alerts for slow or
 * unreachable ones.
 */
@Service
@Slf4j
public class NetworkHealthChecker {
    /** Latency above which a latency alert is sent, in milliseconds. */
    private static final long LATENCY_ALERT_THRESHOLD_MS = 1000;

    private final RestTemplate restTemplate;
    private final AlertService alertService;

    /**
     * @param restTemplate HTTP client used for the probes
     * @param alertService sink for latency and service-down alerts
     */
    public NetworkHealthChecker(RestTemplate restTemplate, AlertService alertService) {
        this.restTemplate = restTemplate;
        this.alertService = alertService;
    }

    /**
     * Probes all endpoints in parallel, waits for every probe to finish, then hands the
     * result map to the analysis step.
     */
    @Scheduled(fixedRate = 60000)
    public void checkNetworkHealth() {
        // NOTE(review): nothing in this class writes to this map, so the analysis step
        // always receives it empty - confirm how ServiceHealth entries should be produced.
        Map<String, ServiceHealth> healthResults = new ConcurrentHashMap<>();
        // Health-check each endpoint.
        CompletableFuture.allOf(
                checkEndpoint("s3", "https://s3.ap-northeast-1.amazonaws.com"),
                checkEndpoint("dynamodb", "https://dynamodb.ap-northeast-1.amazonaws.com"),
                checkEndpoint("internal-api", "http://internal-service.local")
        ).join();
        // Analyze and report the results.
        analyzeHealthResults(healthResults);
    }

    /**
     * Asynchronously requests {@code url + "/health"}. A latency alert is sent when the
     * call takes longer than one second; any exception is logged and reported as a
     * service-down alert.
     *
     * @param service logical service name used in logs and alerts
     * @param url     base URL of the endpoint
     * @return future that completes when the probe has finished
     */
    private CompletableFuture<Void> checkEndpoint(String service, String url) {
        return CompletableFuture.runAsync(() -> {
            try {
                // nanoTime is monotonic, so the measurement is immune to wall-clock changes.
                long startNanos = System.nanoTime();
                restTemplate.getForEntity(url + "/health", String.class);
                long latency = (System.nanoTime() - startNanos) / 1_000_000L;
                if (latency > LATENCY_ALERT_THRESHOLD_MS) {
                    alertService.sendLatencyAlert(service, latency);
                }
            } catch (Exception e) {
                log.error("Health check failed for {}", service, e);
                alertService.sendServiceDownAlert(service);
            }
        });
    }
}
@Component
public class PeeringFailureDetector {
@Scheduled(fixedRate = 30000)
public void detectFailures() {
List<VpcPeeringConnection> connections =
ec2Client.describeVpcPeeringConnections();
connections.stream()
.filter(conn -> "failed".equals(conn.getStatus()))
.forEach(this::initiateRecovery);
}
private void initiateRecovery(VpcPeeringConnection conn) {
// 1. 기존 연결 삭제
ec2Client.deleteVpcPeeringConnection(conn.getVpcPeeringConnectionId());
// 2. 새로운 연결 생성
CreateVpcPeeringConnectionRequest request =
new CreateVpcPeeringConnectionRequest()
.withVpcId(conn.getRequesterVpcId())
.withPeerVpcId(conn.getAccepterVpcId());
ec2Client.createVpcPeeringConnection(request);
}
}
/**
 * Replays synchronization events that previously failed.
 */
@Service
@Slf4j
public class DataSyncRecoveryService {
    // NOTE(review): getFailedEvents is called on this queue below, which java.util.Queue
    // does not offer - confirm which Queue type is meant here.
    private final Queue<SyncEvent> syncEventQueue;
    private final TransactionManager txManager;

    /**
     * @param syncEventQueue source of the failed events
     * @param txManager      transaction manager (not used by the code shown here)
     */
    public DataSyncRecoveryService(Queue<SyncEvent> syncEventQueue, TransactionManager txManager) {
        this.syncEventQueue = syncEventQueue;
        this.txManager = txManager;
    }

    /**
     * Re-processes every failed event of the given sync: validate, resync, then update
     * the checkpoint. A failure on one event is logged and handed to the failure handler;
     * the remaining events are still processed.
     *
     * @param syncId identifier of the sync whose failed events are recovered
     */
    @Transactional
    public void recoverFailedSync(String syncId) {
        List<SyncEvent> failedEvents =
                syncEventQueue.getFailedEvents(syncId);
        for (SyncEvent event : failedEvents) {
            try {
                // 1. Check the sync state.
                validateSyncState(event);
                // 2. Resynchronize the data.
                resyncData(event);
                // 3. Update the checkpoint.
                updateCheckpoint(event);
            } catch (Exception e) {
                log.error("Sync recovery failed for event: {}", event, e);
                handleRecoveryFailure(event);
            }
        }
    }
}
/**
 * Load test that issues API calls from many threads in parallel and analyzes the
 * collected per-thread results.
 */
@SpringBootTest
public class VpcPeeringLoadTest {

    /**
     * Runs 100 workers with 1000 calls each, waits for all workers to finish, then hands
     * their results to the analysis step.
     *
     * @throws InterruptedException if the test thread is interrupted while waiting
     */
    @Test
    void performLoadTest() throws InterruptedException {
        int threadCount = 100;
        int requestsPerThread = 1000;
        ExecutorService executor = Executors.newFixedThreadPool(threadCount);
        CountDownLatch latch = new CountDownLatch(threadCount);
        List<Future<TestResult>> futures = new ArrayList<>();
        try {
            for (int i = 0; i < threadCount; i++) {
                futures.add(executor.submit(() -> {
                    try {
                        TestResult result = new TestResult();
                        for (int j = 0; j < requestsPerThread; j++) {
                            long startTime = System.nanoTime();
                            try {
                                // Exercise the API call under test.
                                performApiCall();
                                result.addSuccess();
                            } catch (Exception e) {
                                result.addFailure();
                            }
                            result.addLatency(System.nanoTime() - startTime);
                        }
                        return result;
                    } finally {
                        // Count down on every path so a crashing worker cannot hang the test.
                        latch.countDown();
                    }
                }));
            }
            // Wait for every worker before analyzing.
            latch.await();
            // Collect and analyze the results.
            analyzeTestResults(futures);
        } finally {
            // Release the worker threads even when the test fails.
            executor.shutdown();
        }
    }
}
/**
 * Checks per-service cost figures every hour and sends an alert for expensive entries.
 */
@Service
@Slf4j
public class NetworkCostAnalyzer {
    /** Name of the cost metric requested and evaluated. */
    private static final String COST_METRIC = "UnblendedCost";
    /** Cost per reported period above which an alert is sent. */
    private static final double COST_ALERT_THRESHOLD = 100.0;

    private final AmazonCostExplorer costExplorerClient;
    private final NotificationService notificationService;

    /**
     * @param costExplorerClient  client used to query cost and usage
     * @param notificationService sink for cost alerts
     */
    public NetworkCostAnalyzer(AmazonCostExplorer costExplorerClient,
                               NotificationService notificationService) {
        this.costExplorerClient = costExplorerClient;
        this.notificationService = notificationService;
    }

    /**
     * Queries hourly {@code UnblendedCost} grouped by the {@code SERVICE} dimension for
     * the period from {@code getStartOfDay()} to {@code getEndOfDay()} and alerts on
     * every group whose cost exceeds the threshold.
     *
     * <p>NOTE(review): each hourly run evaluates the whole period again, so a period that
     * exceeded the threshold is presumably alerted on again by later runs - confirm
     * whether that is intended.
     */
    @Scheduled(cron = "0 0 * * * *") // runs every hour
    public void analyzeCosts() {
        GetCostAndUsageRequest request = new GetCostAndUsageRequest()
                .withTimePeriod(new DateInterval()
                        .withStart(getStartOfDay())
                        .withEnd(getEndOfDay()))
                .withGranularity(Granularity.HOURLY)
                .withMetrics(COST_METRIC)
                .withGroupBy(new GroupDefinition()
                        .withType("DIMENSION")
                        .withKey("SERVICE"));
        GetCostAndUsageResult result = costExplorerClient.getCostAndUsage(request);

        // Evaluate the costs and alert.
        for (ResultByTime period : result.getResultsByTime()) {
            for (Group group : period.getGroups()) {
                if (group.getMetrics() == null || group.getMetrics().get(COST_METRIC) == null) {
                    // No figure for this group in this period: nothing to compare.
                    log.warn("No {} metric for group {}", COST_METRIC, group.getKeys());
                    continue;
                }
                double cost = Double.parseDouble(group.getMetrics().get(COST_METRIC).getAmount());
                if (cost > COST_ALERT_THRESHOLD) { // alert above 100 dollars per hour
                    notificationService.sendCostAlert(group.getKeys().get(0), cost);
                }
            }
        }
    }
}
@Component
public class NetworkResourceOptimizer {
@Scheduled(cron = "0 0 0 * * *") // 매일 자정 실행
public void optimizeResources() {
// 미사용 VPC Endpoint 검출
findUnusedEndpoints().forEach(this::cleanupEndpoint);
// 저활용 NAT Gateway 검출
findUnderutilizedNatGateways().forEach(this::recommendDownsize);
// 불필요한 VPC Peering 연결 검출
findUnusedPeering().forEach(this::recommendCleanup);
}
private void cleanupEndpoint(String endpointId) {
try {
ec2Client.deleteVpcEndpoints(new DeleteVpcEndpointsRequest()
.withVpcEndpointIds(endpointId));
log.info("Cleaned up unused endpoint: {}", endpointId);
} catch (Exception e) {
log.error("Failed to cleanup endpoint", e);
}
}
}
# .github/workflows/network-validation.yml
# Validates Terraform and network configuration on every pull request that
# touches the terraform/ or network/ directories.
name: Network Configuration Validation
on:
  pull_request:
    paths:
      - 'terraform/**'
      - 'network/**'
jobs:
  validate:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - name: Setup Terraform
        uses: hashicorp/setup-terraform@v3
      - name: Terraform Format Check
        run: terraform fmt -check
      - name: Terraform Plan
        run: |
          terraform init
          terraform plan -out=tfplan
      - name: Network Security Check
        run: |
          ./scripts/validate-network-security.sh
          ./scripts/check-vpc-peering.sh
          ./scripts/validate-endpoints.sh
/**
 * Rolls out network stack changes in canary stages and rolls back on failure.
 */
@Service
@Slf4j
public class NetworkDeploymentManager {
    /** Observation time after each traffic shift, in milliseconds (5 minutes). */
    private static final long STAGE_OBSERVATION_MILLIS = 300000L;

    private final CloudFormationClient cloudFormationClient;

    /**
     * @param cloudFormationClient client for the stack operations
     */
    public NetworkDeploymentManager(CloudFormationClient cloudFormationClient) {
        this.cloudFormationClient = cloudFormationClient;
    }

    /**
     * Validates the template, deploys it in canary stages and monitors the result.
     * Any failure in one of these steps is logged and triggers exactly one rollback;
     * no exception reaches the caller.
     *
     * @param stackName   stack to update
     * @param templateUrl location of the template to deploy
     */
    public void deployNetworkChanges(String stackName, String templateUrl) {
        try {
            // 1. Validate the changes.
            validateChanges(templateUrl);
            // 2. Deploy gradually.
            deployWithCanary(stackName, templateUrl);
            // 3. Monitor the state.
            monitorDeployment(stackName);
        } catch (Exception e) {
            log.error("Deployment failed", e);
            initiateRollback(stackName);
        }
    }

    /**
     * Shifts traffic in stages of 10%, 30%, 60% and 100%, observing each stage for five
     * minutes before checking health.
     *
     * @param stackName   stack to update
     * @param templateUrl location of the template being deployed
     * @throws IllegalStateException if a stage is unhealthy or the wait is interrupted;
     *         the caller reacts with a rollback, so a failed canary is never followed by
     *         the monitoring step
     */
    private void deployWithCanary(String stackName, String templateUrl) {
        List<Integer> deploymentStages = Arrays.asList(10, 30, 60, 100);
        for (Integer percentage : deploymentStages) {
            updateTrafficPercentage(stackName, percentage);
            // Observe the stage for 5 minutes.
            try {
                Thread.sleep(STAGE_OBSERVATION_MILLIS);
            } catch (InterruptedException e) {
                // Keep the interrupt visible to the caller's thread and abort the rollout.
                Thread.currentThread().interrupt();
                throw new IllegalStateException(
                        "Canary deployment interrupted at " + percentage + "%", e);
            }
            if (!isHealthy(stackName)) {
                log.error("Canary deployment failed at {}%", percentage);
                throw new IllegalStateException(
                        "Canary deployment failed at " + percentage + "%");
            }
        }
    }
}
/**
 * Assembles the ordered diagnostic checks for a network issue.
 */
@Component
public class NetworkTroubleshooter {

    /**
     * Builds the troubleshooting sequence for the given issue: connectivity, security
     * groups, NACLs, route tables, VPC peering status and DNS settings, in that order.
     *
     * @param issue the issue to diagnose
     * @return the built step sequence
     */
    public TroubleshootingStep diagnoseProblem(NetworkIssue issue) {
        return TroubleshootingStep.builder()
                .addStep("연결성 확인", () -> checkConnectivity(issue))
                .addStep("보안그룹 검증", () -> validateSecurityGroups(issue))
                .addStep("NACL 검증", () -> validateNacls(issue))
                .addStep("라우팅 테이블 검증", () -> validateRouteTables(issue))
                .addStep("VPC 피어링 상태 확인", () -> checkPeeringStatus(issue))
                .addStep("DNS 설정 확인", () -> validateDnsSettings(issue))
                .build();
    }

    /**
     * Validates every security group rule of the affected resource that is relevant to
     * the issue and folds the individual results into one, starting from a success
     * result.
     *
     * @param issue the issue whose resource is checked
     * @return the combined result of all relevant rule validations
     */
    private CheckResult validateSecurityGroups(NetworkIssue issue) {
        return getSecurityGroupRules(issue.getResourceId()).stream()
                .filter(candidate -> isRuleRelevant(candidate, issue))
                .map(candidate -> validateRule(candidate))
                .reduce(CheckResult.success(), CheckResult::combine);
    }
}
@Service
public class AutoRemediationService {
@EventListener(NetworkIssueEvent.class)
public void handleNetworkIssue(NetworkIssueEvent event) {
switch (event.getIssueType()) {
case VPC_PEERING_FAILURE:
handlePeeringFailure(event);
break;
case ENDPOINT_CONNECTIVITY:
handleEndpointIssue(event);
break;
case ROUTING_ERROR:
handleRoutingIssue(event);
break;
}
}
private void handlePeeringFailure(NetworkIssueEvent event) {
// 1. 현재 상태 백업
backupCurrentState(event.getResourceId());
// 2. 자동 복구 시도
try {
repairPeeringConnection(event.getResourceId());
} catch (Exception e) {
// 3. 실패시 롤백
rollbackChanges(event.getResourceId());
}
}
}

/**
 * Runs a 30-minute load test with 1000 concurrent users against the API endpoint and
 * checks p95 latency, error rate and throughput against fixed limits.
 */
@Test
public void loadTestResults() {
    // Acceptance limits.
    Duration maxP95Latency = Duration.ofMillis(200);
    // Intended as "below 0.1%" - NOTE(review): confirm the unit getErrorRate() reports.
    double maxErrorRate = 0.1;
    // At least 5000 TPS.
    int minThroughput = 5000;

    // Configure and run the test.
    LoadTestResult result = LoadTester.builder()
            .withConcurrentUsers(1000)
            .withDuration(Duration.ofMinutes(30))
            .withEndpoint("https://api.example.com")
            .build()
            .execute();

    // Verify the results.
    assertThat(result.getP95Latency()).isLessThan(maxP95Latency);
    assertThat(result.getErrorRate()).isLessThan(maxErrorRate);
    assertThat(result.getThroughput()).isGreaterThan(minThroughput);
}