Prometheus 모니터링 시스템

023·2024년 7월 24일

Data Driven Development Project

목록 보기

7/9

서론

최근에 우리는 CGV 영화 데이터를 정기적으로 크롤링하고, 이 과정을 Quartz 스케줄러로 자동화하며, 결과를 Logback을 사용하여 로깅하는 시스템을 개발하였다. 이제, 로깅된 데이터를 Prometheus로 통합하여 실시간 모니터링 및 경고 시스템을 구축하고자 한다. 이 글에서는 Prometheus와의 통합 과정과 그 중요성을 설명한다.

Prometheus 통합의 중요성

Prometheus는 오픈 소스 모니터링 솔루션으로, 시계열 데이터 관리에 최적화되어 있으며, 복잡한 쿼리, 실시간 경고, 대시보드를 지원한다. 크롤링 시스템의 로깅 데이터를 Prometheus와 연동하면, 시스템의 성능 지표를 실시간으로 관찰하고 문제를 즉각적으로 감지할 수 있다.

구현 과정

1. Logback과 Prometheus 연동

작업 설명: Logback으로 생성된 로깅 데이터를 Prometheus가 모니터링할 수 있도록 적절한 형식으로 전송하겠다. 이를 위해 Logback의 로거를 통해 생성된 메트릭을 Prometheus 포맷으로 변환하는 exporter를 구성하겠다.
구현 방법:

import io.prometheus.client.Counter;
import io.prometheus.client.Gauge;
import io.prometheus.client.exporter.HTTPServer;
import io.prometheus.client.hotspot.DefaultExports;

import java.io.IOException;

/**
 * Holds the crawler's Prometheus metrics and the static helpers that update them.
 *
 * <p>All metrics are registered at class-initialization time through
 * {@code register()}. {@link #start()} only enables the JVM default collectors;
 * {@link #start(int)} additionally opens an HTTP endpoint so that a Prometheus
 * server can scrape the registered metrics.
 */
public class PrometheusConfig {
    // Counts every request.
    private static final Counter requests = Counter.build()
            .name("requests_total").help("Total requests.").register();

    // Counts the pages that have been crawled.
    private static final Counter crawledPages = Counter.build()
            .name("crawled_pages_total").help("Total number of pages crawled.").register();

    // Count successful and failed crawl operations, respectively.
    private static final Counter crawlSuccesses = Counter.build()
            .name("successful_crawls_total").help("Total number of successful crawl operations.").register();
    private static final Counter crawlFailures = Counter.build()
            .name("failed_crawls_total").help("Total number of failed crawl operations.").register();

    // Crawler state gauge (1 = running, 0 = stopped).
    private static final Gauge crawlerStatus = Gauge.build()
            .name("crawler_status").help("Crawler status (1 = running, 0 = stopped)").register();

    /**
     * Enables the JVM default collectors.
     *
     * <p>This overload does NOT open an HTTP endpoint: the embedded server that
     * used to be created here on port 9090 was disabled. Use {@link #start(int)}
     * when the metrics must be scrapeable directly from this process.
     *
     * @throws IOException never thrown by the current body; kept in the signature
     *     so existing callers that catch it keep compiling
     */
    public static void start() throws IOException {
        DefaultExports.initialize();
    }

    /**
     * Enables the JVM default collectors and exposes the metrics over HTTP.
     *
     * <p>NOTE(review): pick a port that is free on the host. 9090 is also the
     * address at which the Prometheus server itself is reached in this setup,
     * so reusing it on the same machine would presumably clash - confirm.
     *
     * @param port TCP port the embedded metrics server listens on
     * @return the running server, so the caller can shut it down
     * @throws IOException if the server cannot be started (e.g. the port is taken)
     */
    public static HTTPServer start(int port) throws IOException {
        start();
        return new HTTPServer(port);
    }

    /** Increments the total request counter by one. */
    public static void incrementRequests() {
        requests.inc();
    }

    /** Increments the crawled-pages counter by one. */
    public static void incrementCrawledPages() {
        crawledPages.inc();
    }

    /** Records one successful crawl operation. */
    public static void incrementCrawlSuccess() {
        crawlSuccesses.inc();
    }

    /** Records one failed crawl operation. */
    public static void incrementCrawlFailure() {
        crawlFailures.inc();
    }

    /**
     * Publishes the crawler state through the {@code crawler_status} gauge.
     *
     * @param isRunning {@code true} sets the gauge to 1, {@code false} sets it to 0
     */
    public static void setCrawlerStatus(boolean isRunning) {
        crawlerStatus.set(isRunning ? 1 : 0);
    }
}

2. Prometheus 서버 설정

작업 설명: Prometheus 서버를 설정하여 로깅 데이터를 주기적으로 수집하겠다. Prometheus 서버는 로깅 데이터 외에도, 시스템의 다양한 성능 지표를 수집하고 저장한다.
구현 방법:

# Prometheus server configuration
global:
  # How often metrics are scraped from the targets.
  scrape_interval: 15s
  # How often the rules are evaluated.
  evaluation_interval: 15s

# Where fired alerts are sent: the Alertmanager address.
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            # Host and port on which Alertmanager is running.
            - 'localhost:9093'

# Files that contain the alerting rules.
rule_files:
  - "alert_rules.yml"

# Endpoints from which Prometheus collects metrics.
scrape_configs:
  - job_name: 'CrawlerJob'
    metrics_path: '/metrics'
    static_configs:
      - targets:
          # Host and port of the crawler's metrics HTTP server.
          # NOTE(review): 9090 is also where the Prometheus server itself is
          # reached in this setup - confirm the crawler really listens here.
          - 'localhost:9090'

  - job_name: 'pushgateway'
    honor_labels: true
    static_configs:
      - targets:
          - 'localhost:9091'

3. 대시보드 구성 및 경고 설정

작업 설명: Prometheus와 Grafana를 연동하여 크롤링 및 로깅 데이터에 대한 시각적 대시보드를 구성하겠다. 또한, 성능 이슈 또는 예외 발생 시 알림을 받을 수 있는 경고 규칙을 설정하겠다.
구현 방법:

groups:  # Alerting rule groups.
  - name: prometheus  # Group name.
    rules:  # Alerting rules of this group.
    - alert: HighRequestLatency  # Alert name.
      # PromQL condition that triggers the alert.
      # NOTE(review): job:request_latency_seconds:mean5m looks like a recorded
      # series; no recording rule producing it is visible here - confirm it exists.
      expr: job:request_latency_seconds:mean5m{job="CrawlerJob"} > 0.5
      for: 10m  # How long the condition must hold before the alert fires.
      labels:  # Extra labels attached to the alert.
        severity: 'page'
      annotations:  # Additional information about the alert.
        summary: "High request latency"
        description: "Request latency is above 0.5s for more than 10 minutes."

    - alert: CrawlerStart
      # The job label must match the scrape job name 'CrawlerJob'; the previous
      # value "crawler" matched no scrape job, so this alert could never fire.
      expr: up{job="CrawlerJob"} == 1
      for: 1m
      labels:
        severity: info
      annotations:
        summary: "Crawler started"
        description: "The crawler has started."

    - alert: CrawlerSuccess
      # At least one successful crawl was counted within the last minute.
      expr: increase(successful_crawls_total[1m]) > 0
      for: 1m
      labels:
        severity: info
      annotations:
        summary: "Crawler succeeded"
        description: "The crawler has successfully fetched movie data."

    - alert: CrawlerDown
      # No crawler_status series with value 1 (running) exists for the job.
      expr: absent(crawler_status{job="CrawlerJob"} == 1)
      for: 1m
      labels:
        severity: critical
      annotations:
        summary: "Crawler is down"
        description: "The crawler has been down for more than 1 minute."

    - alert: HighErrorRate
      # More than 5 failed crawls within one minute.
      expr: increase(failed_crawls_total[1m]) > 5
      for: 1m
      labels:
        severity: critical
      annotations:
        summary: "High error rate in crawling"
        description: "Failed crawl rate is {{ $value }} failures per minute."

    - alert: NoMoviesFetched
      # Requests were made during the last 5 minutes but no crawl succeeded.
      expr: increase(requests_total[5m]) > 0 and increase(successful_crawls_total[5m]) == 0
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: "No movies fetched"
        description: "Requests were made but no movies were fetched in the last 5 minutes."

이렇게 설정한 뒤 Prometheus 서버를 가동하고 localhost:9090에 접속하면 Prometheus 웹 UI를 확인할 수 있다.

결론

이 프로젝트를 통해 구축한 Prometheus 모니터링 시스템은 크롤링 작업의 안정성과 투명성을 제고하였다. 실시간 데이터 분석과 경고 시스템은 우리가 데이터 수집 작업에서 발생할 수 있는 문제를 빠르게 파악하고 해결할 수 있게 해준다. 앞으로도 이 시스템을 지속적으로 개선하여 더욱 정밀하고 효과적인 데이터 모니터링 환경을 구축해 나가겠다.

023

Get your hands dirty

이전 포스트

Logback/SLF4J 로깅 시스템

다음 포스트