Spring Boot

Grafana Integration

Grafana is the visualisation layer for Prometheus metrics. It connects to Prometheus as a data source and renders time-series dashboards — request rate, error rate, latency percentiles, JVM memory, connection pool health, and custom business metrics. Grafana dashboards are defined as JSON and can be version-controlled, shared, and provisioned automatically at startup.

Docker Compose Setup

The quickest way to run the full observability stack locally is Docker Compose with Spring Boot, Prometheus, and Grafana. Grafana is provisioned at startup with a Prometheus data source and pre-built dashboards — no manual UI configuration required.

yaml

# docker-compose.yml
version: "3.9"

services:

  # ── Your Spring Boot microservice: ───────────────────────────────────
  # Built from ./order-service. Publishes the application port (8082) and the
  # management / actuator port (9090) on the host.
  order-service:
    build: ./order-service
    ports:
      - "8082:8082"
      - "9090:9090"         # management / actuator port
    environment:
      SPRING_PROFILES_ACTIVE: docker
      SPRING_DATASOURCE_URL: jdbc:postgresql://postgres:5432/orders
    # Only services declared in this file may be listed here: Compose refuses
    # to start a project whose depends_on names an undefined service. Add
    # "kafka" back once a kafka service is declared alongside postgres.
    depends_on:
      - postgres
    labels:
      # NOTE(review): these labels are informational unless prometheus.yml
      # discovers containers (docker_sd_configs) and relabels on them; with a
      # static scrape config Prometheus ignores them. Confirm against
      # monitoring/prometheus/prometheus.yml.
      prometheus.io/scrape: "true"
      prometheus.io/path: "/actuator/prometheus"
      prometheus.io/port: "9090"

  # ── Prometheus: ───────────────────────────────────────────────────────
  # Metrics store. Published on host port 9091 because host port 9090 is
  # already bound by order-service's management port.
  prometheus:
    image: prom/prometheus:v2.51.0
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.retention.time=15d'
      # Enables the HTTP /-/reload endpoint:
      - '--web.enable-lifecycle'
    volumes:
      # Scrape configuration and rule files come from the repository:
      - ./monitoring/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml
      - ./monitoring/prometheus/rules:/etc/prometheus/rules
      # Named volume mounted at /prometheus:
      - prometheus-data:/prometheus
    ports:
      - '9091:9090'

  # ── Grafana: ──────────────────────────────────────────────────────────
  # Dashboards UI on http://localhost:3000, fully configured from files.
  grafana:
    image: grafana/grafana:10.4.0
    ports:
      - "3000:3000"
    environment:
      # admin / admin is acceptable for a local stack only.
      GF_SECURITY_ADMIN_USER: admin
      GF_SECURITY_ADMIN_PASSWORD: admin
      GF_USERS_ALLOW_SIGN_UP: "false"
      # Grafana expands ${VAR} references in provisioning files from the
      # container's own environment, so every variable those files use must
      # be passed through here. The value is taken from the host shell or a
      # .env file next to docker-compose.yml and is empty when unset.
      SLACK_WEBHOOK_URL: "${SLACK_WEBHOOK_URL:-}"
    volumes:
      # Provisioning configs read at startup (datasources/, dashboards/,
      # alerting/ sub-directories):
      - ./monitoring/grafana/provisioning:/etc/grafana/provisioning
      # Dashboard JSON files loaded by the file-based dashboard provider:
      - ./monitoring/grafana/dashboards:/var/lib/grafana/dashboards
      # Named volume for Grafana's own state:
      - grafana-data:/var/lib/grafana

  # Database targeted by order-service's JDBC URL (host "postgres",
  # database "orders"). Not published on the host.
  postgres:
    image: postgres:16-alpine
    environment:
      - POSTGRES_DB=orders
      - POSTGRES_USER=orders
      - POSTGRES_PASSWORD=orders

# Named volumes referenced by the prometheus and grafana services above.
# An empty mapping means "default driver, default options".
volumes:
  grafana-data: {}
  prometheus-data: {}

Grafana Provisioning

Grafana provisioning configures data sources and dashboards via YAML files mounted into the container. Provisioned resources are created automatically at startup — no manual clicking through the UI. This makes the observability setup reproducible across local, staging, and production environments.

yaml

# monitoring/grafana/provisioning/datasources/prometheus.yml
apiVersion: 1

datasources:
  - name: Prometheus
    # Pin the UID. Grafana generates a random one when it is omitted, and
    # anything that refers to this data source by UID (for example an alert
    # rule with `datasourceUid: prometheus`) would then fail to resolve it.
    uid: prometheus
    type: prometheus
    access: proxy
    # Compose service name and container port, not the host-published port:
    url: http://prometheus:9090
    isDefault: true
    editable: false
    jsonData:
      timeInterval: "15s"             # match Prometheus scrape interval
      httpMethod: POST
      exemplarTraceIdDestinations:
        - name: traceId
          datasourceUid: tempo        # link traces to Tempo if available

---
# monitoring/grafana/provisioning/dashboards/dashboards.yml
# File-based dashboard provider: loads every dashboard JSON found under
# options.path and re-scans that directory periodically.
apiVersion: 1

providers:
  - name: microservices
    type: file
    orgId: 1
    options:
      # Must match the container path the dashboards directory is mounted at:
      path: /var/lib/grafana/dashboards
      # Each sub-directory becomes a Grafana folder of the same name:
      foldersFromFilesStructure: true
    # Re-scan the directory for changed files every 30 seconds:
    updateIntervalSeconds: 30
    # Permit saving edits made in the UI:
    allowUiUpdates: true
    # false = a dashboard is removed from Grafana when its file is deleted:
    disableDeletion: false

Service Overview Dashboard

A service overview dashboard shows the four golden signals — latency, traffic, errors, and saturation — for a selected service. Grafana template variables allow a single dashboard to serve all microservices by switching a dropdown. The dashboard JSON is stored in version control and provisioned automatically.

json

// ── Dashboard structure (monitoring/grafana/dashboards/service-overview.json):
//
// NOTE: this is an annotated sketch, not a file to paste verbatim. JSON has
// no comments and no multi-line strings: in the real file drop the "──"
// headings and join every wrapped "expr" / "query" onto a single line.
// Quotes inside PromQL label matchers must be escaped (\") because the
// expression itself is a JSON string.
//
// Assumed on the Spring Boot side (not shown here — confirm): every metric
// carries an `application` tag, and histogram buckets are enabled for
// http.server.requests so the *_bucket series used by Panel 3 exist.
// {
//   "title": "Microservice Overview",
//   "uid":   "microservice-overview",
//   "tags":  ["microservices", "spring-boot"],
//   "time":  { "from": "now-1h", "to": "now" },
//   "refresh": "30s",
//
//   ── Template variables (service selector dropdown): ─────────────────
//   "templating": {
//     "list": [
//       {
//         "name":  "service",
//         "type":  "query",
//         "query": "label_values(http_server_requests_seconds_count,
//                               application)",
//         "label": "Service",
//         "multi": false,
//         "includeAll": false
//       },
//       {
//         "name":  "interval",
//         "type":  "interval",
//         "query": "1m,5m,15m,30m,1h",
//         "label": "Interval"
//       }
//     ]
//   },
//
//   ── Panels live in the top-level "panels" array: ─────────────────────
//   "panels": [
//
//     ── Panel 1: Request Rate (Graph): ─────────────────────────────────
//     {
//       "title": "Request Rate (req/s)",
//       "type":  "timeseries",
//       "targets": [{
//         "expr": "sum(rate(http_server_requests_seconds_count
//                     {application=\"$service\"}[$interval]))
//                  by (method, uri)",
//         "legendFormat": "{{method}} {{uri}}"
//       }]
//     },
//
//     ── Panel 2: Error Rate (Graph + threshold line): ──────────────────
//     The timeseries panel reads thresholds from fieldConfig.defaults; a
//     top-level "thresholds" key belongs to the legacy graph panel. The
//     "thresholdsStyle" setting is what draws the threshold as a line.
//     {
//       "title": "Error Rate (%)",
//       "type":  "timeseries",
//       "targets": [{
//         "expr": "sum(rate(http_server_requests_seconds_count
//                     {application=\"$service\",status=~\"5..\"}[$interval]))
//                 / sum(rate(http_server_requests_seconds_count
//                     {application=\"$service\"}[$interval]))
//                 * 100",
//         "legendFormat": "Error Rate %"
//       }],
//       "fieldConfig": {
//         "defaults": {
//           "unit": "percent",
//           "custom": { "thresholdsStyle": { "mode": "line" } },
//           "thresholds": {
//             "mode": "absolute",
//             "steps": [
//               { "value": null, "color": "green" },
//               { "value": 1,    "color": "red" }
//             ]
//           }
//         }
//       }
//     },
//
//     ── Panel 3: p50/p95/p99 Latency: ──────────────────────────────────
//     {
//       "title": "Latency Percentiles",
//       "type":  "timeseries",
//       "targets": [
//         {
//           "expr": "histogram_quantile(0.50, sum by(le)(
//                     rate(http_server_requests_seconds_bucket
//                       {application=\"$service\"}[$interval])))",
//           "legendFormat": "p50"
//         },
//         {
//           "expr": "histogram_quantile(0.95, sum by(le)(
//                     rate(http_server_requests_seconds_bucket
//                       {application=\"$service\"}[$interval])))",
//           "legendFormat": "p95"
//         },
//         {
//           "expr": "histogram_quantile(0.99, sum by(le)(
//                     rate(http_server_requests_seconds_bucket
//                       {application=\"$service\"}[$interval])))",
//           "legendFormat": "p99"
//         }
//       ]
//     },
//
//     ── Panel 4: Active JVM Heap (Gauge panel): ────────────────────────
//     jvm_memory_*_bytes is reported per memory pool (label `id`), so both
//     sides are summed to get one whole-heap percentage instead of one
//     value per pool.
//     {
//       "title": "JVM Heap Usage",
//       "type":  "gauge",
//       "targets": [{
//         "expr": "sum(jvm_memory_used_bytes{application=\"$service\",
//                    area=\"heap\"})
//                 / sum(jvm_memory_max_bytes{application=\"$service\",
//                    area=\"heap\"}) * 100",
//         "legendFormat": "Heap %"
//       }],
//       "fieldConfig": {
//         "defaults": {
//           "unit": "percent",
//           "min": 0,
//           "max": 100,
//           "thresholds": {
//             "steps": [
//               { "value": 0,  "color": "green" },
//               { "value": 70, "color": "yellow" },
//               { "value": 85, "color": "red" }
//             ]
//           }
//         }
//       }
//     }
//   ]
// }

JVM and Connection Pool Dashboard Panels

JVM and infrastructure metrics round out a microservices dashboard. Heap usage trends identify memory leaks before OOM kills. GC pause rates correlate with latency spikes. Connection pool pending threads signal database bottlenecks. These panels should live on every service dashboard.

yaml

# ── Key PromQL expressions for JVM panels: ───────────────────────────
# $service and $interval are the dashboard template variables.

# Heap usage % (Gauge panel — threshold: 85%):
# jvm_memory_*_bytes is exported once per memory pool (label `id`), so both
# sides are aggregated; without sum() the panel shows one value per pool
# rather than a single whole-heap percentage.
sum(jvm_memory_used_bytes{application="$service", area="heap"})
/
sum(jvm_memory_max_bytes{application="$service", area="heap"})
* 100

# Non-heap memory used (for Metaspace leak detection):
# Deliberately left un-aggregated so each non-heap pool is its own series.
jvm_memory_used_bytes{application="$service", area="nonheap"}

# GC pause rate (ms/s) — spikes correlate with latency:
# rate() of the pause-time sum is seconds paused per second; * 1000 → ms/s.
rate(jvm_gc_pause_seconds_sum{application="$service"}[$interval])
* 1000

# Live thread count (rising trend = thread leak):
jvm_threads_live_threads{application="$service"}

# Threads by state (stacked bar):
sum by (state) (
  jvm_threads_states_threads{application="$service"}
)

# ── Connection pool panels: ───────────────────────────────────────────
# All queries are scoped to the service picked in the $service dropdown.

# Pool utilisation % (Gauge — threshold: 80%):
hikaricp_connections_active{application="$service"} / hikaricp_connections_max{application="$service"} * 100

# Active vs idle connections (stacked area) — two separate queries on one
# panel, one per line:
hikaricp_connections_active{application="$service"}
hikaricp_connections_idle{application="$service"}

# Pending threads, i.e. callers waiting for a connection (should always be
# 0 in a healthy service):
hikaricp_connections_pending{application="$service"}

# Connection timeout rate (should be 0):
rate(hikaricp_connections_timeout_total{application="$service"}[$interval])

# ── Business metrics panel: ───────────────────────────────────────────
# Every query filters on application="$service" so the panels follow the
# dashboard's service dropdown, like the JVM and connection-pool panels.

# Order placement rate (Time series):
sum by (channel) (
  rate(orders_placed_total{application="$service"}[$interval])
)

# Order success vs failure (Stat panels side by side) — one query per panel:
sum(rate(orders_placed_total{application="$service"}[$interval]))
sum(rate(orders_failed_total{application="$service"}[$interval]))

# Revenue rate (requires order value metric):
sum(rate(orders_value_sum{application="$service"}[$interval]))

Grafana Alerting

Grafana Unified Alerting evaluates PromQL rules on a schedule and routes alerts through contact points — Slack, PagerDuty, email, or webhooks. Alert rules live in Grafana, separate from Prometheus alerting rules, and can reference any configured data source. Notification policies route alerts by label (severity, team, service) to different contact points.

yaml

# ── Grafana alert rule (provisioning YAML format): ───────────────────
# monitoring/grafana/provisioning/alerting/rules.yml

apiVersion: 1

groups:
  - orgId: 1
    name: microservices-alerts
    folder: Alerts
    interval: 1m                  # how often the group is evaluated

    rules:

      # ── High error rate alert: ────────────────────────────────────────
      # Evaluation pipeline:
      #   A  Prometheus query  → 5xx percentage per application
      #   B  reduce            → one number per series (last value)
      #   C  threshold         → fires when B > 1
      # `condition` names the step whose result decides the alert state.
      - uid: high-error-rate
        title: High Error Rate
        condition: C

        data:
          - refId: A
            relativeTimeRange:
              from: 300
              to: 0
            # Must equal the `uid` of the provisioned Prometheus data source.
            datasourceUid: prometheus
            model:
              refId: A
              expr: >
                sum by (application) (
                  rate(http_server_requests_seconds_count{
                    status=~"5.."}[5m])
                )
                /
                sum by (application) (
                  rate(http_server_requests_seconds_count[5m])
                )
                * 100
              instant: true
              intervalMs: 1000
              maxDataPoints: 43200

          # Threshold expressions only accept reduced (single-value) input,
          # so the query result is reduced before it is compared.
          - refId: B
            relativeTimeRange:
              from: 300
              to: 0
            datasourceUid: __expr__
            model:
              refId: B
              type: reduce
              expression: A
              reducer: last

          - refId: C
            relativeTimeRange:
              from: 300
              to: 0
            datasourceUid: __expr__
            model:
              refId: C
              type: threshold
              expression: B         # input of the threshold step
              conditions:
                - evaluator:
                    params: [1]     # alert if error rate > 1%
                    type: gt

        # A service with no 5xx responses produces no numerator series, so
        # the query legitimately returns nothing; treat that as healthy
        # instead of raising a NoData alert.
        noDataState: OK
        execErrState: Error
        for: 2m                     # must stay above threshold this long

        labels:
          severity: warning
          team: platform

        annotations:
          summary: "High error rate on {{ $labels.application }}"
          # $values holds reduce/math results keyed by refId; .Value is the
          # number itself.
          description: "Error rate is {{ $values.B.Value }}%"
          runbook_url: "https://wiki.example.com/runbooks/high-error-rate"

---
# ── Contact points (Slack, PagerDuty): ────────────────────────────────
# monitoring/grafana/provisioning/alerting/contact-points.yml
# This is a separate file from the rules, so it is a new YAML document and
# needs its own apiVersion.
apiVersion: 1

contactPoints:
  - orgId: 1
    name: platform-slack
    receivers:
      - uid: platform-slack-receiver
        type: slack
        settings:
          # ${...} is expanded from the Grafana process environment; the
          # variable must be set for the Grafana container.
          url: "${SLACK_WEBHOOK_URL}"
          # Grafana's Slack integration names the target channel `recipient`.
          recipient: "#platform-alerts"
          username: Grafana
          # Literal block (|-) keeps every line break of the message; a
          # folded block (>) would join the first two lines.
          text: |-
            *{{ .Status | toUpper }}* {{ .CommonAnnotations.summary }}
            {{ range .Alerts }}
              *Service:* {{ .Labels.application }}
              *Severity:* {{ .Labels.severity }}
              *Details:* {{ .Annotations.description }}
            {{ end }}

  # Referenced by the notification policy's critical route; a policy that
  # names an undefined contact point fails to provision.
  - orgId: 1
    name: pagerduty-oncall
    receivers:
      - uid: pagerduty-oncall-receiver
        type: pagerduty
        settings:
          # Also expanded from the Grafana process environment.
          integrationKey: "${PAGERDUTY_INTEGRATION_KEY}"

# ── Notification policy: ──────────────────────────────────────────────
# Routing tree: an alert is matched against `routes` top to bottom and goes
# to the first match; `continue: true` lets it keep matching later siblings.
# Alerts that match no route go to the root `receiver`.
policies:
  - orgId: 1
    receiver: platform-slack       # default contact point
    group_by: [alertname, application]
    group_wait: 30s                # wait before the first notification
    group_interval: 5m             # wait before notifying about new alerts
    repeat_interval: 4h            # re-send while still firing
    routes:
      - matchers:
          - severity = critical
        receiver: pagerduty-oncall  # critical → PagerDuty
        continue: true              # keep evaluating the routes below
      # Matches critical as well as warning: once the route above has
      # matched, the root receiver no longer applies, so critical alerts
      # only reach Slack if a later route matches them explicitly.
      - matchers:
          - severity =~ "critical|warning"
        receiver: platform-slack

Prometheus Integration

Distributed Logging