Spring Boot

Distributed Tracing

Distributed tracing tracks a request as it flows through multiple services, providing end-to-end visibility into latency, errors, and dependencies. Spring Boot 3 integrates with Micrometer Tracing, which supports both Zipkin and OpenTelemetry exporters. Trace and span IDs are automatically propagated through HTTP headers, messaging systems, and async boundaries. This entry covers setup, Zipkin integration, OpenTelemetry, custom spans, baggage propagation, async and messaging tracing, and sampling strategies.

Setup with Micrometer Tracing and Zipkin

Spring Boot 3 uses Micrometer Tracing as the tracing facade. Add the Zipkin reporter and the Brave bridge to send traces to a Zipkin server. Spring Boot auto-configures the tracer and injects trace and span IDs into MDC automatically so they appear in every log line without manual instrumentation.

XML

<!-- pom.xml -->
<!-- Bridge that backs the Micrometer Tracing facade with Brave -->
<dependency>
    <groupId>io.micrometer</groupId>
    <artifactId>micrometer-tracing-bridge-brave</artifactId>
</dependency>
<!-- Reporter that ships finished Brave spans to a Zipkin server -->
<dependency>
    <groupId>io.zipkin.reporter2</groupId>
    <artifactId>zipkin-reporter-brave</artifactId>
</dependency>
<!-- HTTP instrumentation (RestTemplate, WebClient, RestClient) -->
<dependency>
    <groupId>org.springframework.boot</groupId>
    <artifactId>spring-boot-starter-actuator</artifactId>
</dependency>
<!-- No version elements above: presumably managed by the Spring Boot parent/BOM (confirm for your build) -->

# ── application.yml ────────────────────────────────────────────────────
management:
  tracing:
    sampling:
      probability: 1.0          # 1.0 = 100% in dev; 0.1 = 10% in prod
    propagation:
      type: b3_multi            # B3 multi-header (X-B3-TraceId, X-B3-SpanId, ...)
                                # or w3c for OpenTelemetry W3C TraceContext
  zipkin:
    tracing:
      # Spring Boot 3 no longer reads Sleuth's spring.zipkin.* keys.
      # This property takes the full span-ingest URL, not just the host.
      endpoint: http://localhost:9411/api/v2/spans

logging:
  pattern:
    # Inject traceId and spanId into every log line automatically
    level: "%5p [${spring.application.name:},%X{traceId:-},%X{spanId:-}]"

# ── application-prod.yml ──────────────────────────────────────────────
# Profile-specific override of the sampling rate; presumably layered on
# top of application.yml when the "prod" profile is active.
management:
  tracing:
    sampling:
      probability: 0.1          # sample 10% in production

# ── Docker Compose — run Zipkin locally ───────────────────────────────
# services:
#   zipkin:
#     image: openzipkin/zipkin:3
#     ports:
#       - "9411:9411"

OpenTelemetry Setup

OpenTelemetry (OTel) is the vendor-neutral standard for distributed tracing. Replace the Brave bridge with the OTel bridge and configure an OTLP exporter to send traces to any OTel-compatible backend — Jaeger, Tempo, Honeycomb, Datadog, or a collector. The same Micrometer Tracing API works unchanged regardless of the bridge.

XML

<!-- pom.xml — OpenTelemetry bridge instead of Brave ─────────────── -->
<!-- Use this bridge in place of micrometer-tracing-bridge-brave, not alongside it -->
<dependency>
    <groupId>io.micrometer</groupId>
    <artifactId>micrometer-tracing-bridge-otel</artifactId>
</dependency>
<!-- OTLP exporter — sends to any OTel-compatible collector -->
<dependency>
    <groupId>io.opentelemetry</groupId>
    <artifactId>opentelemetry-exporter-otlp</artifactId>
</dependency>
<!-- Or Zipkin exporter for OTel -->
<!-- (alternative to the OTLP exporter above; pick the one that matches your backend) -->
<dependency>
    <groupId>io.opentelemetry</groupId>
    <artifactId>opentelemetry-exporter-zipkin</artifactId>
</dependency>

# ── application.yml — OTLP exporter to Grafana Tempo ─────────────────
management:
  tracing:
    sampling:
      probability: 1.0          # record every trace
    propagation:
      type: w3c                 # W3C TraceContext + Baggage (OTel default)

  otlp:
    tracing:
      # OTLP over HTTP; 4318 is the port the Tempo compose snippet labels "OTLP HTTP"
      endpoint: http://localhost:4318/v1/traces

# ── application.yml — OTLP to Jaeger ─────────────────────────────────
# Alternative to the otlp block above; it nests under management: and
# only one otlp key may appear in a real file.
  otlp:
    tracing:
      endpoint: http://jaeger:4318/v1/traces

# ── application.yml — OTLP to Honeycomb ──────────────────────────────
# Another alternative otlp block (also nested under management:).
  otlp:
    tracing:
      endpoint: https://api.honeycomb.io/v1/traces
      headers:
        # API key comes from a placeholder, so the secret stays out of this file
        x-honeycomb-team: ${HONEYCOMB_API_KEY}
        x-honeycomb-dataset: my-service

# ── Docker Compose — Grafana LGTM stack (Loki, Grafana, Tempo, Mimir)
# services:
#   tempo:
#     image: grafana/tempo:latest
#     command: [ "-config.file=/etc/tempo.yaml" ]
#     ports:
#       - "4318:4318"   # OTLP HTTP
#       - "3200:3200"   # Tempo query
#   grafana:
#     image: grafana/grafana:latest
#     ports:
#       - "3000:3000"

Automatic Instrumentation

Spring Boot auto-instruments HTTP server requests, RestTemplate, WebClient, RestClient, Spring Data repositories, and scheduled tasks; Kafka producers and consumers join the trace once observation is enabled on them. An inbound HTTP request that carries no trace headers starts a new trace, while one that does continues the caller's trace; outbound HTTP calls propagate the trace context through standard headers (B3 or W3C TraceContext) so the receiving service continues the same trace.

Java

// ── Inbound HTTP — trace started automatically ────────────────────────
// GET /api/v1/orders/42
// Spring creates: traceId=abc123, spanId=def456
// MDC populated:  traceId=abc123, spanId=def456
// Logs include: [order-service,abc123,def456] INFO ...

/**
 * REST endpoint for order lookups under {@code /api/v1/orders}.
 *
 * <p>No tracing code appears here: the log statement relies on the
 * trace context that auto-instrumentation has already placed in the MDC.
 */
@Slf4j
@RestController
@RequiredArgsConstructor
@RequestMapping("/api/v1/orders")
public class OrderController {

    private final OrderService orderService;

    /**
     * Looks up a single order.
     *
     * @param id order identifier taken from the URL path
     * @return 200 response wrapping the order returned by the service
     */
    @GetMapping("/{id}")
    public ResponseEntity<OrderResponse> findById(@PathVariable Long id) {
        // Emitted with traceId/spanId because the MDC is already populated
        log.info("Fetching order {}", id);
        OrderResponse order = orderService.findById(id);
        return ResponseEntity.ok(order);
    }
}

// ── Outbound HTTP — trace propagated automatically ────────────────────
// RestClient, RestTemplate, and WebClient all propagate B3 headers
@Service
@RequiredArgsConstructor
@Slf4j
public class InventoryClient {

    private final RestClient restClient;   // auto-instrumented

    public InventoryResponse checkStock(Long productId) {
        // Spring injects:
        //   X-B3-TraceId: abc123          (same trace as caller)
        //   X-B3-SpanId:  789xyz          (new child span)
        //   X-B3-Sampled: 1
        return restClient.get()
            .uri("http://inventory-service/api/v1/stock/{id}",
                productId)
            .retrieve()
            .body(InventoryResponse.class);
    }
}

// ── Configure RestClient with tracing ─────────────────────────────────
/** Builds the {@code RestClient} used to call the inventory service. */
@Configuration
public class RestClientConfig {

    /**
     * @param builder injected builder; per the article it already carries
     *                the tracing interceptor
     * @return client rooted at {@code http://inventory-service}
     */
    @Bean
    public RestClient restClient(RestClient.Builder builder) {
        RestClient.Builder inventoryBuilder =
            builder.baseUrl("http://inventory-service");
        return inventoryBuilder.build();
    }
}

// ── WebClient with tracing ─────────────────────────────────────────────
/** Builds the {@code WebClient} used to call the payment service. */
@Configuration
public class WebClientConfig {

    /**
     * @param builder injected builder; per the article it already carries
     *                the tracing filter
     * @return client rooted at {@code http://payment-service}
     */
    @Bean
    public WebClient webClient(WebClient.Builder builder) {
        WebClient.Builder paymentBuilder =
            builder.baseUrl("http://payment-service");
        return paymentBuilder.build();
    }
}

Custom Spans

Wrap significant business operations in custom spans to see them in the trace timeline. Inject the Tracer bean and use try-with-resources or a lambda to scope the span. Add tags (key-value metadata) to make spans searchable and meaningful in the tracing UI.

Java

/**
 * Order operations instrumented with custom spans.
 *
 * <p>{@link #placeOrder} builds its spans by hand through the injected
 * {@code Tracer}; {@link #findById} uses the annotation-based form.
 */
@Service
@RequiredArgsConstructor
@Slf4j
public class OrderService {

    private final OrderRepository    orderRepo;
    private final InventoryClient    inventoryClient;
    private final PaymentService     paymentService;
    private final Tracer             tracer;

    // ── Wrapping a business operation in a custom span ────────────────
    /**
     * Reserves inventory, persists the order and charges payment, each
     * step inside its own child span of an enclosing {@code order.place}
     * span.
     *
     * @param request order payload; its items are reserved and counted
     * @param userId  id of the ordering user, recorded as a span tag
     * @return response built from the persisted order
     */
    @Transactional
    public OrderResponse placeOrder(PlaceOrderRequest request,
                                     Long userId) {
        Span orderSpan = tracer.nextSpan()
            .name("order.place")
            .tag("order.userId",    String.valueOf(userId))
            .tag("order.itemCount",
                String.valueOf(request.items().size()));

        return inSpan(orderSpan, () -> {
            // Child spans are created while orderSpan is in scope, so
            // they are parented to it.

            // ── Step 1: Reserve inventory ──────────────────────────
            inSpan(tracer.nextSpan().name("inventory.reserve"), () -> {
                inventoryClient.reserve(request.items());
                return null;
            });

            // ── Step 2: Persist order ──────────────────────────────
            Order order = orderRepo.save(Order.from(request, userId));
            orderSpan.tag("order.id", String.valueOf(order.getId()));

            // ── Step 3: Process payment ───────────────────────────
            Span paymentSpan = tracer.nextSpan()
                .name("payment.charge")
                .tag("payment.amount",
                    order.getTotal().toPlainString());
            inSpan(paymentSpan, () -> {
                paymentService.charge(order);
                return null;
            });

            log.info("Order {} placed successfully", order.getId());
            return OrderResponse.from(order);
        });
    }

    /**
     * Starts {@code span}, runs {@code work} with the span in scope,
     * records any failure on the span, and always ends it.
     *
     * <p>Only {@code error(ex)} is recorded. A separate
     * {@code tag("error", ex.getMessage())} would duplicate it, and a
     * null message could itself throw and hide the original exception.
     */
    private <T> T inSpan(Span span,
                         java.util.function.Supplier<T> work) {
        span.start();
        try (Tracer.SpanInScope ignored = tracer.withSpan(span)) {
            return work.get();
        } catch (RuntimeException ex) {
            span.error(ex);
            throw ex;
        } finally {
            span.end();
        }
    }

    // ── Simpler span using @NewSpan (AOP-based) ───────────────────────
    /**
     * Loads one order inside an {@code order.findById} span.
     *
     * <p>{@code @SpanTag} belongs on the parameter only; it is not valid
     * on the method itself. NOTE(review): the annotations take effect
     * only when the span aspect is active (AOP on the classpath and
     * annotation support enabled) — confirm for this project.
     *
     * @param id order identifier, also recorded as the {@code order.id} tag
     * @return the matching order
     * @throws OrderNotFoundException when no order has this id
     */
    @NewSpan("order.findById")
    @Transactional(readOnly = true)
    public OrderResponse findById(
            @SpanTag("order.id") Long id) {
        return orderRepo.findById(id)
            .map(OrderResponse::from)
            .orElseThrow(() -> new OrderNotFoundException(id));
    }
}

Baggage Propagation

Baggage is key-value data that propagates alongside the trace context through every service in the call chain. Use it to carry correlation IDs, tenant IDs, feature flags, or user IDs without adding them to every method signature. Baggage is available in any service that participates in the same trace.

Java

// ── Define baggage fields ─────────────────────────────────────────────
/**
 * Declares the baggage fields that travel with the trace context.
 * Each field is exposed as a bean so other components can inject it.
 */
@Configuration
public class TracingConfig {

    /** Baggage field carrying the tenant identifier. */
    @Bean
    public BaggageField tenantIdField() {
        final String fieldName = "tenant-id";
        return BaggageField.create(fieldName);
    }

    /** Baggage field carrying the request correlation identifier. */
    @Bean
    public BaggageField correlationIdField() {
        final String fieldName = "correlation-id";
        return BaggageField.create(fieldName);
    }
}

# ── application.yml — whitelist baggage fields for propagation ────────
# The names listed here are the same strings passed to
# BaggageField.create(...) in the Java configuration.
management:
  tracing:
    baggage:
      remote-fields:             # propagate in HTTP headers
        - tenant-id
        - correlation-id
      correlation:               # also inject into MDC/logs
        fields:
          - tenant-id
          - correlation-id

// ── Filter: set baggage on every inbound request ──────────────────────
/**
 * Servlet filter that copies the tenant id and a correlation id into
 * trace baggage for every inbound request.
 *
 * <p>Ordering matters: baggage can only be updated while a trace
 * context is current, so this filter must run after the tracing filter
 * has opened the request span. Spring Boot registers its server
 * observation filter at {@code HIGHEST_PRECEDENCE + 1}, so this filter
 * is placed just behind it. At {@code HIGHEST_PRECEDENCE} the updates
 * would be silently discarded.
 *
 * <p>NOTE(review): {@code X-Tenant-ID} is taken from the client as-is.
 * If tenants are a security boundary, derive the value from an
 * authenticated source instead.
 */
@Component
@RequiredArgsConstructor
@Order(Ordered.HIGHEST_PRECEDENCE + 2)
public class BaggageFilter extends OncePerRequestFilter {

    private final BaggageField tenantIdField;
    private final BaggageField correlationIdField;

    /**
     * Populates baggage, echoes the correlation id on the response, and
     * continues the chain.
     */
    @Override
    protected void doFilterInternal(HttpServletRequest  request,
                                    HttpServletResponse response,
                                    FilterChain         chain)
            throws ServletException, IOException {

        // Set tenant-id from header or JWT claim; ignore blank values
        String tenantId = request.getHeader("X-Tenant-ID");
        if (tenantId != null && !tenantId.isBlank()) {
            tenantIdField.updateValue(tenantId);
        }

        String correlationId = resolveCorrelationId(request);
        correlationIdField.updateValue(correlationId);
        response.setHeader("X-Correlation-ID", correlationId);

        chain.doFilter(request, response);
    }

    /**
     * Picks the correlation id: an explicit request header wins, then a
     * value already propagated as baggage by the caller, and only then a
     * freshly generated UUID. Keeping the propagated value stops a
     * downstream service from replacing the id its caller set.
     */
    private String resolveCorrelationId(HttpServletRequest request) {
        return Optional
            .ofNullable(request.getHeader("X-Correlation-ID"))
            .filter(id -> !id.isBlank())
            .or(() -> Optional.ofNullable(
                correlationIdField.getValue()))
            .orElseGet(() -> UUID.randomUUID().toString());
    }
}

// ── Read baggage anywhere in the call chain ───────────────────────────
/**
 * Example of reading baggage deep in the call chain: the tenant id is
 * taken from the baggage field rather than passed as a parameter.
 */
@Service
@RequiredArgsConstructor
@Slf4j
public class TenantAwareService {

    private final BaggageField tenantIdField;
    // Not read in this class; kept so the generated constructor's
    // parameter list is unchanged.
    private final BaggageField correlationIdField;
    private final ProductRepository productRepo;

    /**
     * Lists the products of the tenant named in the current baggage.
     *
     * @return products mapped to their response form
     */
    public List<ProductResponse> findAll() {
        String tenantId = tenantIdField.getValue();
        // tenantId is automatically logged via MDC
        log.info("Loading products for tenant={}", tenantId);
        return productRepo.findByTenantId(tenantId)
            .stream().map(ProductResponse::from).toList();
    }
}

// ── Baggage in downstream services ───────────────────────────────────
// When Service A calls Service B via HTTP:
// Spring propagates baggage as headers:
//   baggage: tenant-id=acme,correlation-id=abc-123
// Service B receives and injects into its own BaggageField beans
// Logs in Service B automatically include tenant-id and correlation-id

Async and Messaging Tracing

Trace context does not propagate automatically to @Async threads or message consumers. Wrap async executors with tracing support and use Spring Kafka or RabbitMQ tracing instrumentation to propagate headers through messages. Without this, async operations appear as disconnected traces.

Java

// ── @Async with trace context propagation ────────────────────────────
/**
 * Provides the executor used by {@code @Async("tracingTaskExecutor")}
 * methods, wrapped so the submitting thread's context is restored on
 * the worker thread.
 */
@Configuration
@EnableAsync
public class AsyncConfig {

    /**
     * Builds a bounded thread pool and wraps it with context propagation.
     *
     * <p>{@code ContextExecutorService} is created through its static
     * {@code wrap} factory; it has no public constructor. The snapshot
     * supplier runs on the submitting thread for each task, which is how
     * the caller's trace context reaches the worker.
     *
     * @return context-propagating view of the pool
     */
    @Bean("tracingTaskExecutor")
    public Executor tracingExecutor() {
        ThreadPoolTaskExecutor pool = new ThreadPoolTaskExecutor();
        pool.setCorePoolSize(4);
        pool.setMaxPoolSize(16);
        pool.setQueueCapacity(100);
        pool.setThreadNamePrefix("async-tracing-");
        // The pool is not itself a bean, so it must be initialized by hand
        pool.initialize();

        io.micrometer.context.ContextSnapshotFactory snapshots =
            io.micrometer.context.ContextSnapshotFactory.builder().build();

        // Wrap with tracing — propagates trace context to async threads
        return io.micrometer.context.ContextExecutorService.wrap(
            pool.getThreadPoolExecutor(),
            () -> snapshots.captureAll());
    }
}

// ── Async service method ──────────────────────────────────────────────
/**
 * Sends user notifications off the calling thread, on the
 * {@code tracingTaskExecutor} pool.
 */
@Slf4j
@Service
@RequiredArgsConstructor
public class NotificationService {

    /**
     * Logs and (placeholder) sends a notification asynchronously.
     *
     * @param userId  recipient
     * @param message notification text
     * @return an already-completed future with no value
     */
    @Async("tracingTaskExecutor")
    public CompletableFuture<Void> sendAsync(Long userId,
                                              String message) {
        // traceId and spanId preserved from the calling thread
        log.info("Sending notification to user {}", userId);
        // ... send notification
        CompletableFuture<Void> done =
            CompletableFuture.completedFuture(null);
        return done;
    }
}

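// ── Kafka tracing — enable observation ────────────────────────────────
// No Kafka-specific tracing dependency is needed; the tracing bridge
// added during setup is enough:
// <dependency>
//     <groupId>io.micrometer</groupId>
//     <artifactId>micrometer-tracing-bridge-brave</artifactId>
// </dependency>
// Spring Kafka does not trace by default. Enable observation with
// spring.kafka.template.observation-enabled=true (producers) and
// spring.kafka.listener.observation-enabled=true (consumers).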

// ── Kafka producer — trace headers injected automatically ─────────────
@Service
@RequiredArgsConstructor
@Slf4j
public class OrderEventProducer {

    private final KafkaTemplate<String, OrderEvent> kafkaTemplate;

    public void publish(OrderEvent event) {
        // Spring Kafka injects b3 or traceparent headers automatically
        kafkaTemplate.send("orders.created",
            String.valueOf(event.orderId()), event);
        log.info("Published order event for order {}",
            event.orderId());
    }
}

// ── Kafka consumer — trace headers extracted and continued ────────────
/**
 * Consumes {@code orders.created} events and reserves inventory for
 * each one.
 */
@Component
@RequiredArgsConstructor
@Slf4j
public class OrderEventConsumer {

    // The original snippet called inventoryService without declaring it.
    // NOTE(review): the type name InventoryService is assumed — confirm
    // against the project.
    private final InventoryService inventoryService;

    /**
     * Handles one order-created event.
     *
     * @param event deserialized event payload
     * @param topic topic the record was received from
     */
    @KafkaListener(topics = "orders.created",
                   groupId = "inventory-service")
    public void onOrderCreated(OrderEvent event,
            @Header(KafkaHeaders.RECEIVED_TOPIC) String topic) {
        // Spring Kafka extracts trace headers from the message
        // and continues the trace from the producer
        log.info("Processing order {} from topic {}",
            event.orderId(), topic);
        inventoryService.reserve(event.items());
    }
}

// ── RabbitMQ tracing ──────────────────────────────────────────────────
// Spring AMQP does not trace by default: call setObservationEnabled(true)
// on the RabbitTemplate and on the listener container factory.
/** Consumes payment events from the {@code payment.events} queue. */
@Slf4j
@Component
public class PaymentEventConsumer {

    /**
     * Logs receipt of one payment event.
     *
     * @param event   deserialized payload
     * @param message raw AMQP message; not read here
     */
    @RabbitListener(queues = "payment.events")
    public void onPaymentEvent(PaymentEvent event, Message message) {
        // Headers the trace context is read from:
        //   x-b3-traceid, x-b3-spanid, x-b3-sampled
        var orderId = event.orderId();
        log.info("Processing payment event for order {}", orderId);
    }
}

Sampling Strategies

Sampling controls what fraction of traces are recorded and exported. High-traffic production systems cannot trace every request without significant overhead and storage cost. Spring Boot supports probability-based sampling, rate-limited sampling, and custom samplers that make decisions based on request attributes.

yaml

# ── Probability sampler — fixed percentage ───────────────────────────
# application.yml
# A fixed fraction of requests is traced; the rest are not recorded.
management:
  tracing:
    sampling:
      probability: 0.1    # 10% of requests traced

// ── Custom sampler — always trace admin/auth, never trace health checks
/**
 * Path-based sampling rule for HTTP requests.
 *
 * <ul>
 *   <li>admin and auth endpoints: always traced</li>
 *   <li>{@code /actuator/health}: never traced</li>
 *   <li>everything else: traced with probability {@code BASE_RATE}</li>
 * </ul>
 *
 * <p>The decision is made when the request arrives, so it cannot depend
 * on the response status or latency.
 *
 * <p>NOTE(review): nothing in this snippet wires the function into the
 * tracer — confirm how it is registered.
 */
@Component
public class AdaptiveSampler implements SamplerFunction<HttpRequest> {

    private static final double BASE_RATE = 0.1;

    /**
     * @param request inbound request being considered
     * @return {@code true} to trace, {@code false} to skip
     */
    @Override
    public Boolean trySample(HttpRequest request) {
        String path = request.path();

        // A request may have no path; apply the path rules only when present
        if (path != null) {
            // Always trace admin and auth endpoints
            if (path.startsWith("/api/v1/admin") ||
                    path.startsWith("/api/v1/auth")) {
                return true;
            }

            // Never trace health checks — too noisy
            if (path.equals("/actuator/health")) {
                return false;
            }
        }

        // Probabilistic sampling for regular traffic
        return java.util.concurrent.ThreadLocalRandom.current()
            .nextDouble() < BASE_RATE;
    }
}

// ── Override the default sampler bean (Brave) ─────────────────────────
/** Supplies the {@code Sampler} bean used for trace sampling. */
@Configuration
public class TracingConfig {

    /**
     * @return a sampler that records every trace; intended to be
     *         replaced for production
     */
    @Bean
    public Sampler customSampler() {
        // Brave sampler — always sample
        Sampler everyTrace = Sampler.ALWAYS_SAMPLE;   // override in prod
        return everyTrace;
    }
}

// ── Rate-limited sampler — traces per second ──────────────────────────
/** Supplies a sampler capped by trace rate instead of by percentage. */
@Configuration
public class RateLimitedTracingConfig {

    /**
     * @return sampler limited to 10 traces per second, independent of
     *         request volume
     */
    @Bean
    public Sampler rateLimitedSampler() {
        final int tracesPerSecond = 10;
        return RateLimitingSampler.create(tracesPerSecond);
    }
}

// ── Conditional sampling — trace all errors ───────────────────────────
/**
 * Registers an export filter that lets only failed spans through.
 */
@Configuration
public class ErrorTracingConfig {

    /**
     * Export predicate that accepts a finished span only when it has a
     * recorded error or an {@code error} tag.
     *
     * <p>Despite the bean name, this does not add error spans on top of
     * normal sampling. A {@code SpanExportingPredicate} can only veto a
     * span that was already recorded, so with this bean every non-error
     * span is dropped, and an error in an unsampled trace is still not
     * exported.
     *
     * @return predicate that is true only for spans carrying an error
     */
    @Bean
    public SpanExportingPredicate alwaysExportErrors() {
        return finishedSpan ->
            finishedSpan.getError() != null ||
            finishedSpan.getTags().containsKey("error");
    }
}

# ── Sampling decision summary ─────────────────────────────────────────
# probability: 0.0   → trace nothing    (disable tracing)
# probability: 0.01  → trace 1%         (high-traffic production)
# probability: 0.1   → trace 10%        (medium-traffic production)
# probability: 1.0   → trace everything (development / staging)
#
# Never use 1.0 in production under real load —
# tracing overhead is non-trivial at high request rates.

Correlating Traces with Logs and Metrics

The full observability picture comes from correlating traces, logs, and metrics. Micrometer Tracing injects traceId and spanId into the MDC automatically, so every log line carries the trace context. Grafana's LGTM stack (Loki, Grafana, Tempo, Mimir) provides a unified view where clicking a log line jumps to the trace, and clicking a trace shows the logs.

XML

// ── Structured JSON logging with trace context ───────────────────────
<!-- pom.xml — Logstash encoder for JSON logs -->
<!-- Provides the net.logstash.logback.encoder.LogstashEncoder class used in logback-spring.xml -->
<dependency>
    <groupId>net.logstash.logback</groupId>
    <artifactId>logstash-logback-encoder</artifactId>
    <!-- Version is pinned explicitly for this dependency -->
    <version>7.4</version>
</dependency>

// ── logback-spring.xml ────────────────────────────────────────────────
// <configuration>
//   <appender name="JSON"
//     class="ch.qos.logback.core.ConsoleAppender">
//     <encoder
//       class="net.logstash.logback.encoder.LogstashEncoder">
//       <customFields>
//         {"service":"order-service","env":"prod"}
//       </customFields>
//     </encoder>
//   </appender>
//   <root level="INFO">
//     <appender-ref ref="JSON"/>
//   </root>
// </configuration>

// ── JSON log output (traceId and spanId injected by Micrometer) ───────
// {
//   "timestamp": "2024-03-15T10:30:00.123Z",
//   "level":     "INFO",
//   "logger":    "com.myapp.OrderService",
//   "message":   "Order 42 placed successfully",
//   "traceId":   "abc123def456abc123def456abc12345",
//   "spanId":    "def456abc123def4",
//   "service":   "order-service",
//   "env":       "prod"
// }

// ── Add custom tags to correlate with metrics ─────────────────────────
/**
 * Records an order both on the current span and as a counter, using a
 * shared {@code totalRange} value so the two can be correlated.
 */
@Component
@RequiredArgsConstructor
public class OrderMetrics {

    /** Orders below this total are classed as "small". */
    private static final BigDecimal SMALL_ORDER_LIMIT =
        BigDecimal.valueOf(100);

    private final MeterRegistry meterRegistry;
    private final Tracer        tracer;

    /**
     * Tags the current span (if any) and increments
     * {@code orders.placed}.
     *
     * <p>The customer id goes on the span only. As a counter tag it
     * would create one time series per customer, which is unbounded
     * metric cardinality.
     *
     * @param customerId customer placing the order; span tag only
     * @param total      order total, bucketed into "small" / "large"
     */
    public void recordOrderPlaced(String customerId,
                                   BigDecimal total) {
        String totalRange =
            total.compareTo(SMALL_ORDER_LIMIT) < 0 ? "small" : "large";

        // Tag the current span with business metrics
        Span current = tracer.currentSpan();
        if (current != null) {
            current.tag("order.customerId", customerId)
                   .tag("order.totalRange", totalRange);
        }

        // Emit a metric sharing the low-cardinality tag for correlation
        meterRegistry.counter("orders.placed",
            "totalRange", totalRange)
            .increment();
    }
}

// ── Grafana dashboard query examples ─────────────────────────────────
// Find logs for a specific trace:
//   {service="order-service"} | json | traceId="abc123..."
//
// Jump from trace to logs in Grafana:
//   Tempo → click span → "Logs for this span" → Loki query
//
// Find slow traces correlated with error logs:
//   {level="ERROR", service="order-service"}
//   → traceId extracted → Tempo shows full trace timeline

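// ── application.yml — Grafana Loki log shipping ───────────────────────
// NOTE: logging.loki.* is not a built-in Spring Boot property namespace;
// treat the keys below as illustrative. Shipping logs to Loki is normally
// configured through a Logback appender or a log-collection agent.
# logging:
#   loki:
#     enabled: true
#     url: http://loki:3100/loki/api/v1/push
#     labels:
#       service: order-service
#       env: production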

Resilience4j

Config Server