Spring Boot — Resilience4j
Spring Boot

Resilience4j

Resilience4j is a lightweight fault-tolerance library for Java, inspired by Netflix Hystrix and commonly used as its replacement in Spring Boot applications. It provides six core modules: CircuitBreaker, Retry, TimeLimiter, Bulkhead, RateLimiter, and Cache. Each module is a decorator that wraps a function call — they are composable, lightweight (minimal external dependencies), and integrate with Spring Boot Actuator for metrics and health.

Setup and Dependencies

Resilience4j integrates with Spring Boot through the spring-cloud-starter-circuitbreaker-resilience4j starter, which pulls in all core modules. Spring Boot AOP is required for the annotation-based API. The Actuator starter exposes circuit breaker state, metrics, and health indicators at /actuator.
XML
<!-- pom.xml: -->
<dependencies>

    <!-- Resilience4j Spring Cloud integration (includes all modules): -->
    <dependency>
        <groupId>org.springframework.cloud</groupId>
        <artifactId>spring-cloud-starter-circuitbreaker-resilience4j</artifactId>
    </dependency>

    <!-- AOP — required for @CircuitBreaker, @Retry, @Bulkhead annotations: -->
    <dependency>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-aop</artifactId>
    </dependency>

    <!-- Actuator — exposes /actuator/health, /actuator/circuitbreakers: -->
    <dependency>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-actuator</artifactId>
    </dependency>

</dependencies>

// ── Expose Resilience4j endpoints via Actuator (application.yml): ─────
// management:
//   endpoints:
//     web:
//       exposure:
//         include: health, info, circuitbreakers, circuitbreakerevents,
//                  retries, retryevents, bulkheads, bulkheadevents
//   endpoint:
//     health:
//       show-details: always
//   health:
//     circuitbreakers:
//       enabled: true    # shows CB state in /actuator/health

// ── Actuator endpoints available after setup: ─────────────────────────
// GET /actuator/circuitbreakers          → state of all circuit breakers
// GET /actuator/circuitbreakerevents     → last N events (transitions, calls)
// GET /actuator/retries                  → retry instance configs
// GET /actuator/retryevents              → retry attempt history
// GET /actuator/bulkheads                → bulkhead states
// GET /actuator/health                   → includes CB state (UP/CIRCUIT_OPEN)

CircuitBreaker

The CircuitBreaker module wraps a method with the CLOSED/OPEN/HALF-OPEN state machine. Configuration controls the sliding window size, failure rate threshold, slow call threshold, wait duration in open state, and the number of probe calls in half-open state. Both annotation-based and programmatic APIs are available.
Java
// ── application.yml configuration: ───────────────────────────────────
// resilience4j:
//   circuitbreaker:
//     instances:
//       paymentService:
//         sliding-window-type: COUNT_BASED       # COUNT_BASED or TIME_BASED
//         sliding-window-size: 10                # last 10 calls evaluated
//         failure-rate-threshold: 50             # open if >= 50% fail
//         slow-call-rate-threshold: 50           # open if >= 50% are slow
//         slow-call-duration-threshold: 2s       # "slow" = > 2 seconds
//         wait-duration-in-open-state: 10s       # stay open for 10s
//         permitted-number-of-calls-in-half-open-state: 3
//         minimum-number-of-calls: 5             # min calls before evaluating
//         record-exceptions:                     # treat these as failures
//           - java.net.ConnectException
//           - feign.FeignException
//         ignore-exceptions:                     # never count as failures
//           - com.example.exception.BusinessException

// ── Annotation-based API: ─────────────────────────────────────────────
@Service
@RequiredArgsConstructor
@Slf4j
public class PaymentService {

    private final PaymentClient paymentClient;

    @CircuitBreaker(
        name = "paymentService",        // must match yml instance name
        fallbackMethod = "fallback"     // called when circuit is open
    )
    public PaymentResponse charge(PaymentRequest request) {
        return paymentClient.charge(request);
    }

    // Fallback — same return type, Throwable as extra last parameter:
    private PaymentResponse fallback(
            PaymentRequest request, CallNotPermittedException ex) {
        // CallNotPermittedException = circuit is OPEN
        log.warn("Circuit OPEN for paymentService — returning fallback");
        return PaymentResponse.deferred(request.getOrderId());
    }

    private PaymentResponse fallback(
            PaymentRequest request, Throwable ex) {
        // Catches all other exceptions (timeouts, connection errors, etc.)
        log.error("PaymentService call failed: {}", ex.getMessage());
        return PaymentResponse.failed(request.getOrderId());
    }
}

// ── Programmatic API: ────────────────────────────────────────────────
/**
 * Same payment call as the annotation variant, but the circuit breaker is
 * looked up in the registry and applied by hand.
 */
@Service
@RequiredArgsConstructor
public class PaymentServiceProgrammatic {

    private final CircuitBreakerRegistry circuitBreakerRegistry;
    private final PaymentClient paymentClient;

    /**
     * Runs the charge inside the {@code paymentService} circuit breaker.
     * No fallback: whatever the breaker or the client throws reaches the caller.
     */
    public PaymentResponse charge(PaymentRequest request) {
        return circuitBreakerRegistry
            .circuitBreaker("paymentService")
            .executeSupplier(() -> paymentClient.charge(request));
    }

    /**
     * Runs the charge inside the circuit breaker and maps failures to a
     * response using Vavr {@code Try}.
     *
     * @return the client's response; a deferred response when the breaker
     *         rejects the call; a failed response for any other throwable
     */
    public PaymentResponse chargeWithFallback(PaymentRequest request) {
        CircuitBreaker breaker =
            circuitBreakerRegistry.circuitBreaker("paymentService");

        Try<PaymentResponse> attempt = Try.ofSupplier(
            CircuitBreaker.decorateSupplier(
                breaker, () -> paymentClient.charge(request)));

        // The more specific recovery must come first; the Throwable one is the catch-all.
        Try<PaymentResponse> recovered = attempt
            .recover(CallNotPermittedException.class,
                rejected -> PaymentResponse.deferred(request.getOrderId()))
            .recover(Throwable.class,
                failure -> PaymentResponse.failed(request.getOrderId()));

        return recovered.get();
    }
}

Retry

The Retry module automatically re-attempts a failed call up to a configurable number of times with a configurable wait strategy between attempts. It should only be applied to idempotent operations. Exponential backoff with jitter is the recommended wait strategy in production — it spreads retry storms across a time window.
Java
// ── application.yml configuration: ───────────────────────────────────
// resilience4j:
//   retry:
//     instances:
//       userService:
//         max-attempts: 3                     # total attempts (1 + 2 retries)
//         wait-duration: 500ms                # fixed wait between attempts
//         retry-exceptions:
//           - java.net.ConnectException
//           - java.net.SocketTimeoutException
//           - feign.RetryableException
//         ignore-exceptions:
//           - com.example.exception.UserNotFoundException  # don't retry 404
//
//       orderService:
//         max-attempts: 3
//         enable-exponential-backoff: true
//         exponential-backoff-multiplier: 2    # 500ms → 1000ms → 2000ms
//         exponential-max-wait-duration: 10s
//         enable-randomized-wait: true         # add jitter
//         randomized-wait-factor: 0.5          # ± 50% jitter

// ── Annotation-based API: ─────────────────────────────────────────────
/**
 * Looks up users through {@code UserClient}, retried according to the
 * {@code userService} retry instance.
 */
@Service
@RequiredArgsConstructor
@Slf4j
public class UserService {

    private final UserClient userClient;

    /**
     * Fetches a user by id. The log line is written on every attempt, so it
     * shows how often the call was retried.
     *
     * @param id the user id
     * @return the client's response, or the {@code retryFallback} result once
     *         all attempts have failed
     */
    @Retry(
        name = "userService",
        fallbackMethod = "retryFallback"
    )
    public UserResponse findById(Long id) {
        log.info("Attempting to fetch user {}", id);
        return userClient.findById(id);
    }

    /**
     * Called only after ALL retry attempts are exhausted.
     */
    private UserResponse retryFallback(Long id, Throwable ex) {
        // Pass the throwable as the last argument so the stack trace of the
        // final failure is logged, not only its message.
        log.error("All retry attempts exhausted for user {}: {}",
            id, ex.getMessage(), ex);
        return UserResponse.unknown(id);
    }
}

// ── Combining @Retry with @CircuitBreaker: ────────────────────────────
// IMPORTANT: the order in which the annotations are written on the method
// does NOT decide the nesting — Resilience4j's aspect order does.
// Default nesting (outermost first):
//   Retry ( CircuitBreaker ( RateLimiter ( TimeLimiter ( Bulkhead ( method ) ) ) ) )
// So Retry wraps CircuitBreaker: every attempt passes through the breaker,
// and 3 failed attempts count as 3 recorded failures in the CB window.
// The fallback therefore belongs on @Retry (the outermost aspect). A fallback
// on the inner @CircuitBreaker would turn each failure into a normal return
// value, and Retry would never re-attempt.
// To change the nesting, set resilience4j.retry.retryAspectOrder and
// resilience4j.circuitbreaker.circuitBreakerAspectOrder — and then move the
// fallback to whichever aspect ends up outermost.
@Service
@RequiredArgsConstructor
public class InventoryService {

    private final InventoryClient inventoryClient;

    /**
     * Checks stock for a product through the inventory client.
     *
     * @param productId the product to look up
     * @return the client's response, or the {@code inventoryFallback} result
     *         once all retry attempts have failed
     */
    @Retry(name = "inventoryService",
           fallbackMethod = "inventoryFallback")   // outermost by default
    @CircuitBreaker(name = "inventoryService")     // inner — records each attempt
    public InventoryResponse checkStock(Long productId) {
        return inventoryClient.checkStock(productId);
    }

    /**
     * Fallback for any failure, including calls rejected by an open circuit.
     * Tip: list CallNotPermittedException under the retry instance's
     * ignore-exceptions so an open circuit is not retried.
     */
    private InventoryResponse inventoryFallback(
            Long productId, Throwable ex) {
        return InventoryResponse.unknown(productId);
    }
}

// ── Programmatic retry with exponential backoff: ──────────────────────
/**
 * Sends e-mails through {@code EmailClient}, retried according to the
 * {@code emailService} retry instance from the registry.
 */
@Service
@RequiredArgsConstructor
@Slf4j
public class NotificationService {

    private static final String RETRY_NAME = "emailService";

    private final RetryRegistry retryRegistry;
    private final EmailClient emailClient;

    /**
     * Registers the retry-attempt logger exactly once.
     *
     * <p>The registry hands out the same {@code Retry} instance on every
     * lookup, so registering the consumer inside {@link #sendEmail} would add
     * one more listener per call — log lines would multiply and the listeners
     * would never be released.
     */
    @PostConstruct
    public void registerRetryLogging() {
        retryRegistry.retry(RETRY_NAME)
            .getEventPublisher()
            .onRetry(e -> log.warn("Retry attempt #{} for retry instance '{}'",
                e.getNumberOfRetryAttempts(),
                e.getName()));
    }

    /**
     * Sends the e-mail, re-attempting per the retry configuration.
     *
     * @param request the e-mail to send
     */
    public void sendEmail(EmailRequest request) {
        Retry retry = retryRegistry.retry(RETRY_NAME);

        Retry.decorateRunnable(retry,
            () -> emailClient.send(request)).run();
    }
}

TimeLimiter

The TimeLimiter enforces a maximum duration on a single method call. If the call does not complete within the configured timeout a TimeoutException is thrown and the future is cancelled. TimeLimiter works with CompletableFuture and reactive types — for blocking calls, it must be combined with a thread-pool executor.
Java
// ── application.yml configuration: ───────────────────────────────────
// resilience4j:
//   timelimiter:
//     instances:
//       paymentService:
//         timeout-duration: 3s      # cancel if call exceeds 3 seconds
//         cancel-running-future: true  # cancel the underlying future

// ── TimeLimiter with @CircuitBreaker (annotation): ────────────────────
// Note: @TimeLimiter alone requires a CompletableFuture return type.
// In practice, TimeLimiter is usually configured alongside CircuitBreaker
// via spring-cloud-starter-circuitbreaker-resilience4j, which applies
// the TimeLimiter automatically based on the CB instance config.

// application.yml combined config:
// resilience4j:
//   circuitbreaker:
//     instances:
//       paymentService:
//         sliding-window-size: 10
//         failure-rate-threshold: 50
//         wait-duration-in-open-state: 10s
//   timelimiter:
//     instances:
//       paymentService:            # same instance name ties them together
//         timeout-duration: 3s

/**
 * Asynchronous payment call guarded by the {@code paymentService}
 * circuit breaker and time limiter.
 */
@Service
@RequiredArgsConstructor
@Slf4j
public class PaymentService {

    private final PaymentClient paymentClient;

    /**
     * Starts the charge asynchronously.
     *
     * @param request the payment to charge
     * @return a future with the client's response, or with the
     *         {@code fallback} result when the call fails or times out
     */
    @CircuitBreaker(name = "paymentService", fallbackMethod = "fallback")
    @TimeLimiter(name = "paymentService")   // enforces 3s timeout
    public CompletableFuture<PaymentResponse> chargeAsync(
            PaymentRequest request) {
        return CompletableFuture.supplyAsync(
            () -> paymentClient.charge(request)
        );
    }

    /**
     * Fallback for any failure; always answers with a deferred response.
     */
    private CompletableFuture<PaymentResponse> fallback(
            PaymentRequest request, Throwable ex) {
        if (ex instanceof TimeoutException) {
            log.warn("PaymentService timed out after 3s");
        } else {
            // Do not swallow other failures silently.
            log.warn("PaymentService async call failed: {}", ex.toString());
        }
        return CompletableFuture.completedFuture(
            PaymentResponse.deferred(request.getOrderId()));
    }
}

Bulkhead

The Bulkhead module limits the number of concurrent calls to a service, preventing a slow downstream dependency from consuming all available threads. Resilience4j provides two implementations: SemaphoreBulkhead (same-thread, lightweight) and ThreadPoolBulkhead (dedicated thread pool, true isolation). When the bulkhead is full, a BulkheadFullException is thrown — immediately if max-wait-duration is 0, otherwise once the caller has waited that long without getting a slot.
Java
// ── application.yml — SemaphoreBulkhead: ─────────────────────────────
// resilience4j:
//   bulkhead:
//     instances:
//       paymentService:
//         max-concurrent-calls: 20      # max simultaneous in-flight calls
//         max-wait-duration: 0ms        # 0 = fail immediately if full

// ── application.yml — ThreadPoolBulkhead: ────────────────────────────
// resilience4j:
//   thread-pool-bulkhead:
//     instances:
//       inventoryService:
//         max-thread-pool-size: 10      # max threads in dedicated pool
//         core-thread-pool-size: 5      # core (always alive) threads
//         queue-capacity: 20            # requests queued when pool full
//         keep-alive-duration: 20ms

// ── SemaphoreBulkhead annotation: ────────────────────────────────────
/**
 * Payment call limited by the {@code paymentService} semaphore bulkhead.
 */
@Service
@RequiredArgsConstructor
@Slf4j
public class PaymentService {

    private final PaymentClient paymentClient;

    /**
     * Forwards the charge to the payment client while a bulkhead slot is held.
     *
     * @param request the payment to charge
     * @return the client's response, or a rejected response when the
     *         bulkhead is full
     */
    @Bulkhead(
        name = "paymentService",
        type = Bulkhead.Type.SEMAPHORE,     // default
        fallbackMethod = "bulkheadFallback"
    )
    public PaymentResponse charge(PaymentRequest request) {
        return paymentClient.charge(request);
    }

    /**
     * Fallback for a full bulkhead only; other exceptions reach the caller.
     */
    private PaymentResponse bulkheadFallback(
            PaymentRequest request, BulkheadFullException ex) {
        // The exception message is a description, not a call count.
        log.warn("Bulkhead full for paymentService: {}", ex.getMessage());
        return PaymentResponse.rejected(request.getOrderId(),
            "System is busy, please retry in a moment.");
    }
}

// ── ThreadPoolBulkhead annotation (returns CompletableFuture): ────────
/**
 * Stock lookup isolated in the {@code inventoryService} thread-pool bulkhead.
 */
@Service
@RequiredArgsConstructor
public class InventoryService {

    private final InventoryClient inventoryClient;

    /**
     * Checks stock for a product.
     *
     * @param productId the product to look up
     * @return a future with the client's response, or with the
     *         {@code inventoryFallback} result when the bulkhead is full
     */
    @Bulkhead(
        name = "inventoryService",
        type = Bulkhead.Type.THREADPOOL,
        fallbackMethod = "inventoryFallback"
    )
    public CompletableFuture<InventoryResponse> checkStock(
            Long productId) {
        // With type = THREADPOOL the bulkhead aspect already runs this method
        // on the dedicated inventoryService pool. Call the client directly:
        // supplyAsync() here would push the work onto ForkJoinPool.commonPool()
        // and leave the bulkhead thread merely waiting for it.
        return CompletableFuture.completedFuture(
            inventoryClient.checkStock(productId));
    }

    /**
     * Fallback for a full bulkhead only; other exceptions reach the caller.
     */
    private CompletableFuture<InventoryResponse> inventoryFallback(
            Long productId, BulkheadFullException ex) {
        return CompletableFuture.completedFuture(
            InventoryResponse.unknown(productId));
    }
}

RateLimiter

The RateLimiter module limits the number of calls a service is permitted to make to a downstream service within a time window. Unlike a bulkhead (which limits concurrency), a rate limiter controls throughput — the number of calls per unit of time. When the limit is exceeded and no permit becomes available within timeout-duration, a RequestNotPermitted exception is thrown (immediately when timeout-duration is 0).
Java
// ── application.yml configuration: ───────────────────────────────────
// resilience4j:
//   ratelimiter:
//     instances:
//       externalApiService:
//         limit-for-period: 10          # max 10 calls per refresh period
//         limit-refresh-period: 1s      # period resets every 1 second
//         timeout-duration: 0ms         # 0 = fail immediately if limit hit
//                                       # >0 = wait up to N ms for a permit
//
//       smsService:
//         limit-for-period: 5
//         limit-refresh-period: 1m      # max 5 SMS per minute
//         timeout-duration: 500ms       # wait up to 500ms for a permit

// ── Annotation-based API: ─────────────────────────────────────────────
/**
 * Queries the external API under the {@code externalApiService} rate limiter.
 */
@Service
@RequiredArgsConstructor
@Slf4j
public class ExternalApiService {

    private final ExternalApiClient apiClient;

    /**
     * Runs a search against the external API.
     *
     * @param query the search expression
     * @return the API's response, or a rate-limited response when no permit
     *         was granted
     */
    @RateLimiter(name = "externalApiService", fallbackMethod = "rateLimitFallback")
    public ExternalDataResponse fetchData(String query) {
        return apiClient.search(query);
    }

    /**
     * Fallback for a refused permit only; other exceptions reach the caller.
     */
    private ExternalDataResponse rateLimitFallback(
            String query, RequestNotPermitted ex) {
        log.warn("Rate limit exceeded for externalApiService — query: {}", query);

        String userMessage = "Too many requests — please slow down.";
        return ExternalDataResponse.rateLimited(userMessage);
    }
}

// ── Combining all annotations — effective order: ──────────────────────
//
//  The order in which the annotations are written has no effect; the nesting
//  is decided by Resilience4j's aspect order. Default (outermost → innermost):
//  @Retry → @CircuitBreaker → @RateLimiter → @TimeLimiter → @Bulkhead → method
//
//  To change it, set the aspect-order properties, e.g.
//  resilience4j.retry.retryAspectOrder,
//  resilience4j.circuitbreaker.circuitBreakerAspectOrder,
//  resilience4j.ratelimiter.rateLimiterAspectOrder,
//  resilience4j.timelimiter.timeLimiterAspectOrder,
//  resilience4j.bulkhead.bulkheadAspectOrder.
//
@Service
@RequiredArgsConstructor
public class RobustPaymentService {

    private final PaymentClient paymentClient;

    /**
     * Starts the charge asynchronously behind all five resilience aspects.
     * The annotations are listed in the default nesting order for readability.
     *
     * @param request the payment to charge
     * @return a future with the client's response, or with the
     *         {@code fallback} result once all retry attempts have failed
     */
    // The fallback sits on the outermost aspect: on an inner one it would hide
    // failures from every aspect that wraps it.
    @Retry(name = "paymentService",
           fallbackMethod = "fallback")         // 1st (outermost): retry on failure
    @CircuitBreaker(name = "paymentService")    // 2nd: check circuit state
    @RateLimiter(name = "paymentService")       // 3rd: check rate limit
    @TimeLimiter(name = "paymentService")       // 4th: enforce timeout
    @Bulkhead(name = "paymentService")          // 5th (innermost): check concurrency
    public CompletableFuture<PaymentResponse> charge(
            PaymentRequest request) {
        return CompletableFuture.supplyAsync(
            () -> paymentClient.charge(request));
    }

    /**
     * Fallback for any failure; always answers with a deferred response.
     */
    private CompletableFuture<PaymentResponse> fallback(
            PaymentRequest request, Throwable ex) {
        return CompletableFuture.completedFuture(
            PaymentResponse.deferred(request.getOrderId()));
    }
}

Metrics and Monitoring

Resilience4j publishes metrics and events for every module via Micrometer, which integrates with Spring Boot Actuator. Circuit breaker state transitions, retry attempts, bulkhead rejections, and rate limiter events are all observable. These metrics can be scraped by Prometheus and visualised in Grafana.
Java
// ── Key metrics published by Resilience4j: ───────────────────────────
//
// CircuitBreaker:
//   resilience4j_circuitbreaker_state              (0=CLOSED,1=OPEN,2=HALF_OPEN)
//   resilience4j_circuitbreaker_failure_rate
//   resilience4j_circuitbreaker_slow_call_rate
//   resilience4j_circuitbreaker_calls_total        (kind=successful/failed/ignored)
//   resilience4j_circuitbreaker_not_permitted_calls_total
//
// Retry:
//   resilience4j_retry_calls_total                 (kind=successful_with_retry/
//                                                         failed_with_retry/
//                                                         failed_without_retry)
// Bulkhead:
//   resilience4j_bulkhead_available_concurrent_calls
//   resilience4j_bulkhead_max_allowed_concurrent_calls
//
// RateLimiter:
//   resilience4j_ratelimiter_available_permissions
//   resilience4j_ratelimiter_waiting_threads

// ── Listen to events programmatically: ───────────────────────────────
@Component
@RequiredArgsConstructor
@Slf4j
public class CircuitBreakerEventLogger {

    private final CircuitBreakerRegistry registry;

    @PostConstruct
    public void registerListeners() {
        CircuitBreaker cb = registry.circuitBreaker("paymentService");

        cb.getEventPublisher()
            .onStateTransition(e -> log.warn(
                "CircuitBreaker [{}] state change: {} → {}",
                e.getCircuitBreakerName(),
                e.getStateTransition().getFromState(),
                e.getStateTransition().getToState()))
            .onFailureRateExceeded(e -> log.error(
                "CircuitBreaker [{}] failure rate: {}%",
                e.getCircuitBreakerName(),
                e.getFailureRate()))
            .onCallNotPermitted(e -> log.warn(
                "CircuitBreaker [{}] call rejected — circuit is OPEN",
                e.getCircuitBreakerName()));
    }
}

// ── application.yml — enable all Actuator endpoints: ─────────────────
// management:
//   endpoints:
//     web:
//       exposure:
//         include: "*"
//   endpoint:
//     health:
//       show-details: always
//   health:
//     circuitbreakers:
//       enabled: true
//
// Sample /actuator/health response when circuit is open:
// {
//   "status": "DOWN",
//   "components": {
//     "circuitBreakers": {
//       "status": "DOWN",
//       "details": {
//         "paymentService": {
//           "status": "CIRCUIT_OPEN",
//           "details": {
//             "failureRate":       "60.0%",
//             "slowCallRate":      "0.0%",
//             "bufferedCalls":     10,
//             "failedCalls":        6,
//             "state":             "OPEN"
//           }
//         }
//       }
//     }
//   }
// }