Spring Boot — Rate Limiting
Spring Boot

Rate Limiting

Rate limiting controls the number of requests a client is permitted to make within a time window. It protects microservices from being overwhelmed by excessive traffic, prevents abuse, enforces fair usage across clients, and reduces infrastructure costs. Rate limiting can be applied at the API Gateway level (protecting all services) or at the individual service level (protecting specific endpoints).

Rate Limiting Algorithms

Several algorithms implement rate limiting, each with different trade-offs between simplicity, memory usage, and burst handling. The most widely used in production are the token bucket (used by Spring Cloud Gateway's Redis rate limiter) and the sliding window log.
Java
// ── 1. TOKEN BUCKET ───────────────────────────────────────────────────
//
//  A bucket holds up to CAPACITY tokens.
//  Tokens are added at REFILL_RATE per second.
//  Each request consumes 1 (or more) tokens.
//  If the bucket is empty → reject with 429.
//
//  Bucket capacity: 10 tokens
//  Refill rate:      5 tokens/second
//
//  t=0s   Bucket: 10/10   8 requests → Bucket: 2/10  (all pass)
//  t=0s   2 more requests → Bucket: 0/10  (pass), next request → 429
//  t=1s   Bucket: 5/10   (refilled by 5) → more requests allowed
//
//  Pros:  allows short bursts (up to capacity), smooth long-term rate
//  Cons:  two parameters to tune (capacity + refill rate)
//  Used by: Spring Cloud Gateway Redis RateLimiter

// ── 2. SLIDING WINDOW LOG ─────────────────────────────────────────────
//
//  Keep a log of timestamps of all requests in the last N seconds.
//  If log size >= limit → reject.
//  Evict entries older than N seconds on each check.
//
//  Limit: 5 requests per 10 seconds
//
//  t=1s  Request → log: [1]         → allow
//  t=3s  Request → log: [1,3]       → allow
//  t=5s  Request → log: [1,3,5]     → allow
//  t=6s  Request → log: [1,3,5,6]   → allow
//  t=7s  Request → log: [1,3,5,6,7] → allow (5th)
//  t=8s  Request → log size = 5     → REJECT 429
//  t=11s Evict t=1 → log: [3,5,6,7] → allow → log: [3,5,6,7,11]
//
//  Pros:  very accurate, no burst overshoot
//  Cons:  high memory (stores every request timestamp)

// ── 3. FIXED WINDOW COUNTER ───────────────────────────────────────────
//
//  Count requests in fixed time windows (e.g. each minute).
//  Reset counter at window boundary.
//  Limit: 100 requests per minute
//
//  Problem — boundary burst:
//  t=0:59  99 requests (window 1, counter=99)  → all pass
//  t=1:00  Counter resets to 0
//  t=1:01  99 requests (window 2, counter=99)  → all pass
//  → 198 requests pass in 2 seconds at the boundary.
//
//  Pros:  simple, low memory (just one counter per window)
//  Cons:  boundary burst allows 2× the intended rate

// ── 4. SLIDING WINDOW COUNTER (compromise): ──────────────────────────
//
//  Combines fixed windows with a weighted rolling calculation.
//  current_count = prev_window_count × (1 - elapsed/window)
//                + current_window_count
//
//  Smooths out boundary bursts without storing every timestamp.
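//  Typically built on per-window Redis counters (INCR + EXPIRE) combined
//  with the weighting above. (INCR + EXPIRE alone is a fixed window, and
//  Resilience4j's RateLimiter uses fixed refresh cycles — closer to
//  algorithm 3 than to a sliding window.)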

Rate Limiting at the API Gateway (Spring Cloud Gateway)

Spring Cloud Gateway's built-in RequestRateLimiter filter uses Redis to store token buckets — one bucket per key returned by the configured key resolver. Redis is used because the check-and-decrement runs as a single atomic operation (a Lua script) and because the bucket state is shared across all gateway instances in a cluster. The filter adds rate-limit headers to responses so clients know their current quota.
Java
<!-- pom.xml — add Redis reactive for rate limiting: -->
<!-- <dependency>
    <groupId>org.springframework.boot</groupId>
    <artifactId>spring-boot-starter-data-redis-reactive</artifactId>
</dependency> -->

// ── Key resolvers — determine the rate-limit bucket per request: ──────
/**
 * Declares the {@link KeyResolver} beans that decide which rate-limit bucket
 * a request is counted against. Each resolver maps an exchange to a string
 * key and substitutes a fixed placeholder when the source value is absent.
 */
@Configuration
public class RateLimiterKeyResolvers {

    /**
     * Keys requests by the name of the exchange's principal.
     * Marked {@code @Primary}; yields {@code "anonymous"} when the exchange
     * has no principal.
     */
    @Bean
    @Primary
    public KeyResolver userKeyResolver() {
        return exchange -> {
            Mono<String> principalName = exchange.getPrincipal().map(Principal::getName);
            return principalName.defaultIfEmpty("anonymous");
        };
    }

    /**
     * Keys requests by the host address of the remote socket.
     * Yields {@code "unknown"} when the request carries no remote address.
     */
    @Bean
    public KeyResolver ipKeyResolver() {
        return exchange -> {
            Mono<String> hostAddress = Mono
                .justOrEmpty(exchange.getRequest().getRemoteAddress())
                .map(remote -> remote.getAddress().getHostAddress());
            return hostAddress.defaultIfEmpty("unknown");
        };
    }

    /**
     * Keys requests by the first {@code X-Api-Key} header value.
     * Yields {@code "no-key"} when the header is missing.
     */
    @Bean
    public KeyResolver apiKeyResolver() {
        return exchange -> {
            String apiKey = exchange.getRequest().getHeaders().getFirst("X-Api-Key");
            return Mono.justOrEmpty(apiKey).defaultIfEmpty("no-key");
        };
    }
}

// ── application.yml — routes with rate limiting: ──────────────────────
// spring:
//   data:
//     redis:
//       host: localhost
//       port: 6379
//
//   cloud:
//     gateway:
//       routes:
//         - id: order-service
//           uri: lb://order-service
//           predicates:
//             - Path=/api/orders/**
//           filters:
//             - StripPrefix=1
//             - name: RequestRateLimiter
//               args:
//                 redis-rate-limiter:
//                   replenish-rate: 20      # tokens added per second
//                   burst-capacity: 40      # max burst (bucket size)
//                   requested-tokens: 1     # tokens per request
//                 key-resolver: "#{@userKeyResolver}"
//
//         - id: search-service
//           uri: lb://search-service
//           predicates:
//             - Path=/api/search/**
//           filters:
//             - name: RequestRateLimiter
//               args:
//                 redis-rate-limiter:
//                   replenish-rate: 5       # search is expensive — lower limit
//                   burst-capacity: 10
//                 key-resolver: "#{@ipKeyResolver}"

// ── Response headers added by the filter: ────────────────────────────
// X-RateLimit-Replenish-Rate: 20
// X-RateLimit-Burst-Capacity: 40
// X-RateLimit-Remaining:       15     ← tokens left in the bucket
// HTTP 429 Too Many Requests          ← when bucket is empty

Rate Limiting at the Service Level (Resilience4j)

For rate limiting within a microservice — for example, limiting how many calls it makes to a third-party external API — Resilience4j RateLimiter is the right tool. It limits the outbound call rate from one service to another, preventing the calling service from overwhelming a dependency or exceeding a paid API quota.
Java
// ── application.yml configuration: ───────────────────────────────────
// resilience4j:
//   ratelimiter:
//     instances:
//       stripeApi:
//         limit-for-period: 100        # max 100 calls per refresh period
//         limit-refresh-period: 1s     # period resets every second
//         timeout-duration: 500ms      # wait up to 500ms for a permit
//                                      # 0ms = fail immediately if no permit
//       sendgridApi:
//         limit-for-period: 10         # max 10 emails per minute
//         limit-refresh-period: 1m
//         timeout-duration: 0ms

// ── Annotation-based rate limiting: ──────────────────────────────────
/**
 * Wraps outbound Stripe charge calls in the {@code stripeApi} rate limiter.
 * When no permit is available the charge is queued instead of being sent.
 */
@Service
@RequiredArgsConstructor
@Slf4j
public class PaymentGatewayService {

    private final StripeClient stripeClient;

    // Queue used by the fallback below. The original snippet referenced
    // chargeQueue without declaring it, so the class did not compile.
    // NOTE(review): the type name ChargeQueue is assumed — substitute the
    // project's actual queue abstraction exposing enqueue(ChargeRequest).
    private final ChargeQueue chargeQueue;

    /**
     * Submits a charge through the Stripe client, guarded by the
     * {@code stripeApi} rate limiter.
     *
     * @param request the charge to submit
     * @return the client's response, or the result of
     *         {@link #rateLimitedFallback} when the limiter rejects the call
     */
    @RateLimiter(
        name = "stripeApi",
        fallbackMethod = "rateLimitedFallback"
    )
    public ChargeResponse charge(ChargeRequest request) {
        return stripeClient.charge(request);
    }

    /**
     * Fallback for {@link #charge}: logs the rejection, enqueues the request
     * and reports it as queued.
     *
     * @param request the charge that was not permitted
     * @param ex      the rejection raised by the rate limiter
     * @return a queued response carrying the request's order id
     */
    private ChargeResponse rateLimitedFallback(
            ChargeRequest request, RequestNotPermitted ex) {
        log.warn("Stripe API rate limit reached — queuing charge for {}",
            request.getOrderId());
        chargeQueue.enqueue(request);
        return ChargeResponse.queued(request.getOrderId());
    }
}

// ── Programmatic rate limiter with dynamic configuration: ─────────────
/**
 * Runs caller-supplied work under a rate limiter that is looked up (or
 * created) per tenant in the shared {@link RateLimiterRegistry}.
 */
@Service
@RequiredArgsConstructor
public class TenantAwareRateLimiter {

    private final RateLimiterRegistry rateLimiterRegistry;

    /**
     * Executes {@code call} under the limiter registered as
     * {@code "tenant-" + tenantId}.
     *
     * @param tenantId          tenant whose limiter guards the call
     * @param requestsPerSecond permits per one-second refresh period used
     *                          when building the limiter configuration
     * @param call              the work to run once a permit is obtained
     * @param <T>               result type of {@code call}
     * @return whatever {@code call} returns
     */
    public <T> T executeWithTenantLimit(
            String tenantId,
            int requestsPerSecond,
            Supplier<T> call) {

        String limiterName = "tenant-" + tenantId;

        // Waits up to 200 ms for a permit within a 1-second refresh period.
        RateLimiterConfig tenantConfig = RateLimiterConfig.custom()
            .limitForPeriod(requestsPerSecond)
            .limitRefreshPeriod(Duration.ofSeconds(1))
            .timeoutDuration(Duration.ofMillis(200))
            .build();

        // The registry hands back the limiter already stored under this name
        // when there is one; presumably the config above then only applies on
        // first creation — confirm if per-tenant rates can change at runtime.
        RateLimiter tenantLimiter =
            rateLimiterRegistry.rateLimiter(limiterName, tenantConfig);

        Supplier<T> guardedCall = RateLimiter.decorateSupplier(tenantLimiter, call);
        return guardedCall.get();
    }
}

Tiered Rate Limiting

Different clients often deserve different rate limits — free tier users get fewer requests than paid subscribers, internal services are unrestricted, and partner APIs have contractually defined limits. Tiered rate limiting reads the client's tier from the JWT claims or an API key lookup and applies the corresponding bucket configuration.
Java
// ── Tier-aware global rate-limiting filter: ──────────────────────────
@Component
@RequiredArgsConstructor
public class TieredRateLimitingFilter implements GlobalFilter, Ordered {

    private final RedisTemplate<String, String> redisTemplate;

    // Token bucket limits per tier (tokens/second : burst):
    private static final Map<String, int[]> TIER_LIMITS = Map.of(
        "FREE",     new int[]{10,  20},
        "PRO",      new int[]{100, 200},
        "ENTERPRISE", new int[]{500, 1000},
        "INTERNAL", new int[]{10000, 10000}
    );

    @Override
    public Mono<Void> filter(ServerWebExchange exchange,
                             GatewayFilterChain chain) {
        String tier = extractTier(exchange);
        String userId = extractUserId(exchange);
        int[] limits = TIER_LIMITS.getOrDefault(tier, TIER_LIMITS.get("FREE"));

        String bucketKey = "rate:" + tier + ":" + userId;
        boolean allowed = checkAndDecrementBucket(
            bucketKey, limits[0], limits[1]);

        if (!allowed) {
            ServerHttpResponse response = exchange.getResponse();
            response.setStatusCode(HttpStatus.TOO_MANY_REQUESTS);
            response.getHeaders().add("Retry-After", "1");
            response.getHeaders().add("X-RateLimit-Tier", tier);
            return response.setComplete();
        }

        exchange.getResponse().getHeaders()
            .add("X-RateLimit-Tier", tier);
        return chain.filter(exchange);
    }

    private String extractTier(ServerWebExchange exchange) {
        // Read tier from header set by JWT auth filter:
        return Optional.ofNullable(
                exchange.getRequest()
                    .getHeaders()
                    .getFirst("X-User-Tier"))
            .orElse("FREE");
    }

    private String extractUserId(ServerWebExchange exchange) {
        return Optional.ofNullable(
                exchange.getRequest()
                    .getHeaders()
                    .getFirst("X-User-Id"))
            .orElseGet(() ->
                exchange.getRequest()
                    .getRemoteAddress()
                    .getAddress()
                    .getHostAddress());
    }

    private boolean checkAndDecrementBucket(
            String key, int refillRate, int capacity) {
        // Lua script ensures atomic check-and-decrement in Redis:
        Long remaining = redisTemplate.execute(
            RATE_LIMIT_SCRIPT,
            Collections.singletonList(key),
            String.valueOf(capacity),
            String.valueOf(refillRate)
        );
        return remaining != null && remaining >= 0;
    }

    @Override
    public int getOrder() { return -2; }  // run after auth filter
}

Rate Limit Response Headers and Client Guidance

When a request is rate-limited the service should return HTTP 429 with headers that tell the client exactly why they were limited and when they can retry. Well-formed rate-limit responses allow clients to implement automatic backoff rather than hammering the server with retries.
Java
// ── Standard rate-limit response headers: ────────────────────────────
//
// X-RateLimit-Limit:      100    ← max requests allowed in this window
// X-RateLimit-Remaining:  0      ← requests left in current window
// X-RateLimit-Reset:      1717000060  ← Unix timestamp when window resets
// Retry-After:            30     ← seconds to wait before retrying
// HTTP/1.1 429 Too Many Requests

// ── Custom 429 response with guidance: ───────────────────────────────
@RestControllerAdvice
public class RateLimitExceptionHandler {

    @ExceptionHandler(RequestNotPermitted.class)
    public ResponseEntity<RateLimitErrorResponse> handleRateLimit(
            RequestNotPermitted ex,
            HttpServletRequest request) {

        long resetAt = Instant.now().plusSeconds(1).getEpochSecond();

        RateLimitErrorResponse body = RateLimitErrorResponse.builder()
            .status(429)
            .error("Too Many Requests")
            .message("Rate limit exceeded. " +
                     "Please slow down and retry after 1 second.")
            .retryAfterSeconds(1)
            .documentationUrl("https://api.example.com/docs/rate-limits")
            .build();

        return ResponseEntity
            .status(HttpStatus.TOO_MANY_REQUESTS)
            .header("X-RateLimit-Limit",     "100")
            .header("X-RateLimit-Remaining", "0")
            .header("X-RateLimit-Reset",     String.valueOf(resetAt))
            .header("Retry-After",           "1")
            .body(body);
    }
}

// ── Client-side retry with exponential backoff on 429: ────────────────
/**
 * Retries a rate-limited API call with exponential backoff plus jitter.
 */
@Service
@Slf4j
public class ApiClientWithBackoff {

    /** Total number of attempts, including the first one. */
    private static final int MAX_RETRIES = 4;

    /** Base delay; the wait before retry {@code n} is this value shifted left by n. */
    private static final long BASE_DELAY_MS = 100;

    /** Exclusive upper bound of the random jitter added to each wait. */
    private static final long MAX_JITTER_MS = 50;

    /**
     * Invokes {@code apiCall}, retrying while it throws
     * {@link RateLimitException}.
     *
     * @param apiCall the call to execute
     * @param <T>     result type of the call
     * @return the first successful result
     * @throws RateLimitException when the final attempt is still rate limited,
     *         or when the thread is interrupted while waiting (the interrupt
     *         flag is restored and the interruption attached as suppressed)
     */
    public <T> T callWithBackoff(Supplier<T> apiCall) {
        int attempt = 0;
        while (true) {
            try {
                return apiCall.get();
            } catch (RateLimitException ex) {
                attempt++;
                if (attempt >= MAX_RETRIES) {
                    throw ex;
                }
                long waitMs = BASE_DELAY_MS << attempt;  // 200, 400, 800 ms
                long jitter = ThreadLocalRandom.current().nextLong(MAX_JITTER_MS);
                long delayMs = waitMs + jitter;
                log.warn("Rate limited — retry {} in {}ms", attempt, delayMs);
                try {
                    Thread.sleep(delayMs);
                } catch (InterruptedException interrupted) {
                    // Stop retrying, but keep the interrupt visible to callers.
                    Thread.currentThread().interrupt();
                    ex.addSuppressed(interrupted);
                    throw ex;
                }
            }
        }
    }
}