# llm-quant/app/llm/rate_limit.py

"""Simple token-bucket rate limiter for LLM calls."""
from __future__ import annotations

from threading import Lock
from time import monotonic
from typing import Callable, Dict


class RateLimiter:
    """Token bucket rate limiter that returns required wait time.

    Every ``key`` gets its own bucket. A bucket starts full, refills
    continuously at ``rate_per_minute / 60`` tokens per second up to its
    capacity, and each successful :meth:`acquire` removes one token. The
    limiter never sleeps itself; it only reports how long the caller would
    have to wait for the next token.
    """

    def __init__(self, monotonic_func: Callable[[], float] | None = None) -> None:
        """Create an empty limiter.

        Args:
            monotonic_func: Zero-argument callable returning the current time
                in seconds. Falls back to ``time.monotonic`` when omitted
                (or falsy); mainly useful for injecting a fake clock.
        """
        self._now = monotonic_func or monotonic
        self._lock = Lock()
        # key -> {"tokens", "capacity", "last", "rate"}
        self._buckets: Dict[str, dict[str, float]] = {}

    def acquire(self, key: str, rate_per_minute: int, burst: int) -> float:
        """Attempt to consume a token; return wait time if throttled.

        Args:
            key: Identifier of the bucket to draw from.
            rate_per_minute: Refill rate in tokens per minute. A value of
                zero or less disables limiting for this call.
            burst: Bucket capacity. When not positive, ``rate_per_minute`` is
                used as the capacity instead; the capacity is never below 1.

        Returns:
            ``0.0`` when a token was consumed (or limiting is disabled),
            otherwise the number of seconds until one full token will be
            available. No token is consumed in the throttled case.
        """
        if rate_per_minute <= 0:
            return 0.0
        capacity = float(max(1, burst if burst > 0 else rate_per_minute))
        rate = float(rate_per_minute)
        with self._lock:
            # Read the clock while holding the lock so that timestamps are
            # observed in the same order in which buckets are updated.
            now = self._now()
            bucket = self._buckets.get(key)
            if bucket is None:
                bucket = {"tokens": capacity, "capacity": capacity, "last": now, "rate": rate}
                self._buckets[key] = bucket
            else:
                # The latest caller-supplied limits always win.
                bucket["capacity"] = capacity
                bucket["rate"] = rate

            elapsed = max(0.0, now - bucket["last"])
            tokens = min(capacity, bucket["tokens"] + elapsed * rate / 60.0)
            # Never move the refill timestamp backwards: rewinding it would
            # credit the same interval again on the next call.
            bucket["last"] = max(bucket["last"], now)

            if tokens >= 1.0:
                bucket["tokens"] = tokens - 1.0
                return 0.0

            bucket["tokens"] = tokens
            # tokens < 1.0 here, so the deficit (and the wait) is positive.
            return (1.0 - tokens) * 60.0 / rate

    def reset(self) -> None:
        """Forget every bucket so all keys start again with a full bucket."""
        with self._lock:
            self._buckets.clear()