llm-quant/app/llm/rate_limit.py

49 lines
1.7 KiB
Python

"""Simple token-bucket rate limiter for LLM calls."""
from __future__ import annotations
from threading import Lock
from time import monotonic
from typing import Callable, Dict
class RateLimiter:
"""Token bucket rate limiter that returns required wait time."""
def __init__(self, monotonic_func: Callable[[], float] | None = None) -> None:
self._now = monotonic_func or monotonic
self._lock = Lock()
self._buckets: Dict[str, dict[str, float]] = {}
def acquire(self, key: str, rate_per_minute: int, burst: int) -> float:
"""Attempt to consume a token; return wait time if throttled."""
if rate_per_minute <= 0:
return 0.0
capacity = float(max(1, burst if burst > 0 else rate_per_minute))
rate = float(rate_per_minute)
now = self._now()
with self._lock:
bucket = self._buckets.get(key)
if bucket is None:
bucket = {"tokens": capacity, "capacity": capacity, "last": now, "rate": rate}
self._buckets[key] = bucket
else:
bucket["capacity"] = capacity
bucket["rate"] = rate
tokens = bucket["tokens"]
elapsed = max(0.0, now - bucket["last"])
tokens = min(capacity, tokens + elapsed * rate / 60.0)
if tokens >= 1.0:
bucket["tokens"] = tokens - 1.0
bucket["last"] = now
return 0.0
bucket["tokens"] = tokens
bucket["last"] = now
deficit = 1.0 - tokens
wait_time = deficit * 60.0 / rate
return max(wait_time, 0.0)
def reset(self) -> None:
with self._lock:
self._buckets.clear()