49 lines
1.7 KiB
Python
49 lines
1.7 KiB
Python
"""Simple token-bucket rate limiter for LLM calls."""
|
|
from __future__ import annotations
|
|
|
|
from threading import Lock
|
|
from time import monotonic
|
|
from typing import Callable, Dict
|
|
|
|
|
|
class RateLimiter:
|
|
"""Token bucket rate limiter that returns required wait time."""
|
|
|
|
def __init__(self, monotonic_func: Callable[[], float] | None = None) -> None:
|
|
self._now = monotonic_func or monotonic
|
|
self._lock = Lock()
|
|
self._buckets: Dict[str, dict[str, float]] = {}
|
|
|
|
def acquire(self, key: str, rate_per_minute: int, burst: int) -> float:
|
|
"""Attempt to consume a token; return wait time if throttled."""
|
|
|
|
if rate_per_minute <= 0:
|
|
return 0.0
|
|
capacity = float(max(1, burst if burst > 0 else rate_per_minute))
|
|
rate = float(rate_per_minute)
|
|
now = self._now()
|
|
with self._lock:
|
|
bucket = self._buckets.get(key)
|
|
if bucket is None:
|
|
bucket = {"tokens": capacity, "capacity": capacity, "last": now, "rate": rate}
|
|
self._buckets[key] = bucket
|
|
else:
|
|
bucket["capacity"] = capacity
|
|
bucket["rate"] = rate
|
|
tokens = bucket["tokens"]
|
|
elapsed = max(0.0, now - bucket["last"])
|
|
tokens = min(capacity, tokens + elapsed * rate / 60.0)
|
|
if tokens >= 1.0:
|
|
bucket["tokens"] = tokens - 1.0
|
|
bucket["last"] = now
|
|
return 0.0
|
|
bucket["tokens"] = tokens
|
|
bucket["last"] = now
|
|
deficit = 1.0 - tokens
|
|
wait_time = deficit * 60.0 / rate
|
|
return max(wait_time, 0.0)
|
|
|
|
def reset(self) -> None:
|
|
with self._lock:
|
|
self._buckets.clear()
|