add buildbot extension for autoscaling nix builders
This commit is contained in:
parent
ea12318b88
commit
d1976a5fd8
13 changed files with 2300 additions and 8 deletions
12
buildbot-ext/buildbot_autoscale_ext/__init__.py
Normal file
12
buildbot-ext/buildbot_autoscale_ext/__init__.py
Normal file
|
|
@ -0,0 +1,12 @@
|
|||
"""Buildbot autoscale extension package."""
|
||||
|
||||
from .configurator import AutoscaleConfigurator
|
||||
from .settings import AutoscaleSettings
|
||||
from .steps import CapacityGateStep, CapacityReleaseStep
|
||||
|
||||
__all__ = [
|
||||
"AutoscaleConfigurator",
|
||||
"AutoscaleSettings",
|
||||
"CapacityGateStep",
|
||||
"CapacityReleaseStep",
|
||||
]
|
||||
169
buildbot-ext/buildbot_autoscale_ext/client.py
Normal file
169
buildbot-ext/buildbot_autoscale_ext/client.py
Normal file
|
|
@ -0,0 +1,169 @@
|
|||
from __future__ import annotations
|
||||
|
||||
import http.client
|
||||
import json
|
||||
import random
|
||||
import socket
|
||||
import time
|
||||
from dataclasses import dataclass
|
||||
from typing import Any
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class RetryPolicy:
|
||||
max_attempts: int
|
||||
base_seconds: float
|
||||
max_seconds: float
|
||||
|
||||
|
||||
class DaemonError(RuntimeError):
|
||||
def __init__(
|
||||
self,
|
||||
message: str,
|
||||
*,
|
||||
path: str,
|
||||
status: int | None = None,
|
||||
response: dict[str, Any] | None = None,
|
||||
cause: Exception | None = None,
|
||||
) -> None:
|
||||
super().__init__(message)
|
||||
self.path = path
|
||||
self.status = status
|
||||
self.response = response
|
||||
self.cause = cause
|
||||
|
||||
|
||||
class UnixSocketHTTPConnection(http.client.HTTPConnection):
|
||||
def __init__(self, socket_path: str, timeout: float) -> None:
|
||||
super().__init__(host="localhost", port=0, timeout=timeout)
|
||||
self._socket_path = socket_path
|
||||
|
||||
def connect(self) -> None:
|
||||
self.sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
|
||||
self.sock.settimeout(self.timeout)
|
||||
self.sock.connect(self._socket_path)
|
||||
|
||||
|
||||
class DaemonClient:
|
||||
def __init__(self, socket_path: str, retry_policy: RetryPolicy) -> None:
|
||||
self._socket_path = socket_path
|
||||
self._retry = retry_policy
|
||||
|
||||
def post_json(
|
||||
self,
|
||||
path: str,
|
||||
body: dict[str, Any],
|
||||
timeout_seconds: float,
|
||||
retryable_statuses: set[int],
|
||||
) -> dict[str, Any]:
|
||||
return self._request_json(
|
||||
method="POST",
|
||||
path=path,
|
||||
timeout_seconds=timeout_seconds,
|
||||
retryable_statuses=retryable_statuses,
|
||||
body=body,
|
||||
)
|
||||
|
||||
def get_json(
|
||||
self,
|
||||
path: str,
|
||||
timeout_seconds: float,
|
||||
retryable_statuses: set[int],
|
||||
) -> dict[str, Any]:
|
||||
return self._request_json(
|
||||
method="GET",
|
||||
path=path,
|
||||
timeout_seconds=timeout_seconds,
|
||||
retryable_statuses=retryable_statuses,
|
||||
body=None,
|
||||
)
|
||||
|
||||
def _request_json(
|
||||
self,
|
||||
*,
|
||||
method: str,
|
||||
path: str,
|
||||
timeout_seconds: float,
|
||||
retryable_statuses: set[int],
|
||||
body: dict[str, Any] | None,
|
||||
) -> dict[str, Any]:
|
||||
last_error: DaemonError | None = None
|
||||
for attempt in range(1, self._retry.max_attempts + 1):
|
||||
try:
|
||||
payload = json.dumps(body).encode("utf-8") if body is not None else None
|
||||
response_body, status = self._raw_request(
|
||||
method=method,
|
||||
path=path,
|
||||
timeout_seconds=timeout_seconds,
|
||||
payload=payload,
|
||||
)
|
||||
parsed = self._parse_json(response_body, path)
|
||||
|
||||
if 200 <= status < 300:
|
||||
return parsed
|
||||
|
||||
err = DaemonError(
|
||||
f"daemon returned HTTP {status} for {method} {path}",
|
||||
path=path,
|
||||
status=status,
|
||||
response=parsed,
|
||||
)
|
||||
|
||||
retryable = status in retryable_statuses
|
||||
if not retryable:
|
||||
raise err
|
||||
last_error = err
|
||||
except (ConnectionRefusedError, FileNotFoundError, TimeoutError, OSError) as exc:
|
||||
last_error = DaemonError(
|
||||
f"daemon transport error during {method} {path}: {exc}",
|
||||
path=path,
|
||||
cause=exc,
|
||||
)
|
||||
except DaemonError:
|
||||
raise
|
||||
|
||||
if attempt < self._retry.max_attempts:
|
||||
self._sleep_backoff(attempt)
|
||||
|
||||
assert last_error is not None
|
||||
raise last_error
|
||||
|
||||
def _raw_request(
|
||||
self,
|
||||
*,
|
||||
method: str,
|
||||
path: str,
|
||||
timeout_seconds: float,
|
||||
payload: bytes | None,
|
||||
) -> tuple[bytes, int]:
|
||||
conn = UnixSocketHTTPConnection(self._socket_path, timeout=timeout_seconds)
|
||||
headers = {"Accept": "application/json"}
|
||||
if payload is not None:
|
||||
headers["Content-Type"] = "application/json"
|
||||
try:
|
||||
conn.request(method=method, url=path, body=payload, headers=headers)
|
||||
response = conn.getresponse()
|
||||
data = response.read()
|
||||
return data, response.status
|
||||
finally:
|
||||
conn.close()
|
||||
|
||||
@staticmethod
|
||||
def _parse_json(raw: bytes, path: str) -> dict[str, Any]:
|
||||
if not raw:
|
||||
return {}
|
||||
try:
|
||||
data = json.loads(raw.decode("utf-8"))
|
||||
except json.JSONDecodeError as exc:
|
||||
raise DaemonError(
|
||||
f"daemon returned invalid JSON for {path}",
|
||||
path=path,
|
||||
cause=exc,
|
||||
) from exc
|
||||
if not isinstance(data, dict):
|
||||
raise DaemonError(f"daemon returned non-object JSON for {path}", path=path)
|
||||
return data
|
||||
|
||||
def _sleep_backoff(self, attempt: int) -> None:
|
||||
ceiling = min(self._retry.max_seconds, self._retry.base_seconds * (2 ** (attempt - 1)))
|
||||
time.sleep(random.uniform(0.0, ceiling))
|
||||
66
buildbot-ext/buildbot_autoscale_ext/configurator.py
Normal file
66
buildbot-ext/buildbot_autoscale_ext/configurator.py
Normal file
|
|
@ -0,0 +1,66 @@
|
|||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from typing import Any
|
||||
|
||||
from buildbot.configurators import ConfiguratorBase
|
||||
|
||||
from .settings import AutoscaleSettings
|
||||
from .steps import CapacityGateStep, CapacityReleaseStep
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class AutoscaleConfigurator(ConfiguratorBase):
|
||||
def __init__(self, settings: AutoscaleSettings) -> None:
|
||||
super().__init__()
|
||||
self.settings = settings
|
||||
|
||||
def configure(self, config_dict: dict[str, Any]) -> None:
|
||||
builders = config_dict.get("builders", [])
|
||||
patched: list[str] = []
|
||||
|
||||
for builder in builders:
|
||||
name = getattr(builder, "name", "")
|
||||
if not isinstance(name, str) or not name.endswith("/nix-build"):
|
||||
continue
|
||||
|
||||
factory = getattr(builder, "factory", None)
|
||||
steps = getattr(factory, "steps", None)
|
||||
if factory is None or not isinstance(steps, list):
|
||||
log.warning("Skipping builder with unrecognized factory shape: %s", name)
|
||||
continue
|
||||
|
||||
gate = CapacityGateStep(
|
||||
name="Ensure remote builder capacity",
|
||||
daemon_socket=self.settings.daemon_socket,
|
||||
system_property=self.settings.system_property,
|
||||
default_system=self.settings.default_system,
|
||||
reserve_timeout_seconds=self.settings.reserve_timeout_seconds,
|
||||
poll_interval_seconds=self.settings.poll_interval_seconds,
|
||||
retry_max_attempts=self.settings.retry_max_attempts,
|
||||
retry_base_seconds=self.settings.retry_base_seconds,
|
||||
retry_max_seconds=self.settings.retry_max_seconds,
|
||||
haltOnFailure=True,
|
||||
flunkOnFailure=True,
|
||||
warnOnFailure=False,
|
||||
)
|
||||
steps.insert(0, gate)
|
||||
|
||||
if self.settings.release_on_finish:
|
||||
steps.append(
|
||||
CapacityReleaseStep(
|
||||
name="Release autoscaler reservation",
|
||||
daemon_socket=self.settings.daemon_socket,
|
||||
retry_max_attempts=self.settings.retry_max_attempts,
|
||||
retry_base_seconds=self.settings.retry_base_seconds,
|
||||
retry_max_seconds=self.settings.retry_max_seconds,
|
||||
alwaysRun=True,
|
||||
flunkOnFailure=False,
|
||||
warnOnFailure=True,
|
||||
)
|
||||
)
|
||||
|
||||
patched.append(name)
|
||||
|
||||
log.info("AutoscaleConfigurator patched builders: %s", patched)
|
||||
14
buildbot-ext/buildbot_autoscale_ext/settings.py
Normal file
14
buildbot-ext/buildbot_autoscale_ext/settings.py
Normal file
|
|
@ -0,0 +1,14 @@
|
|||
from dataclasses import dataclass
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class AutoscaleSettings:
|
||||
daemon_socket: str
|
||||
system_property: str = "system"
|
||||
default_system: str = "x86_64-linux"
|
||||
reserve_timeout_seconds: int = 900
|
||||
poll_interval_seconds: float = 5.0
|
||||
retry_max_attempts: int = 5
|
||||
retry_base_seconds: float = 0.5
|
||||
retry_max_seconds: float = 5.0
|
||||
release_on_finish: bool = True
|
||||
199
buildbot-ext/buildbot_autoscale_ext/steps.py
Normal file
199
buildbot-ext/buildbot_autoscale_ext/steps.py
Normal file
|
|
@ -0,0 +1,199 @@
|
|||
from __future__ import annotations
|
||||
|
||||
import time
|
||||
from typing import Any, cast
|
||||
|
||||
from buildbot.plugins import util
|
||||
from buildbot.process import buildstep
|
||||
from twisted.internet import defer, reactor
|
||||
from twisted.internet.interfaces import IReactorTime
|
||||
from twisted.internet.task import deferLater
|
||||
from twisted.internet.threads import deferToThread
|
||||
|
||||
from .client import DaemonClient, DaemonError, RetryPolicy
|
||||
from .telemetry import phase_message
|
||||
|
||||
|
||||
class CapacityGateStep(buildstep.BuildStep):
|
||||
renderables = ()
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
*,
|
||||
daemon_socket: str,
|
||||
system_property: str,
|
||||
default_system: str,
|
||||
reserve_timeout_seconds: int,
|
||||
poll_interval_seconds: float,
|
||||
retry_max_attempts: int,
|
||||
retry_base_seconds: float,
|
||||
retry_max_seconds: float = 5.0,
|
||||
**kwargs: object,
|
||||
) -> None:
|
||||
super().__init__(**kwargs)
|
||||
self._system_property = system_property
|
||||
self._default_system = default_system
|
||||
self._reserve_timeout_seconds = reserve_timeout_seconds
|
||||
self._poll_interval_seconds = poll_interval_seconds
|
||||
self._client = DaemonClient(
|
||||
socket_path=daemon_socket,
|
||||
retry_policy=RetryPolicy(
|
||||
max_attempts=retry_max_attempts,
|
||||
base_seconds=retry_base_seconds,
|
||||
max_seconds=retry_max_seconds,
|
||||
),
|
||||
)
|
||||
|
||||
def _determine_system(self) -> str:
|
||||
if self.build is None:
|
||||
return self._default_system
|
||||
props = self.build.getProperties()
|
||||
value = props.getProperty(self._system_property)
|
||||
if value:
|
||||
return str(value)
|
||||
return self._default_system
|
||||
|
||||
def run(self) -> defer.Deferred[int]:
|
||||
return defer.ensureDeferred(self._run())
|
||||
|
||||
async def _run(self) -> int:
|
||||
system = self._determine_system()
|
||||
start = time.monotonic()
|
||||
|
||||
try:
|
||||
reserve = await deferToThread(
|
||||
self._client.post_json,
|
||||
"/v1/reservations",
|
||||
{
|
||||
"system": system,
|
||||
"reason": "buildbot-nix-build",
|
||||
"build_id": getattr(self.build, "buildid", None),
|
||||
},
|
||||
10.0,
|
||||
{429, 500, 502, 503, 504},
|
||||
)
|
||||
except Exception as exc: # noqa: BLE001
|
||||
await self._add_log("autoscale_gate_reserve_error", f"reserve failed: {exc}")
|
||||
self.descriptionDone = ["capacity reservation failed after retries"]
|
||||
return util.FAILURE
|
||||
|
||||
reservation_id = str(reserve["reservation_id"])
|
||||
self._set_property("autoscale_reservation_id", reservation_id)
|
||||
|
||||
last_phase: str | None = "pending"
|
||||
last_reason: str | None = None
|
||||
|
||||
while True:
|
||||
elapsed = time.monotonic() - start
|
||||
self.descriptionSuffix = [f"phase={last_phase} elapsed={int(elapsed)}s"]
|
||||
if elapsed > self._reserve_timeout_seconds:
|
||||
await self._add_log(
|
||||
"autoscale_gate_timeout",
|
||||
f"capacity wait timeout {phase_message(last_phase, last_reason)}",
|
||||
)
|
||||
self.descriptionDone = ["capacity wait timeout"]
|
||||
return util.FAILURE
|
||||
|
||||
try:
|
||||
status = await deferToThread(
|
||||
self._client.get_json,
|
||||
f"/v1/reservations/{reservation_id}",
|
||||
10.0,
|
||||
{429, 500, 502, 503, 504},
|
||||
)
|
||||
except DaemonError as exc:
|
||||
last_reason = str(exc)
|
||||
await deferLater(
|
||||
cast(IReactorTime, reactor),
|
||||
self._poll_interval_seconds,
|
||||
lambda: None,
|
||||
)
|
||||
continue
|
||||
|
||||
phase = str(status.get("phase", "pending"))
|
||||
reason = status.get("reason")
|
||||
last_phase = phase
|
||||
last_reason = str(reason) if reason is not None else None
|
||||
|
||||
if phase == "ready":
|
||||
slot = str(status["slot"])
|
||||
instance_id = str(status["instance_id"])
|
||||
waited = int(time.monotonic() - start)
|
||||
self._set_property("autoscale_slot", slot)
|
||||
self._set_property("autoscale_instance_id", instance_id)
|
||||
self._set_property("autoscale_wait_seconds", waited)
|
||||
self.descriptionDone = [f"capacity ready in {waited}s"]
|
||||
self.descriptionSuffix = [f"phase={phase}"]
|
||||
return util.SUCCESS
|
||||
|
||||
if phase in {"failed", "expired", "released"}:
|
||||
await self._add_log(
|
||||
"autoscale_gate_failure",
|
||||
f"capacity gate terminal {phase_message(last_phase, last_reason)}",
|
||||
)
|
||||
self.descriptionDone = [f"autoscaler reservation {phase}"]
|
||||
self.descriptionSuffix = [f"phase={phase}"]
|
||||
return util.FAILURE
|
||||
|
||||
await deferLater(
|
||||
cast(IReactorTime, reactor),
|
||||
self._poll_interval_seconds,
|
||||
lambda: None,
|
||||
)
|
||||
|
||||
def _set_property(self, name: str, value: object) -> None:
|
||||
if self.build is None:
|
||||
return
|
||||
self.build.setProperty(name, value, "autoscale")
|
||||
|
||||
async def _add_log(self, name: str, message: str) -> None:
|
||||
log = cast(Any, await self.addLog(name))
|
||||
log.addStderr(f"{message}\n")
|
||||
|
||||
|
||||
class CapacityReleaseStep(buildstep.BuildStep):
|
||||
def __init__(
|
||||
self,
|
||||
*,
|
||||
daemon_socket: str,
|
||||
retry_max_attempts: int,
|
||||
retry_base_seconds: float,
|
||||
retry_max_seconds: float = 5.0,
|
||||
**kwargs: object,
|
||||
) -> None:
|
||||
super().__init__(**kwargs)
|
||||
self._client = DaemonClient(
|
||||
socket_path=daemon_socket,
|
||||
retry_policy=RetryPolicy(
|
||||
max_attempts=retry_max_attempts,
|
||||
base_seconds=retry_base_seconds,
|
||||
max_seconds=retry_max_seconds,
|
||||
),
|
||||
)
|
||||
|
||||
def run(self) -> defer.Deferred[int]:
|
||||
return defer.ensureDeferred(self._run())
|
||||
|
||||
async def _run(self) -> int:
|
||||
if self.build is None:
|
||||
return util.SKIPPED
|
||||
|
||||
reservation_id = self.build.getProperty("autoscale_reservation_id")
|
||||
if not reservation_id:
|
||||
return util.SKIPPED
|
||||
|
||||
try:
|
||||
await deferToThread(
|
||||
self._client.post_json,
|
||||
f"/v1/reservations/{reservation_id}/release",
|
||||
{},
|
||||
10.0,
|
||||
{429, 500, 502, 503, 504},
|
||||
)
|
||||
except Exception as exc: # noqa: BLE001
|
||||
log = cast(Any, await self.addLog("autoscale_release_error"))
|
||||
log.addStderr(f"release failed: {exc}\n")
|
||||
return util.WARNINGS
|
||||
|
||||
self.descriptionDone = ["autoscaler reservation released"]
|
||||
return util.SUCCESS
|
||||
20
buildbot-ext/buildbot_autoscale_ext/telemetry.py
Normal file
20
buildbot-ext/buildbot_autoscale_ext/telemetry.py
Normal file
|
|
@ -0,0 +1,20 @@
|
|||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
|
||||
|
||||
def get_logger(name: str) -> logging.Logger:
|
||||
return logging.getLogger(name)
|
||||
|
||||
|
||||
def event(logger: logging.Logger, level: int, name: str, **fields: object) -> None:
|
||||
logger.log(level, "%s %s", name, " ".join(f"{key}={value!r}" for key, value in fields.items()))
|
||||
|
||||
|
||||
def phase_message(phase: str | None, reason: str | None) -> str:
|
||||
details: list[str] = []
|
||||
if phase:
|
||||
details.append(f"phase={phase}")
|
||||
if reason:
|
||||
details.append(f"reason={reason}")
|
||||
return " ".join(details) if details else "no daemon details"
|
||||
11
buildbot-ext/buildbot_autoscale_ext/tests/__init__.py
Normal file
11
buildbot-ext/buildbot_autoscale_ext/tests/__init__.py
Normal file
|
|
@ -0,0 +1,11 @@
|
|||
"""Tests for buildbot_autoscale_ext."""
|
||||
|
||||
import asyncio
|
||||
from contextlib import suppress
|
||||
|
||||
from twisted.internet import asyncioreactor
|
||||
|
||||
TEST_LOOP = asyncio.new_event_loop()
|
||||
|
||||
with suppress(Exception):
|
||||
asyncioreactor.install(TEST_LOOP)
|
||||
212
buildbot-ext/buildbot_autoscale_ext/tests/test_client.py
Normal file
212
buildbot-ext/buildbot_autoscale_ext/tests/test_client.py
Normal file
|
|
@ -0,0 +1,212 @@
|
|||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import os
|
||||
import socketserver
|
||||
import tempfile
|
||||
import threading
|
||||
from collections.abc import Callable
|
||||
from contextlib import suppress
|
||||
from dataclasses import dataclass
|
||||
from http import HTTPStatus
|
||||
from http.server import BaseHTTPRequestHandler
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import pytest
|
||||
|
||||
from buildbot_autoscale_ext.client import (
|
||||
DaemonClient,
|
||||
DaemonError,
|
||||
RetryPolicy,
|
||||
UnixSocketHTTPConnection,
|
||||
)
|
||||
|
||||
|
||||
@dataclass
|
||||
class ServerState:
|
||||
post_count: int = 0
|
||||
get_count: int = 0
|
||||
|
||||
|
||||
class _Handler(BaseHTTPRequestHandler):
|
||||
server: _UnixHTTPServer
|
||||
|
||||
def do_GET(self) -> None: # noqa: N802
|
||||
self.server.state.get_count += 1
|
||||
status, body = self.server.on_get(self.path, self.server.state.get_count)
|
||||
self._send(status, body)
|
||||
|
||||
def do_POST(self) -> None: # noqa: N802
|
||||
self.server.state.post_count += 1
|
||||
size = int(self.headers.get("Content-Length", "0"))
|
||||
raw = self.rfile.read(size) if size else b"{}"
|
||||
payload = json.loads(raw.decode("utf-8"))
|
||||
status, body = self.server.on_post(self.path, payload, self.server.state.post_count)
|
||||
self._send(status, body)
|
||||
|
||||
def log_message(self, format: str, *args: object) -> None:
|
||||
del format, args
|
||||
|
||||
def _send(self, status: int, body: dict[str, Any]) -> None:
|
||||
encoded = json.dumps(body).encode("utf-8")
|
||||
self.send_response(status)
|
||||
self.send_header("Content-Type", "application/json")
|
||||
self.send_header("Content-Length", str(len(encoded)))
|
||||
self.end_headers()
|
||||
self.wfile.write(encoded)
|
||||
|
||||
|
||||
class _UnixHTTPServer(socketserver.UnixStreamServer):
|
||||
def __init__(
|
||||
self,
|
||||
socket_path: str,
|
||||
*,
|
||||
on_get: Callable[[str, int], tuple[int, dict[str, Any]]],
|
||||
on_post: Callable[[str, dict[str, Any], int], tuple[int, dict[str, Any]]],
|
||||
) -> None:
|
||||
self.on_get = on_get
|
||||
self.on_post = on_post
|
||||
self.state = ServerState()
|
||||
super().__init__(socket_path, _Handler)
|
||||
|
||||
|
||||
class FakeDaemon:
|
||||
def __init__(
|
||||
self,
|
||||
socket_path: str,
|
||||
*,
|
||||
on_get: Callable[[str, int], tuple[int, dict[str, Any]]],
|
||||
on_post: Callable[[str, dict[str, Any], int], tuple[int, dict[str, Any]]],
|
||||
) -> None:
|
||||
self._socket_path = socket_path
|
||||
self._server = _UnixHTTPServer(socket_path, on_get=on_get, on_post=on_post)
|
||||
self._thread = threading.Thread(target=self._server.serve_forever, daemon=True)
|
||||
|
||||
def __enter__(self) -> FakeDaemon:
|
||||
self._thread.start()
|
||||
return self
|
||||
|
||||
def __exit__(self, exc_type: object, exc: object, tb: object) -> None:
|
||||
del exc_type, exc, tb
|
||||
self._server.shutdown()
|
||||
self._server.server_close()
|
||||
with suppress(FileNotFoundError):
|
||||
os.unlink(self._socket_path)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def socket_path() -> str:
|
||||
with tempfile.TemporaryDirectory() as tmp:
|
||||
yield str(Path(tmp) / "daemon.sock")
|
||||
|
||||
|
||||
def _client(socket_path: str, attempts: int = 3) -> DaemonClient:
|
||||
return DaemonClient(
|
||||
socket_path=socket_path,
|
||||
retry_policy=RetryPolicy(max_attempts=attempts, base_seconds=0.001, max_seconds=0.01),
|
||||
)
|
||||
|
||||
|
||||
def test_post_json_success(socket_path: str) -> None:
|
||||
with FakeDaemon(
|
||||
socket_path,
|
||||
on_get=lambda _p, _a: (HTTPStatus.OK, {}),
|
||||
on_post=lambda _p, payload, _a: (HTTPStatus.OK, {"echo": payload["system"]}),
|
||||
):
|
||||
response = _client(socket_path).post_json(
|
||||
"/v1/reservations",
|
||||
{"system": "x86_64-linux"},
|
||||
timeout_seconds=1.0,
|
||||
retryable_statuses={429, 500, 503},
|
||||
)
|
||||
|
||||
assert response == {"echo": "x86_64-linux"}
|
||||
|
||||
|
||||
def test_get_json_success(socket_path: str) -> None:
|
||||
with FakeDaemon(
|
||||
socket_path,
|
||||
on_get=lambda _p, _a: (HTTPStatus.OK, {"phase": "ready"}),
|
||||
on_post=lambda _p, _payload, _a: (HTTPStatus.OK, {}),
|
||||
):
|
||||
response = _client(socket_path).get_json(
|
||||
"/v1/reservations/r1",
|
||||
timeout_seconds=1.0,
|
||||
retryable_statuses={429, 500, 503},
|
||||
)
|
||||
|
||||
assert response == {"phase": "ready"}
|
||||
|
||||
|
||||
def test_transient_503_retries_then_raises(socket_path: str) -> None:
|
||||
with (
|
||||
FakeDaemon(
|
||||
socket_path,
|
||||
on_get=lambda _p, _a: (HTTPStatus.SERVICE_UNAVAILABLE, {"error": "busy"}),
|
||||
on_post=lambda _p, _payload, _a: (HTTPStatus.OK, {}),
|
||||
) as daemon,
|
||||
pytest.raises(DaemonError) as exc,
|
||||
):
|
||||
_client(socket_path, attempts=3).get_json(
|
||||
"/v1/reservations/r1",
|
||||
timeout_seconds=1.0,
|
||||
retryable_statuses={429, 500, 502, 503, 504},
|
||||
)
|
||||
|
||||
assert exc.value.status == HTTPStatus.SERVICE_UNAVAILABLE
|
||||
assert daemon._server.state.get_count == 3
|
||||
|
||||
|
||||
def test_400_not_retried(socket_path: str) -> None:
|
||||
with (
|
||||
FakeDaemon(
|
||||
socket_path,
|
||||
on_get=lambda _p, _a: (HTTPStatus.BAD_REQUEST, {"error": "bad"}),
|
||||
on_post=lambda _p, _payload, _a: (HTTPStatus.OK, {}),
|
||||
) as daemon,
|
||||
pytest.raises(DaemonError) as exc,
|
||||
):
|
||||
_client(socket_path, attempts=5).get_json(
|
||||
"/v1/reservations/r1",
|
||||
timeout_seconds=1.0,
|
||||
retryable_statuses={429, 500, 502, 503, 504},
|
||||
)
|
||||
|
||||
assert exc.value.status == HTTPStatus.BAD_REQUEST
|
||||
assert daemon._server.state.get_count == 1
|
||||
|
||||
|
||||
def test_connection_refused_retries_then_raises(
|
||||
socket_path: str,
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
) -> None:
|
||||
def _boom(self: UnixSocketHTTPConnection) -> None:
|
||||
raise ConnectionRefusedError("refused")
|
||||
|
||||
monkeypatch.setattr(UnixSocketHTTPConnection, "connect", _boom)
|
||||
|
||||
with pytest.raises(DaemonError):
|
||||
_client(socket_path, attempts=3).get_json(
|
||||
"/v1/reservations/r1",
|
||||
timeout_seconds=1.0,
|
||||
retryable_statuses={429, 500, 502, 503, 504},
|
||||
)
|
||||
|
||||
|
||||
def test_backoff_attempts_at_least_two(socket_path: str) -> None:
|
||||
with (
|
||||
FakeDaemon(
|
||||
socket_path,
|
||||
on_get=lambda _p, _a: (HTTPStatus.SERVICE_UNAVAILABLE, {"error": "busy"}),
|
||||
on_post=lambda _p, _payload, _a: (HTTPStatus.OK, {}),
|
||||
) as daemon,
|
||||
pytest.raises(DaemonError),
|
||||
):
|
||||
_client(socket_path, attempts=2).get_json(
|
||||
"/v1/reservations/r1",
|
||||
timeout_seconds=1.0,
|
||||
retryable_statuses={429, 500, 502, 503, 504},
|
||||
)
|
||||
|
||||
assert daemon._server.state.get_count >= 2
|
||||
|
|
@ -0,0 +1,59 @@
|
|||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from dataclasses import dataclass, field
|
||||
|
||||
import pytest
|
||||
|
||||
from buildbot_autoscale_ext.configurator import AutoscaleConfigurator
|
||||
from buildbot_autoscale_ext.settings import AutoscaleSettings
|
||||
from buildbot_autoscale_ext.steps import CapacityGateStep, CapacityReleaseStep
|
||||
|
||||
|
||||
@dataclass
|
||||
class FakeFactory:
|
||||
steps: list[object] = field(default_factory=list)
|
||||
|
||||
|
||||
@dataclass
|
||||
class FakeBuilder:
|
||||
name: str
|
||||
factory: FakeFactory
|
||||
|
||||
|
||||
def test_patches_nix_builders() -> None:
|
||||
cfg = {
|
||||
"builders": [
|
||||
FakeBuilder("proj/nix-build", FakeFactory(["original"])),
|
||||
FakeBuilder("proj/eval", FakeFactory(["eval"])),
|
||||
]
|
||||
}
|
||||
|
||||
AutoscaleConfigurator(AutoscaleSettings(daemon_socket="/tmp/daemon.sock")).configure(cfg)
|
||||
|
||||
patched_steps = cfg["builders"][0].factory.steps
|
||||
assert isinstance(patched_steps[0], CapacityGateStep)
|
||||
assert patched_steps[1] == "original"
|
||||
assert isinstance(patched_steps[-1], CapacityReleaseStep)
|
||||
|
||||
untouched_steps = cfg["builders"][1].factory.steps
|
||||
assert untouched_steps == ["eval"]
|
||||
|
||||
|
||||
def test_empty_builders_no_error() -> None:
|
||||
cfg = {"builders": []}
|
||||
AutoscaleConfigurator(AutoscaleSettings(daemon_socket="/tmp/daemon.sock")).configure(cfg)
|
||||
|
||||
|
||||
def test_startup_log_contains_patched_names(caplog: pytest.LogCaptureFixture) -> None:
|
||||
caplog.set_level(logging.INFO)
|
||||
cfg = {
|
||||
"builders": [
|
||||
FakeBuilder("one/nix-build", FakeFactory()),
|
||||
FakeBuilder("two/eval", FakeFactory()),
|
||||
]
|
||||
}
|
||||
|
||||
AutoscaleConfigurator(AutoscaleSettings(daemon_socket="/tmp/daemon.sock")).configure(cfg)
|
||||
|
||||
assert "one/nix-build" in caplog.text
|
||||
337
buildbot-ext/buildbot_autoscale_ext/tests/test_steps.py
Normal file
337
buildbot-ext/buildbot_autoscale_ext/tests/test_steps.py
Normal file
|
|
@ -0,0 +1,337 @@
|
|||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import os
|
||||
import socketserver
|
||||
import tempfile
|
||||
import threading
|
||||
from collections.abc import Callable
|
||||
from contextlib import suppress
|
||||
from http import HTTPStatus
|
||||
from http.server import BaseHTTPRequestHandler
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import pytest
|
||||
from buildbot.plugins import util
|
||||
from twisted.internet import defer
|
||||
|
||||
from buildbot_autoscale_ext.steps import CapacityGateStep, CapacityReleaseStep
|
||||
|
||||
|
||||
class FakeProperties:
|
||||
def __init__(self, data: dict[str, Any]) -> None:
|
||||
self._data = data
|
||||
|
||||
def getProperty(self, name: str) -> Any:
|
||||
return self._data.get(name)
|
||||
|
||||
|
||||
class FakeBuild:
|
||||
def __init__(self) -> None:
|
||||
self.buildid = 42
|
||||
self._props: dict[str, Any] = {}
|
||||
|
||||
def getProperties(self) -> FakeProperties:
|
||||
return FakeProperties(self._props)
|
||||
|
||||
def setProperty(self, key: str, value: Any, _source: str) -> None:
|
||||
self._props[key] = value
|
||||
|
||||
def getProperty(self, key: str) -> Any:
|
||||
return self._props.get(key)
|
||||
|
||||
|
||||
class FakeLog:
|
||||
def __init__(self) -> None:
|
||||
self.stderr: list[str] = []
|
||||
|
||||
def addStderr(self, text: str) -> None:
|
||||
self.stderr.append(text)
|
||||
|
||||
|
||||
class _Handler(BaseHTTPRequestHandler):
|
||||
server: _UnixHTTPServer
|
||||
|
||||
def do_GET(self) -> None: # noqa: N802
|
||||
self.server.get_count += 1
|
||||
status, body = self.server.on_get(self.path, self.server.get_count)
|
||||
self._send(status, body)
|
||||
|
||||
def do_POST(self) -> None: # noqa: N802
|
||||
self.server.post_count += 1
|
||||
size = int(self.headers.get("Content-Length", "0"))
|
||||
raw = self.rfile.read(size) if size else b"{}"
|
||||
payload = json.loads(raw.decode("utf-8"))
|
||||
status, body = self.server.on_post(self.path, payload, self.server.post_count)
|
||||
self._send(status, body)
|
||||
|
||||
def _send(self, status: int, body: dict[str, Any]) -> None:
|
||||
data = json.dumps(body).encode("utf-8")
|
||||
self.send_response(status)
|
||||
self.send_header("Content-Type", "application/json")
|
||||
self.send_header("Content-Length", str(len(data)))
|
||||
self.end_headers()
|
||||
self.wfile.write(data)
|
||||
|
||||
def log_message(self, format: str, *args: object) -> None:
|
||||
del format, args
|
||||
|
||||
|
||||
class _UnixHTTPServer(socketserver.UnixStreamServer):
|
||||
def __init__(
|
||||
self,
|
||||
socket_path: str,
|
||||
*,
|
||||
on_get: Callable[[str, int], tuple[int, dict[str, Any]]],
|
||||
on_post: Callable[[str, dict[str, Any], int], tuple[int, dict[str, Any]]],
|
||||
) -> None:
|
||||
self.on_get = on_get
|
||||
self.on_post = on_post
|
||||
self.get_count = 0
|
||||
self.post_count = 0
|
||||
super().__init__(socket_path, _Handler)
|
||||
|
||||
|
||||
class FakeDaemon:
|
||||
def __init__(
|
||||
self,
|
||||
socket_path: str,
|
||||
*,
|
||||
on_get: Callable[[str, int], tuple[int, dict[str, Any]]],
|
||||
on_post: Callable[[str, dict[str, Any], int], tuple[int, dict[str, Any]]],
|
||||
) -> None:
|
||||
self._socket_path = socket_path
|
||||
self._server = _UnixHTTPServer(socket_path, on_get=on_get, on_post=on_post)
|
||||
self._thread = threading.Thread(target=self._server.serve_forever, daemon=True)
|
||||
|
||||
def __enter__(self) -> FakeDaemon:
|
||||
self._thread.start()
|
||||
return self
|
||||
|
||||
def __exit__(self, exc_type: object, exc: object, tb: object) -> None:
|
||||
del exc_type, exc, tb
|
||||
self._server.shutdown()
|
||||
self._server.server_close()
|
||||
with suppress(FileNotFoundError):
|
||||
os.unlink(self._socket_path)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def socket_path() -> str:
|
||||
with tempfile.TemporaryDirectory() as tmp:
|
||||
yield str(Path(tmp) / "daemon.sock")
|
||||
|
||||
|
||||
def _attach_build(step: Any, build: FakeBuild) -> None:
|
||||
object.__setattr__(build, "master", None)
|
||||
object.__setattr__(step, "build", build)
|
||||
object.__setattr__(step, "master", None)
|
||||
|
||||
async def _add_log(_name: str) -> FakeLog:
|
||||
return FakeLog()
|
||||
|
||||
object.__setattr__(step, "addLog", _add_log)
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
|
||||
def _patch_twisted_waits(monkeypatch: pytest.MonkeyPatch) -> None:
|
||||
from buildbot.config import errors as config_errors
|
||||
from buildbot.process import buildstep as buildstep_module
|
||||
|
||||
import buildbot_autoscale_ext.steps as steps_mod
|
||||
|
||||
def _defer_to_thread(func: Any, *args: object, **kwargs: object) -> defer.Deferred[object]:
|
||||
try:
|
||||
return defer.succeed(func(*args, **kwargs))
|
||||
except Exception as exc: # noqa: BLE001
|
||||
return defer.fail(exc)
|
||||
|
||||
def _defer_later(
|
||||
_clock: object,
|
||||
_seconds: float,
|
||||
callback: Any,
|
||||
*args: object,
|
||||
**kwargs: object,
|
||||
) -> defer.Deferred[object]:
|
||||
return defer.succeed(callback(*args, **kwargs))
|
||||
|
||||
monkeypatch.setattr(steps_mod, "deferToThread", _defer_to_thread)
|
||||
monkeypatch.setattr(steps_mod, "deferLater", _defer_later)
|
||||
monkeypatch.setattr(config_errors, "_errors", None, raising=False)
|
||||
monkeypatch.setattr(buildstep_module.config, "error", lambda *_a, **_k: None)
|
||||
|
||||
|
||||
def _run_step(step: Any) -> int:
|
||||
deferred = step.run()
|
||||
out: list[int] = []
|
||||
failures: list[defer.Failure] = []
|
||||
deferred.addCallbacks(lambda value: out.append(value), lambda err: failures.append(err))
|
||||
if failures:
|
||||
failures[0].raiseException()
|
||||
return out[0]
|
||||
|
||||
|
||||
def test_gate_success_pending_then_ready(socket_path: str) -> None:
|
||||
with FakeDaemon(
|
||||
socket_path,
|
||||
on_post=lambda _p, _payload, _n: (
|
||||
HTTPStatus.OK,
|
||||
{
|
||||
"reservation_id": "r1",
|
||||
"phase": "pending",
|
||||
"created_at": "now",
|
||||
"expires_at": "soon",
|
||||
},
|
||||
),
|
||||
on_get=lambda _p, n: (
|
||||
HTTPStatus.OK,
|
||||
{"reservation_id": "r1", "phase": "pending"}
|
||||
if n == 1
|
||||
else {
|
||||
"reservation_id": "r1",
|
||||
"phase": "ready",
|
||||
"slot": "slot-1",
|
||||
"instance_id": "i-123",
|
||||
"system": "x86_64-linux",
|
||||
"updated_at": "now",
|
||||
},
|
||||
),
|
||||
):
|
||||
step = CapacityGateStep(
|
||||
name="gate",
|
||||
daemon_socket=socket_path,
|
||||
system_property="system",
|
||||
default_system="x86_64-linux",
|
||||
reserve_timeout_seconds=2,
|
||||
poll_interval_seconds=0.01,
|
||||
retry_max_attempts=2,
|
||||
retry_base_seconds=0.001,
|
||||
)
|
||||
build = FakeBuild()
|
||||
_attach_build(step, build)
|
||||
|
||||
result = _run_step(step)
|
||||
|
||||
assert result == util.SUCCESS
|
||||
assert build.getProperty("autoscale_reservation_id") == "r1"
|
||||
assert build.getProperty("autoscale_slot") == "slot-1"
|
||||
assert build.getProperty("autoscale_instance_id") == "i-123"
|
||||
assert isinstance(build.getProperty("autoscale_wait_seconds"), int)
|
||||
|
||||
|
||||
def test_gate_failure_on_failed_phase(socket_path: str) -> None:
|
||||
with FakeDaemon(
|
||||
socket_path,
|
||||
on_post=lambda _p, _payload, _n: (
|
||||
HTTPStatus.OK,
|
||||
{"reservation_id": "r1", "phase": "pending"},
|
||||
),
|
||||
on_get=lambda _p, _n: (
|
||||
HTTPStatus.OK,
|
||||
{"reservation_id": "r1", "phase": "failed", "reason": "no"},
|
||||
),
|
||||
):
|
||||
step = CapacityGateStep(
|
||||
name="gate",
|
||||
daemon_socket=socket_path,
|
||||
system_property="system",
|
||||
default_system="x86_64-linux",
|
||||
reserve_timeout_seconds=2,
|
||||
poll_interval_seconds=0.01,
|
||||
retry_max_attempts=2,
|
||||
retry_base_seconds=0.001,
|
||||
)
|
||||
build = FakeBuild()
|
||||
_attach_build(step, build)
|
||||
|
||||
result = _run_step(step)
|
||||
|
||||
assert result == util.FAILURE
|
||||
|
||||
|
||||
def test_gate_failure_on_timeout(socket_path: str) -> None:
|
||||
with FakeDaemon(
|
||||
socket_path,
|
||||
on_post=lambda _p, _payload, _n: (
|
||||
HTTPStatus.OK,
|
||||
{"reservation_id": "r1", "phase": "pending"},
|
||||
),
|
||||
on_get=lambda _p, _n: (HTTPStatus.OK, {"reservation_id": "r1", "phase": "pending"}),
|
||||
):
|
||||
step = CapacityGateStep(
|
||||
name="gate",
|
||||
daemon_socket=socket_path,
|
||||
system_property="system",
|
||||
default_system="x86_64-linux",
|
||||
reserve_timeout_seconds=0,
|
||||
poll_interval_seconds=0.01,
|
||||
retry_max_attempts=2,
|
||||
retry_base_seconds=0.001,
|
||||
)
|
||||
build = FakeBuild()
|
||||
_attach_build(step, build)
|
||||
|
||||
result = _run_step(step)
|
||||
|
||||
assert result == util.FAILURE
|
||||
|
||||
|
||||
def test_release_skipped_without_reservation(socket_path: str) -> None:
|
||||
step = CapacityReleaseStep(
|
||||
name="release",
|
||||
daemon_socket=socket_path,
|
||||
retry_max_attempts=2,
|
||||
retry_base_seconds=0.001,
|
||||
)
|
||||
build = FakeBuild()
|
||||
_attach_build(step, build)
|
||||
|
||||
result = _run_step(step)
|
||||
|
||||
assert result == util.SKIPPED
|
||||
|
||||
|
||||
def test_release_warnings_on_retry_exhausted_500(socket_path: str) -> None:
|
||||
with FakeDaemon(
|
||||
socket_path,
|
||||
on_post=lambda _p, _payload, _n: (HTTPStatus.INTERNAL_SERVER_ERROR, {"error": "boom"}),
|
||||
on_get=lambda _p, _n: (HTTPStatus.OK, {}),
|
||||
):
|
||||
step = CapacityReleaseStep(
|
||||
name="release",
|
||||
daemon_socket=socket_path,
|
||||
retry_max_attempts=2,
|
||||
retry_base_seconds=0.001,
|
||||
)
|
||||
build = FakeBuild()
|
||||
build.setProperty("autoscale_reservation_id", "r1", "test")
|
||||
_attach_build(step, build)
|
||||
|
||||
result = _run_step(step)
|
||||
|
||||
assert result == util.WARNINGS
|
||||
|
||||
|
||||
def test_release_success(socket_path: str) -> None:
|
||||
with FakeDaemon(
|
||||
socket_path,
|
||||
on_post=lambda _p, _payload, _n: (
|
||||
HTTPStatus.OK,
|
||||
{"reservation_id": "r1", "phase": "released"},
|
||||
),
|
||||
on_get=lambda _p, _n: (HTTPStatus.OK, {}),
|
||||
):
|
||||
step = CapacityReleaseStep(
|
||||
name="release",
|
||||
daemon_socket=socket_path,
|
||||
retry_max_attempts=2,
|
||||
retry_base_seconds=0.001,
|
||||
)
|
||||
build = FakeBuild()
|
||||
build.setProperty("autoscale_reservation_id", "r1", "test")
|
||||
_attach_build(step, build)
|
||||
|
||||
result = _run_step(step)
|
||||
|
||||
assert result == util.SUCCESS
|
||||
Loading…
Add table
Add a link
Reference in a new issue