From b3d96689f3cbc12c4f2bd30317b8dbeefd21acef Mon Sep 17 00:00:00 2001 From: Mike Yeromko Date: Thu, 26 Feb 2026 08:31:16 +0100 Subject: [PATCH 01/28] POC: Noble and cgroupsv2 support in acceptance tests --- .gitignore | 1 + .../{bionic_test.go => jammy_test.go} | 12 +- acceptance-tests/run-local.sh | 35 +-- ci/noble-updates.yml | 3 + ci/pipeline.yml | 14 +- ci/scripts/functions-ci.sh | 14 +- ci/scripts/start-bosh.sh | 217 +++++++++++------- .../.gitkeep | 0 manifests/haproxy.yml | 14 +- 9 files changed, 182 insertions(+), 128 deletions(-) rename acceptance-tests/{bionic_test.go => jammy_test.go} (73%) create mode 100644 ci/noble-updates.yml rename ci/scripts/{stemcell-bionic => stemcell-jammy}/.gitkeep (100%) diff --git a/.gitignore b/.gitignore index 5b6f7d4b..54fa5535 100644 --- a/.gitignore +++ b/.gitignore @@ -4,6 +4,7 @@ config/settings.yml releases/*.tgz releases/**/*.tgz ci/scripts/stemcell/*.tgz +ci/scripts/stemcell-jammy/*.tgz ci/scripts/stemcell-bionic/*.tgz dev_releases blobs/* diff --git a/acceptance-tests/bionic_test.go b/acceptance-tests/jammy_test.go similarity index 73% rename from acceptance-tests/bionic_test.go rename to acceptance-tests/jammy_test.go index 1823cd98..44ffc175 100644 --- a/acceptance-tests/bionic_test.go +++ b/acceptance-tests/jammy_test.go @@ -7,14 +7,14 @@ import ( . "github.com/onsi/ginkgo/v2" ) -var _ = Describe("Bionic", func() { - It("Correctly proxies HTTP requests when using the Bionic stemcell", func() { +var _ = Describe("Jammy", func() { + It("Correctly proxies HTTP requests when using the Jammy stemcell", func() { - opsfileBionic := `--- -# Configure Bionic stemcell + opsfileJammy := `--- +# Configure Jammy stemcell - type: replace path: /stemcells/alias=default/os - value: ubuntu-bionic + value: ubuntu-jammy ` haproxyBackendPort := 12000 @@ -22,7 +22,7 @@ var _ = Describe("Bionic", func() { haproxyBackendPort: haproxyBackendPort, haproxyBackendServers: []string{"127.0.0.1"}, deploymentName: deploymentNameForTestNode(), - }, []string{opsfileBionic}, map[string]interface{}{}, true) + }, []string{opsfileJammy}, map[string]interface{}{}, true) closeLocalServer, localPort := startDefaultTestServer() defer closeLocalServer() diff --git a/acceptance-tests/run-local.sh b/acceptance-tests/run-local.sh index 391a8acd..07a9c8f1 100755 --- a/acceptance-tests/run-local.sh +++ b/acceptance-tests/run-local.sh @@ -43,15 +43,16 @@ docker_mac_check_cgroupsv1() { } check_required_files() { - PIDS="" +# PIDS="" REQUIRED_FILE_PATTERNS=( - ci/scripts/stemcell/bosh-stemcell-*-ubuntu-jammy-*.tgz!https://bosh.io/d/stemcells/bosh-warden-boshlite-ubuntu-jammy-go_agent - ci/scripts/stemcell-bionic/bosh-stemcell-*-ubuntu-bionic-*.tgz!https://bosh.io/d/stemcells/bosh-warden-boshlite-ubuntu-bionic-go_agent + ci/scripts/stemcell/bosh-stemcell-*-ubuntu-noble.tgz!https://storage.googleapis.com/bosh-core-stemcells/1.238/bosh-stemcell-1.238-warden-boshlite-ubuntu-noble.tgz!no + ci/scripts/stemcell-jammy/bosh-stemcell-*-ubuntu-jammy-*.tgz!https://bosh.io/d/stemcells/bosh-warden-boshlite-ubuntu-jammy-go_agent!yes ) for entry in "${REQUIRED_FILE_PATTERNS[@]}"; do pattern=$(cut -f1 -d! <<<"$entry") url=$(cut -f2 -d! <<<"$entry") + resolve=$(cut -f2 -d! <<<"$entry") folder=$(realpath "$(dirname "$REPO_DIR/$pattern")") filepattern=$(basename "$pattern") pattern=$folder/$filepattern @@ -62,28 +63,32 @@ check_required_files() { continue fi - ( - echo "$filepattern not found, downloading latest." - cd "$folder" && \ - resolved=$(curl -s --write-out '\n%{redirect_url}' "$url" | tail -n1) && \ - curl -s --remote-name --remote-header-name --location "$resolved" && \ - echo "Downloaded '$url' successfully." && \ + #( + echo "$filepattern not found, downloading." + cd "$folder" + resolved="$url" + if [ "$resolve" == "yes" ]; then + resolved=$(curl -s --write-out '\n%{redirect_url}' "$url" | tail -n1 | tr -d '\n') + fi + echo "Resolved URL: $resolved" + curl -s --remote-name --remote-header-name --location "$resolved" + echo "Downloaded '$url' successfully." ls -1lh "$folder/"$filepattern - )& + #)& - PIDS="$PIDS $!" +# PIDS="$PIDS $!" done # shellcheck disable=SC2086 # expansion is desired, as $PIDS is a list of PIDs. Wait on all of those PIDs. - wait $PIDS +# wait $PIDS } check_required_files -if [ "$(uname)" == "Darwin" ]; then - docker_mac_check_cgroupsv1 -fi +#if [ "$(uname)" == "Darwin" ]; then +# docker_mac_check_cgroupsv1 +#fi build_image "${REPO_DIR}/ci" prepare_docker_scratch diff --git a/ci/noble-updates.yml b/ci/noble-updates.yml new file mode 100644 index 00000000..e75c21da --- /dev/null +++ b/ci/noble-updates.yml @@ -0,0 +1,3 @@ +- path: /cloud_provider/properties/docker_cpi/start_containers_with_systemd? + type: replace + value: true diff --git a/ci/pipeline.yml b/ci/pipeline.yml index 59a5cd68..6363a3aa 100644 --- a/ci/pipeline.yml +++ b/ci/pipeline.yml @@ -121,7 +121,7 @@ jobs: - in_parallel: - { get: git, trigger: true, passed: [unit-tests] } - { get: stemcell } - - { get: stemcell-bionic } + - { get: stemcell-jammy } - get: haproxy-boshrelease-testflight - task: acceptance-tests privileged: true @@ -131,7 +131,7 @@ jobs: inputs: - { name: git } - { name: stemcell } - - { name: stemcell-bionic } + - { name: stemcell-jammy } run: path: ./git/ci/scripts/acceptance-tests args: [] @@ -152,7 +152,7 @@ jobs: - do: - { get: git-pull-requests, trigger: true, version: every } - { get: stemcell } - - { get: stemcell-bionic } + - { get: stemcell-jammy } - get: haproxy-boshrelease-testflight - put: git-pull-requests params: @@ -169,7 +169,7 @@ jobs: inputs: - { name: git-pull-requests } - { name: stemcell } - - { name: stemcell-bionic } + - { name: stemcell-jammy } run: path: ./git-pull-requests/ci/scripts/acceptance-tests args: [] @@ -403,15 +403,15 @@ resources: - "dependabot" - "CFN-CI" - - name: stemcell-bionic + - name: stemcell-jammy type: bosh-io-stemcell source: - name: bosh-warden-boshlite-ubuntu-bionic-go_agent + name: bosh-warden-boshlite-ubuntu-jammy-go_agent - name: stemcell type: bosh-io-stemcell source: - name: bosh-warden-boshlite-ubuntu-jammy-go_agent + name: bosh-warden-boshlite-ubuntu-noble-go_agent - name: version type: semver diff --git a/ci/scripts/functions-ci.sh b/ci/scripts/functions-ci.sh index d3e64d50..fa8e5465 100755 --- a/ci/scripts/functions-ci.sh +++ b/ci/scripts/functions-ci.sh @@ -62,18 +62,18 @@ function bosh_release() { } function bosh_assets() { - stemcell_jammy_path="$START_DIR/stemcell/*.tgz" - stemcell_bionic_path="$START_DIR/stemcell-bionic/*.tgz" + stemcell_noble_path="$START_DIR/stemcell/*.tgz" + stemcell_jammy_path="$START_DIR/stemcell-jammy/*.tgz" + + echo "----- Uploading Noble stemcell" + bosh -n upload-stemcell $stemcell_noble_path echo "----- Uploading Jammy stemcell" bosh -n upload-stemcell $stemcell_jammy_path - echo "----- Uploading Bionic stemcell" - bosh -n upload-stemcell $stemcell_bionic_path - echo "----- Uploading os-conf (used for tests only)" - bosh -n upload-release --sha1 386293038ae3d00813eaa475b4acf63f8da226ef \ - https://bosh.io/d/github.com/cloudfoundry/os-conf-release?v=22.1.2 + bosh -n upload-release --sha1 sha256:efcf30754ce4c5f308aedab3329d8d679f5967b2a4c3c453204c7cb10c7c5ed9 \ + https://bosh.io/d/github.com/cloudfoundry/os-conf-release?v=23.0.0 export BOSH_PATH=$(command -v bosh) export BASE_MANIFEST_PATH="$PWD/manifests/haproxy.yml" diff --git a/ci/scripts/start-bosh.sh b/ci/scripts/start-bosh.sh index 3bda28f6..c3ae7552 100755 --- a/ci/scripts/start-bosh.sh +++ b/ci/scripts/start-bosh.sh @@ -1,49 +1,45 @@ #!/usr/bin/env bash -set -eo pipefail +set -eu -o pipefail + +if [[ -n "${DEBUG:-}" ]]; then + set -x + export BOSH_LOG_LEVEL=debug +fi + +export BOSH_DIRECTOR_IP="10.245.0.11" +export BOSH_ENVIRONMENT="docker-director" + +export DNS_IP="8.8.8.8" function generate_certs() { local certs_dir certs_dir="${1}" - pushd "${certs_dir}" - - jq -ner --arg "ip" "${OUTER_CONTAINER_IP}" '{ - "variables": [ - { - "name": "docker_ca", - "type": "certificate", - "options": { - "is_ca": true, - "common_name": "ca" - } - }, - { - "name": "docker_tls", - "type": "certificate", - "options": { - "extended_key_usage": [ - "server_auth" - ], - "common_name": $ip, - "alternative_names": [ $ip ], - "ca": "docker_ca" - } - }, - { - "name": "client_docker_tls", - "type": "certificate", - "options": { - "extended_key_usage": [ - "client_auth" - ], - "common_name": $ip, - "alternative_names": [ $ip ], - "ca": "docker_ca" - } - } - ] - }' > ./bosh-vars.yml + pushd "${certs_dir}" > /dev/null + cat < ./bosh-vars.yml +--- +variables: +- name: docker_ca + type: certificate + options: + is_ca: true + common_name: ca +- name: docker_tls + type: certificate + options: + extended_key_usage: [server_auth] + common_name: $OUTER_CONTAINER_IP + alternative_names: [$OUTER_CONTAINER_IP] + ca: docker_ca +- name: client_docker_tls + type: certificate + options: + extended_key_usage: [client_auth] + common_name: $OUTER_CONTAINER_IP + alternative_names: [$OUTER_CONTAINER_IP] + ca: docker_ca +EOF bosh int ./bosh-vars.yml --vars-store=./certs.yml bosh int ./certs.yml --path=/docker_ca/ca > ./ca.pem @@ -51,12 +47,13 @@ function generate_certs() { bosh int ./certs.yml --path=/docker_tls/private_key > ./server-key.pem bosh int ./certs.yml --path=/client_docker_tls/certificate > ./cert.pem bosh int ./certs.yml --path=/client_docker_tls/private_key > ./key.pem - # generate certs in json format - # - ruby -e 'puts File.read("./ca.pem").split("\n").join("\\n")' > "$certs_dir/ca_json_safe.pem" - ruby -e 'puts File.read("./cert.pem").split("\n").join("\\n")' > "$certs_dir/client_certificate_json_safe.pem" - ruby -e 'puts File.read("./key.pem").split("\n").join("\\n")' > "$certs_dir/client_private_key_json_safe.pem" - popd + + # generate certs in json format + ruby -e 'puts File.read("./ca.pem").split("\n").join("\\n")' > "${certs_dir}/ca_json_safe.pem" + ruby -e 'puts File.read("./cert.pem").split("\n").join("\\n")' > "${certs_dir}/client_certificate_json_safe.pem" + ruby -e 'puts File.read("./key.pem").split("\n").join("\\n")' > "${certs_dir}/client_private_key_json_safe.pem" + + popd > /dev/null } function sanitize_cgroups() { @@ -64,15 +61,28 @@ function sanitize_cgroups() { mountpoint -q /sys/fs/cgroup || \ mount -t tmpfs -o uid=0,gid=0,mode=0755 cgroup /sys/fs/cgroup + if [ -f /sys/fs/cgroup/cgroup.controllers ]; then + # cgroups v2: enable nesting (based on moby/moby hack/dind) + mkdir -p /sys/fs/cgroup/init + # Loop to handle races from concurrent process creation (e.g. docker exec) + while ! { + xargs -rn1 < /sys/fs/cgroup/cgroup.procs > /sys/fs/cgroup/init/cgroup.procs 2>/dev/null || : + sed -e 's/ / +/g' -e 's/^/+/' < /sys/fs/cgroup/cgroup.controllers \ + > /sys/fs/cgroup/cgroup.subtree_control + }; do true; done + return + fi + mount -o remount,rw /sys/fs/cgroup - sed -e 1d /proc/cgroups | while read sys hierarchy num enabled; do + # shellcheck disable=SC2034 + sed -e 1d /proc/cgroups | while read -r sys hierarchy num enabled; do if [ "$enabled" != "1" ]; then # subsystem disabled; skip continue fi - grouping="$(cat /proc/self/cgroup | cut -d: -f2 | grep "\\<$sys\\>")" + grouping="$(cut -d: -f2 < /proc/self/cgroup | grep "\\<$sys\\>")" if [ -z "$grouping" ]; then # subsystem not mounted anywhere; mount it on its own grouping="$sys" @@ -99,20 +109,34 @@ function sanitize_cgroups() { done } -source "ci/scripts/functions-ci.sh" +function stop_docker() { + service docker stop +} function start_docker() { - generate_certs "$1" - local mtu + local certs_dir + certs_dir="${1}" + + # docker will fail starting with the new iptables. it throws: + # iptables v1.8.7 (nf_tables): Could not fetch rule set generation id: .... + update-alternatives --set iptables /usr/sbin/iptables-legacy + + generate_certs "${certs_dir}" + mkdir -p /var/log mkdir -p /var/run sanitize_cgroups - # ensure systemd cgroup is present - mkdir -p /sys/fs/cgroup/systemd - if ! mountpoint -q /sys/fs/cgroup/systemd ; then - mount -t cgroup -o none,name=systemd cgroup /sys/fs/cgroup/systemd + # systemd inside nested Docker containers requires shared mount propagation + mount --make-rshared / + + # ensure systemd cgroup is present (cgroups v1 only) + if [ ! -f /sys/fs/cgroup/cgroup.controllers ]; then + mkdir -p /sys/fs/cgroup/systemd + if ! mountpoint -q /sys/fs/cgroup/systemd ; then + mount -t cgroup -o none,name=systemd cgroup /sys/fs/cgroup/systemd + fi fi # check for /proc/sys being mounted readonly, as systemd does @@ -120,12 +144,13 @@ function start_docker() { mount -o remount,rw /proc/sys fi - mtu=$(cat /sys/class/net/$(ip route get 8.8.8.8|awk '{ print $5 }')/mtu) + local mtu + mtu=$(cat "/sys/class/net/$(ip route get ${DNS_IP} | awk '{ print $5 }')/mtu") [[ ! -d /etc/docker ]] && mkdir /etc/docker cat < /etc/docker/daemon.json { - "hosts": ["${DOCKER_HOST}","unix:///var/run/docker.sock"], + "hosts": ["${DOCKER_HOST}"], "tls": true, "tlscert": "${certs_dir}/server-cert.pem", "tlskey": "${certs_dir}/server-key.pem", @@ -138,13 +163,10 @@ EOF service docker start - export DOCKER_TLS_VERIFY=1 - export DOCKER_CERT_PATH=$1 - rc=1 - for i in $(seq 1 10); do - echo waiting for docker to come up... - sleep 10 + for i in $(seq 1 100); do + echo "waiting for docker to come up... (${i})" + sleep 1 set +e docker info rc=$? @@ -165,66 +187,89 @@ EOF if [ -z "${KEEP_RUNNING}" ] ; then trap stop_docker ERR fi - echo "$certs_dir" + + echo "${certs_dir}" } function main() { - export OUTER_CONTAINER_IP=$(ruby -rsocket -e 'puts Socket.ip_address_list + # ".first" - original code could return multiple IPs (e.g., container IP + docker0 bridge IP) + # which breaks the docker_tls JSON variable formatting + OUTER_CONTAINER_IP=$(ruby -rsocket -e 'puts Socket.ip_address_list .reject { |addr| !addr.ip? || addr.ipv4_loopback? || addr.ipv6? } .map { |addr| addr.ip_address }.first') - - export DOCKER_HOST="tcp://${OUTER_CONTAINER_IP}:4243" + export OUTER_CONTAINER_IP local certs_dir certs_dir=$(mktemp -d) - start_docker "${certs_dir}" local local_bosh_dir local_bosh_dir="/tmp/local-bosh/director" + mkdir -p ${local_bosh_dir} + + cat < "${local_bosh_dir}/docker-env" +export DOCKER_HOST="tcp://${OUTER_CONTAINER_IP}:4243" +export DOCKER_TLS_VERIFY=1 +export DOCKER_CERT_PATH="${certs_dir}" +EOF + echo "Source '${local_bosh_dir}/docker-env' to run docker" >&2 + source "${local_bosh_dir}/docker-env" - if ! docker network ls | grep director_network; then - docker network create -d bridge --subnet=10.245.0.0/16 director_network + start_docker "${certs_dir}" + + local docker_network_name="director_network" + local docker_network_cidr="10.245.0.0/16" + if docker network ls | grep -q "${docker_network_name}"; then + echo "A docker network named '${docker_network_name}' already exists, skipping creation" >&2 + else + docker network create -d bridge --subnet=${docker_network_cidr} "${docker_network_name}" fi - compilation_ops="$PWD/ci/compilation.yml" pushd "${BOSH_DEPLOYMENT_PATH:-/usr/local/bosh-deployment}" > /dev/null - export BOSH_DIRECTOR_IP="10.245.0.3" - export BOSH_ENVIRONMENT="docker-director" - - mkdir -p ${local_bosh_dir} + cat < "${local_bosh_dir}/docker_tls.json" +{ + "ca": "$(cat "${certs_dir}/ca_json_safe.pem")", + "certificate": "$(cat "${certs_dir}/client_certificate_json_safe.pem")", + "private_key": "$(cat "${certs_dir}/client_private_key_json_safe.pem")" +} +EOF - command bosh int bosh.yml \ + bosh int bosh.yml \ -o docker/cpi.yml \ -o jumpbox-user.yml \ + -o /usr/local/local-releases.yml \ + -o "$PWD/ci/noble-updates.yml" -v director_name=docker \ - -v internal_cidr=10.245.0.0/16 \ + -v internal_cidr=${docker_network_cidr} \ -v internal_gw=10.245.0.1 \ -v internal_ip="${BOSH_DIRECTOR_IP}" \ -v docker_host="${DOCKER_HOST}" \ - -v network=director_network \ - -v docker_tls="{\"ca\": \"$(cat "${certs_dir}"/ca_json_safe.pem)\",\"certificate\": \"$(cat "${certs_dir}"/client_certificate_json_safe.pem)\",\"private_key\": \"$(cat "${certs_dir}"/client_private_key_json_safe.pem)\"}" \ - ${@} > "${local_bosh_dir}/bosh-director.yml" + -v network="${docker_network_name}" \ + -v docker_tls="$(cat "${local_bosh_dir}/docker_tls.json")" \ + "${@}" > "${local_bosh_dir}/bosh-director.yml" - command bosh create-env "${local_bosh_dir}/bosh-director.yml" \ - --vars-store="${local_bosh_dir}/creds.yml" \ - --state="${local_bosh_dir}/state.json" + bosh create-env "${local_bosh_dir}/bosh-director.yml" \ + --vars-store="${local_bosh_dir}/creds.yml" \ + --state="${local_bosh_dir}/state.json" bosh int "${local_bosh_dir}/creds.yml" --path /director_ssl/ca > "${local_bosh_dir}/ca.crt" + bosh_client_secret="$(bosh int "${local_bosh_dir}/creds.yml" --path /admin_password)" + bosh -e "${BOSH_DIRECTOR_IP}" --ca-cert "${local_bosh_dir}/ca.crt" alias-env "${BOSH_ENVIRONMENT}" cat < "${local_bosh_dir}/env" + export BOSH_DIRECTOR_IP="${BOSH_DIRECTOR_IP}" export BOSH_ENVIRONMENT="${BOSH_ENVIRONMENT}" export BOSH_CLIENT=admin - export BOSH_CLIENT_SECRET=$(bosh int "${local_bosh_dir}/creds.yml" --path /admin_password) + export BOSH_CLIENT_SECRET=${bosh_client_secret} export BOSH_CA_CERT="${local_bosh_dir}/ca.crt" - EOF + echo "Source '${local_bosh_dir}/env' to run bosh" >&2 source "${local_bosh_dir}/env" - bosh -n update-cloud-config docker/cloud-config.yml -v network=director_network -o "${compilation_ops}" + bosh -n update-cloud-config docker/cloud-config.yml -v network="${docker_network_name}" popd > /dev/null } echo "----- Starting BOSH" -main $@ +main "${@}" diff --git a/ci/scripts/stemcell-bionic/.gitkeep b/ci/scripts/stemcell-jammy/.gitkeep similarity index 100% rename from ci/scripts/stemcell-bionic/.gitkeep rename to ci/scripts/stemcell-jammy/.gitkeep diff --git a/manifests/haproxy.yml b/manifests/haproxy.yml index 711ca421..7ff56c4b 100644 --- a/manifests/haproxy.yml +++ b/manifests/haproxy.yml @@ -31,15 +31,15 @@ update: stemcells: - alias: default - os: ubuntu-jammy + os: ubuntu-noble version: latest releases: - name: bpm - version: 1.2.14 - url: https://bosh.io/d/github.com/cloudfoundry/bpm-release?v=1.2.14 - sha1: 1e357a533654e2067e15231dd8ac5bad2e697dff + version: 1.4.26 + url: https://bosh.io/d/github.com/cloudfoundry/bpm-release?v=1.4.26 + sha1: sha256:40af85114d2a8a67812bf65212076581ea42cefcf67ee6b8d78d778ed1ca2b85 - name: haproxy - version: 16.1.0+3.2.9 - url: https://github.com/cloudfoundry/haproxy-boshrelease/releases/download/v16.1.0+3.2.9/haproxy-16.1.0+3.2.9.tgz - sha1: 91d57dcd744fc5cfef0494a8a74072f56e5ee892 + version: 16.4.0+3.2.13 + url: https://github.com/cloudfoundry/haproxy-boshrelease/releases/download/v16.4.0+3.2.13/haproxy-16.4.0+3.2.13.tgz + sha1: sha256:a6544aaab0de421ff7342c7511198c428bfb49e136b9279f2345585b53f8979b From 9cd23471153394c24f37b580bcbfce2d94255c9b Mon Sep 17 00:00:00 2001 From: Mike Yeromko Date: Fri, 27 Feb 2026 09:40:22 +0100 Subject: [PATCH 02/28] POC: Noble and cgroupsv2, fixes --- acceptance-tests/run-local.sh | 7 ++++--- ci/Dockerfile | 2 ++ ci/noble-updates.yml | 32 ++++++++++++++++++++++++++++++-- ci/scripts/start-bosh.sh | 14 ++++++++++++-- 4 files changed, 48 insertions(+), 7 deletions(-) diff --git a/acceptance-tests/run-local.sh b/acceptance-tests/run-local.sh index 07a9c8f1..8daa4b56 100755 --- a/acceptance-tests/run-local.sh +++ b/acceptance-tests/run-local.sh @@ -3,6 +3,7 @@ set -eu REPO_DIR="$(cd "$(dirname "$0")/.." && pwd)" source "${REPO_DIR}/ci/scripts/functions-ci.sh" +FOCUS="" KEEP_RUNNING="" usage() { @@ -52,7 +53,7 @@ check_required_files() { for entry in "${REQUIRED_FILE_PATTERNS[@]}"; do pattern=$(cut -f1 -d! <<<"$entry") url=$(cut -f2 -d! <<<"$entry") - resolve=$(cut -f2 -d! <<<"$entry") + to_resolve=$(cut -f3 -d! <<<"$entry") folder=$(realpath "$(dirname "$REPO_DIR/$pattern")") filepattern=$(basename "$pattern") pattern=$folder/$filepattern @@ -67,7 +68,7 @@ check_required_files() { echo "$filepattern not found, downloading." cd "$folder" resolved="$url" - if [ "$resolve" == "yes" ]; then + if [ "$to_resolve" == "yes" ]; then resolved=$(curl -s --write-out '\n%{redirect_url}' "$url" | tail -n1 | tr -d '\n') fi echo "Resolved URL: $resolved" @@ -98,7 +99,7 @@ if [ -n "$KEEP_RUNNING" ] ; then echo echo "*** KEEP_RUNNING enabled. Please clean up docker scratch after removing containers: ${DOCKER_SCRATCH}" echo - docker run --privileged -v "$REPO_DIR":/repo -v "${DOCKER_SCRATCH}":/scratch/docker -e REPO_ROOT=/repo -e FOCUS="$FOCUS" -e KEEP_RUNNING="${KEEP_RUNNING}" haproxy-boshrelease-testflight bash -c "cd /repo/ci/scripts && ./acceptance-tests ; sleep infinity" + docker run --privileged -v "$REPO_DIR":/repo -v "${DOCKER_SCRATCH}":/scratch/docker -e REPO_ROOT=/repo -e FOCUS="${FOCUS}" -e KEEP_RUNNING="${KEEP_RUNNING}" haproxy-boshrelease-testflight bash -c "cd /repo/ci/scripts && ./acceptance-tests ; sleep infinity" else docker run --rm --privileged -v "$REPO_DIR":/repo -v "${DOCKER_SCRATCH}":/scratch/docker -e REPO_ROOT=/repo -e KEEP_RUNNING="" haproxy-boshrelease-testflight bash -c "cd /repo/ci/scripts && ./acceptance-tests" echo "Cleaning up docker scratch: ${DOCKER_SCRATCH}" diff --git a/ci/Dockerfile b/ci/Dockerfile index 56c7550b..6f97d4c5 100644 --- a/ci/Dockerfile +++ b/ci/Dockerfile @@ -9,6 +9,8 @@ RUN apt-get update && \ # Set bosh env at login RUN echo "source /tmp/local-bosh/director/env" >> /root/.bashrc +COPY noble-updates.yml /usr/local/bosh-deployment/ci/noble-updates.yml + # Install Python libraries needed for scripts COPY scripts/requirements.txt /requirements.txt RUN /usr/bin/python3 -m pip install -r /requirements.txt diff --git a/ci/noble-updates.yml b/ci/noble-updates.yml index e75c21da..21e478ca 100644 --- a/ci/noble-updates.yml +++ b/ci/noble-updates.yml @@ -1,3 +1,31 @@ -- path: /cloud_provider/properties/docker_cpi/start_containers_with_systemd? - type: replace +- type: replace + path: /cloud_provider/properties/docker_cpi/start_containers_with_systemd? value: true +- type: replace + path: /releases/name=bosh + value: + name: bosh + version: 282.1.2 + url: https://bosh.io/d/github.com/cloudfoundry/bosh?v=282.1.2 + sha1: sha256:e9c95823932c81b4ce4442b3fae67223c04492a8dfe54489d6b5ec7d4fac2183 +- type: replace + path: /releases/name=bpm + value: + name: bpm + version: 1.4.26 + url: https://bosh.io/d/github.com/cloudfoundry/bpm-release?v=1.4.26 + sha1: sha256:40af85114d2a8a67812bf65212076581ea42cefcf67ee6b8d78d778ed1ca2b85 +- type: replace + path: /releases/name=bosh-docker-cpi + value: + name: bosh-docker-cpi + version: 0.2.4 + url: https://bosh.io/d/github.com/cloudfoundry/bosh-docker-cpi-release?v=0.2.4 + sha1: sha256:a485beee8d3f1d6a434800a4ac2dea5644c826b143988f04102cad894e81565d +- type: replace + path: /releases/name=os-conf + value: + name: os-conf + version: 23.0.0 + url: https://bosh.io/d/github.com/cloudfoundry/os-conf-release?v=23.0.0 + sha1: sha256:efcf30754ce4c5f308aedab3329d8d679f5967b2a4c3c453204c7cb10c7c5ed9 \ No newline at end of file diff --git a/ci/scripts/start-bosh.sh b/ci/scripts/start-bosh.sh index c3ae7552..82e8ab3e 100755 --- a/ci/scripts/start-bosh.sh +++ b/ci/scripts/start-bosh.sh @@ -7,7 +7,7 @@ if [[ -n "${DEBUG:-}" ]]; then export BOSH_LOG_LEVEL=debug fi -export BOSH_DIRECTOR_IP="10.245.0.11" +export BOSH_DIRECTOR_IP="10.245.0.3" export BOSH_ENVIRONMENT="docker-director" export DNS_IP="8.8.8.8" @@ -156,6 +156,7 @@ function start_docker() { "tlskey": "${certs_dir}/server-key.pem", "tlscacert": "${certs_dir}/ca.pem", "mtu": ${mtu}, + "dns": ["8.8.8.8", "8.8.4.4"], "data-root": "/scratch/docker", "tlsverify": true } @@ -168,6 +169,7 @@ EOF echo "waiting for docker to come up... (${i})" sleep 1 set +e + echo "Docker started, checking if it's responsive..." docker info rc=$? set -e @@ -222,9 +224,12 @@ EOF echo "A docker network named '${docker_network_name}' already exists, skipping creation" >&2 else docker network create -d bridge --subnet=${docker_network_cidr} "${docker_network_name}" + echo "Created docker network '${docker_network_name}' with subnet '${docker_network_cidr}'" >&2 fi pushd "${BOSH_DEPLOYMENT_PATH:-/usr/local/bosh-deployment}" > /dev/null + echo "Current directory: $(pwd)" >&2 + cat < "${local_bosh_dir}/docker_tls.json" { "ca": "$(cat "${certs_dir}/ca_json_safe.pem")", @@ -233,11 +238,12 @@ EOF } EOF + echo "Interpolating BOSH deployment manifest with Docker CPI and TLS configuration..." >&2 bosh int bosh.yml \ -o docker/cpi.yml \ -o jumpbox-user.yml \ -o /usr/local/local-releases.yml \ - -o "$PWD/ci/noble-updates.yml" + -o "$PWD/ci/noble-updates.yml" \ -v director_name=docker \ -v internal_cidr=${docker_network_cidr} \ -v internal_gw=10.245.0.1 \ @@ -247,13 +253,16 @@ EOF -v docker_tls="$(cat "${local_bosh_dir}/docker_tls.json")" \ "${@}" > "${local_bosh_dir}/bosh-director.yml" + echo "Creating BOSH director environment..." >&2 bosh create-env "${local_bosh_dir}/bosh-director.yml" \ --vars-store="${local_bosh_dir}/creds.yml" \ --state="${local_bosh_dir}/state.json" + echo "Extracting BOSH director credentials and CA certificate..." >&2 bosh int "${local_bosh_dir}/creds.yml" --path /director_ssl/ca > "${local_bosh_dir}/ca.crt" bosh_client_secret="$(bosh int "${local_bosh_dir}/creds.yml" --path /admin_password)" + echo "Setting up BOSH CLI environment..." >&2 bosh -e "${BOSH_DIRECTOR_IP}" --ca-cert "${local_bosh_dir}/ca.crt" alias-env "${BOSH_ENVIRONMENT}" cat < "${local_bosh_dir}/env" @@ -266,6 +275,7 @@ EOF echo "Source '${local_bosh_dir}/env' to run bosh" >&2 source "${local_bosh_dir}/env" + echo "Updating BOSH cloud config with Docker network..." >&2 bosh -n update-cloud-config docker/cloud-config.yml -v network="${docker_network_name}" popd > /dev/null From 85d1fedf40d70b2c24d4472d2e0b5f398c743200 Mon Sep 17 00:00:00 2001 From: Mike Yeromko Date: Mon, 2 Mar 2026 10:06:32 +0100 Subject: [PATCH 03/28] POC: Noble and cgroupsv2, the latest Docker CPI image --- ci/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/Dockerfile b/ci/Dockerfile index 6f97d4c5..27bda59c 100644 --- a/ci/Dockerfile +++ b/ci/Dockerfile @@ -1,4 +1,4 @@ -FROM bosh/docker-cpi:main +FROM ghcr.io/cloudfoundry/bosh/docker-cpi:latest # Install all necessary tools for haproxy testflight and dependency autobump ENV DEBIAN_FRONTEND=noninteractive From 0305f8f807621452dcf34bb3a4b5e2469ca020f7 Mon Sep 17 00:00:00 2001 From: Mike Yeromko Date: Mon, 2 Mar 2026 10:28:04 +0100 Subject: [PATCH 04/28] POC: Noble and cgroupsv2, pip fixes --- ci/Dockerfile | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/ci/Dockerfile b/ci/Dockerfile index 27bda59c..9c7ba8d2 100644 --- a/ci/Dockerfile +++ b/ci/Dockerfile @@ -3,7 +3,7 @@ FROM ghcr.io/cloudfoundry/bosh/docker-cpi:latest # Install all necessary tools for haproxy testflight and dependency autobump ENV DEBIAN_FRONTEND=noninteractive RUN apt-get update && \ - apt-get install -y wget jq git vim nano python3-pip && \ + apt-get install -y wget jq git vim nano python3-pip python3-venv && \ apt-get clean # Set bosh env at login @@ -12,8 +12,10 @@ RUN echo "source /tmp/local-bosh/director/env" >> /root/.bashrc COPY noble-updates.yml /usr/local/bosh-deployment/ci/noble-updates.yml # Install Python libraries needed for scripts +RUN python3 -m venv /opt/venv +ENV PATH="/opt/venv/bin:${PATH}" COPY scripts/requirements.txt /requirements.txt -RUN /usr/bin/python3 -m pip install -r /requirements.txt +RUN pip install -r /requirements.txt # Install go dependencies ENV GOBIN=/usr/local/bin From 6d3f46a4e0e9e80dcbac5dd586ac2277f6f09b12 Mon Sep 17 00:00:00 2001 From: Mike Yeromko Date: Mon, 2 Mar 2026 11:30:21 +0100 Subject: [PATCH 05/28] POC: Noble and cgroupsv2, Bosh fixes --- acceptance-tests/run-local.sh | 28 +++++----------------------- ci/scripts/start-bosh.sh | 29 ++++++++++++++++------------- 2 files changed, 21 insertions(+), 36 deletions(-) diff --git a/acceptance-tests/run-local.sh b/acceptance-tests/run-local.sh index 8daa4b56..9ca9c5d2 100755 --- a/acceptance-tests/run-local.sh +++ b/acceptance-tests/run-local.sh @@ -29,22 +29,8 @@ while getopts ":F:k" o; do done shift $((OPTIND-1)) -docker_mac_check_cgroupsv1() { - # Force cgroups v1 on Docker for Mac - # inspired by https://github.com/docker/for-mac/issues/6073#issuecomment-1018793677 - - SETTINGS=~/Library/Group\ Containers/group.com.docker/settings.json - - cgroupsV1Enabled=$(jq '.deprecatedCgroupv1' "$SETTINGS") - if [ "$cgroupsV1Enabled" != "true" ]; then - echo "deprecatedCgroupv1 should be enabled in $SETTINGS. Otherwise the acceptance tests will not run on Docker for Mac." - echo "Check in the README.md for a convenient script to set deprecatedCgroupv1 and restart Docker." - exit 1 - fi -} - check_required_files() { -# PIDS="" + PIDS="" REQUIRED_FILE_PATTERNS=( ci/scripts/stemcell/bosh-stemcell-*-ubuntu-noble.tgz!https://storage.googleapis.com/bosh-core-stemcells/1.238/bosh-stemcell-1.238-warden-boshlite-ubuntu-noble.tgz!no ci/scripts/stemcell-jammy/bosh-stemcell-*-ubuntu-jammy-*.tgz!https://bosh.io/d/stemcells/bosh-warden-boshlite-ubuntu-jammy-go_agent!yes @@ -64,7 +50,7 @@ check_required_files() { continue fi - #( + ( echo "$filepattern not found, downloading." cd "$folder" resolved="$url" @@ -75,22 +61,18 @@ check_required_files() { curl -s --remote-name --remote-header-name --location "$resolved" echo "Downloaded '$url' successfully." ls -1lh "$folder/"$filepattern - #)& + )& -# PIDS="$PIDS $!" + PIDS="$PIDS $!" done # shellcheck disable=SC2086 # expansion is desired, as $PIDS is a list of PIDs. Wait on all of those PIDs. -# wait $PIDS + wait $PIDS } check_required_files -#if [ "$(uname)" == "Darwin" ]; then -# docker_mac_check_cgroupsv1 -#fi - build_image "${REPO_DIR}/ci" prepare_docker_scratch diff --git a/ci/scripts/start-bosh.sh b/ci/scripts/start-bosh.sh index 82e8ab3e..e8eb0b38 100755 --- a/ci/scripts/start-bosh.sh +++ b/ci/scripts/start-bosh.sh @@ -7,11 +7,6 @@ if [[ -n "${DEBUG:-}" ]]; then export BOSH_LOG_LEVEL=debug fi -export BOSH_DIRECTOR_IP="10.245.0.3" -export BOSH_ENVIRONMENT="docker-director" - -export DNS_IP="8.8.8.8" - function generate_certs() { local certs_dir certs_dir="${1}" @@ -117,9 +112,7 @@ function start_docker() { local certs_dir certs_dir="${1}" - # docker will fail starting with the new iptables. it throws: - # iptables v1.8.7 (nf_tables): Could not fetch rule set generation id: .... - update-alternatives --set iptables /usr/sbin/iptables-legacy + export DNS_IP="8.8.8.8" generate_certs "${certs_dir}" @@ -194,13 +187,20 @@ EOF } function main() { - # ".first" - original code could return multiple IPs (e.g., container IP + docker0 bridge IP) - # which breaks the docker_tls JSON variable formatting - OUTER_CONTAINER_IP=$(ruby -rsocket -e 'puts Socket.ip_address_list - .reject { |addr| !addr.ip? || addr.ipv4_loopback? || addr.ipv6? } - .map { |addr| addr.ip_address }.first') + OUTER_CONTAINER_IP=$( + ip addr \ + | grep 'inet ' \ + | grep -v -E ' (127\.|172\.|10\.245)' \ + | cut -d/ -f 1 \ + | cut -d' ' -f6 + ) export OUTER_CONTAINER_IP + if [[ "${OUTER_CONTAINER_IP}" == *$'\n'* ]] ; then + echo "OUTER_CONTAINER_IP had more than one ip: '${OUTER_CONTAINER_IP}'" >&2 + exit 1 + fi + local certs_dir certs_dir=$(mktemp -d) @@ -230,6 +230,9 @@ EOF pushd "${BOSH_DEPLOYMENT_PATH:-/usr/local/bosh-deployment}" > /dev/null echo "Current directory: $(pwd)" >&2 + export BOSH_DIRECTOR_IP="10.245.0.11" + export BOSH_ENVIRONMENT="docker-director" + cat < "${local_bosh_dir}/docker_tls.json" { "ca": "$(cat "${certs_dir}/ca_json_safe.pem")", From 536d4e003d30b2517724df9b092d57def027ad9b Mon Sep 17 00:00:00 2001 From: Mike Yeromko Date: Mon, 2 Mar 2026 12:17:10 +0100 Subject: [PATCH 06/28] POC: Noble and cgroupsv2, releases fixes --- ci/Dockerfile | 2 -- ci/noble-updates.yml | 31 ------------------------------- ci/scripts/start-bosh.sh | 8 ++++++-- 3 files changed, 6 insertions(+), 35 deletions(-) delete mode 100644 ci/noble-updates.yml diff --git a/ci/Dockerfile b/ci/Dockerfile index 9c7ba8d2..20cf3cb8 100644 --- a/ci/Dockerfile +++ b/ci/Dockerfile @@ -9,8 +9,6 @@ RUN apt-get update && \ # Set bosh env at login RUN echo "source /tmp/local-bosh/director/env" >> /root/.bashrc -COPY noble-updates.yml /usr/local/bosh-deployment/ci/noble-updates.yml - # Install Python libraries needed for scripts RUN python3 -m venv /opt/venv ENV PATH="/opt/venv/bin:${PATH}" diff --git a/ci/noble-updates.yml b/ci/noble-updates.yml deleted file mode 100644 index 21e478ca..00000000 --- a/ci/noble-updates.yml +++ /dev/null @@ -1,31 +0,0 @@ -- type: replace - path: /cloud_provider/properties/docker_cpi/start_containers_with_systemd? - value: true -- type: replace - path: /releases/name=bosh - value: - name: bosh - version: 282.1.2 - url: https://bosh.io/d/github.com/cloudfoundry/bosh?v=282.1.2 - sha1: sha256:e9c95823932c81b4ce4442b3fae67223c04492a8dfe54489d6b5ec7d4fac2183 -- type: replace - path: /releases/name=bpm - value: - name: bpm - version: 1.4.26 - url: https://bosh.io/d/github.com/cloudfoundry/bpm-release?v=1.4.26 - sha1: sha256:40af85114d2a8a67812bf65212076581ea42cefcf67ee6b8d78d778ed1ca2b85 -- type: replace - path: /releases/name=bosh-docker-cpi - value: - name: bosh-docker-cpi - version: 0.2.4 - url: https://bosh.io/d/github.com/cloudfoundry/bosh-docker-cpi-release?v=0.2.4 - sha1: sha256:a485beee8d3f1d6a434800a4ac2dea5644c826b143988f04102cad894e81565d -- type: replace - path: /releases/name=os-conf - value: - name: os-conf - version: 23.0.0 - url: https://bosh.io/d/github.com/cloudfoundry/os-conf-release?v=23.0.0 - sha1: sha256:efcf30754ce4c5f308aedab3329d8d679f5967b2a4c3c453204c7cb10c7c5ed9 \ No newline at end of file diff --git a/ci/scripts/start-bosh.sh b/ci/scripts/start-bosh.sh index e8eb0b38..b21a74ea 100755 --- a/ci/scripts/start-bosh.sh +++ b/ci/scripts/start-bosh.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash -set -eu -o pipefail +set -e if [[ -n "${DEBUG:-}" ]]; then set -x @@ -120,6 +120,7 @@ function start_docker() { mkdir -p /var/run sanitize_cgroups + echo "Sanitized cgroups for docker" >&2 # systemd inside nested Docker containers requires shared mount propagation mount --make-rshared / @@ -156,6 +157,7 @@ function start_docker() { EOF service docker start + echo "Started docker service" >&2 rc=1 for i in $(seq 1 100); do @@ -201,6 +203,8 @@ function main() { exit 1 fi + echo "Determined OUTER_CONTAINER_IP: ${OUTER_CONTAINER_IP}" >&2 + local certs_dir certs_dir=$(mktemp -d) @@ -217,6 +221,7 @@ EOF source "${local_bosh_dir}/docker-env" start_docker "${certs_dir}" + echo "Docker is up and running with TLS configured" >&2 local docker_network_name="director_network" local docker_network_cidr="10.245.0.0/16" @@ -246,7 +251,6 @@ EOF -o docker/cpi.yml \ -o jumpbox-user.yml \ -o /usr/local/local-releases.yml \ - -o "$PWD/ci/noble-updates.yml" \ -v director_name=docker \ -v internal_cidr=${docker_network_cidr} \ -v internal_gw=10.245.0.1 \ From a927bb5407a5da5bb62202952a2becabf3c0a8f3 Mon Sep 17 00:00:00 2001 From: Mike Yeromko Date: Mon, 2 Mar 2026 12:24:08 +0100 Subject: [PATCH 07/28] POC: Noble and cgroupsv2, rollback changes --- ci/scripts/start-bosh.sh | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/ci/scripts/start-bosh.sh b/ci/scripts/start-bosh.sh index b21a74ea..1e8b68a5 100755 --- a/ci/scripts/start-bosh.sh +++ b/ci/scripts/start-bosh.sh @@ -189,13 +189,11 @@ EOF } function main() { - OUTER_CONTAINER_IP=$( - ip addr \ - | grep 'inet ' \ - | grep -v -E ' (127\.|172\.|10\.245)' \ - | cut -d/ -f 1 \ - | cut -d' ' -f6 - ) + # ".first" - original code could return multiple IPs (e.g., container IP + docker0 bridge IP) + # which breaks the docker_tls JSON variable formatting + OUTER_CONTAINER_IP=$(ruby -rsocket -e 'puts Socket.ip_address_list + .reject { |addr| !addr.ip? || addr.ipv4_loopback? || addr.ipv6? } + .map { |addr| addr.ip_address }.first') export OUTER_CONTAINER_IP if [[ "${OUTER_CONTAINER_IP}" == *$'\n'* ]] ; then From d1af9d0147ea44efd63493c031f0433e28f86b16 Mon Sep 17 00:00:00 2001 From: Mike Yeromko Date: Mon, 2 Mar 2026 14:29:58 +0100 Subject: [PATCH 08/28] POC: Noble and cgroupsv2, rollback changes --- ci/scripts/start-bosh.sh | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/ci/scripts/start-bosh.sh b/ci/scripts/start-bosh.sh index 1e8b68a5..b61a3c08 100755 --- a/ci/scripts/start-bosh.sh +++ b/ci/scripts/start-bosh.sh @@ -114,6 +114,10 @@ function start_docker() { export DNS_IP="8.8.8.8" + # docker will fail starting with the new iptables. it throws: + # iptables v1.8.7 (nf_tables): Could not fetch rule set generation id: .... + update-alternatives --set iptables /usr/sbin/iptables-legacy + generate_certs "${certs_dir}" mkdir -p /var/log @@ -195,12 +199,6 @@ function main() { .reject { |addr| !addr.ip? || addr.ipv4_loopback? || addr.ipv6? } .map { |addr| addr.ip_address }.first') export OUTER_CONTAINER_IP - - if [[ "${OUTER_CONTAINER_IP}" == *$'\n'* ]] ; then - echo "OUTER_CONTAINER_IP had more than one ip: '${OUTER_CONTAINER_IP}'" >&2 - exit 1 - fi - echo "Determined OUTER_CONTAINER_IP: ${OUTER_CONTAINER_IP}" >&2 local certs_dir @@ -233,7 +231,7 @@ EOF pushd "${BOSH_DEPLOYMENT_PATH:-/usr/local/bosh-deployment}" > /dev/null echo "Current directory: $(pwd)" >&2 - export BOSH_DIRECTOR_IP="10.245.0.11" + export BOSH_DIRECTOR_IP="10.245.0.3" export BOSH_ENVIRONMENT="docker-director" cat < "${local_bosh_dir}/docker_tls.json" From 04929972c82fa5aaa64a3c56c5ee8f8a364ab059 Mon Sep 17 00:00:00 2001 From: Mike Yeromko Date: Mon, 2 Mar 2026 16:12:20 +0100 Subject: [PATCH 09/28] POC: Noble and cgroupsv2, rollback ops file --- ci/Dockerfile | 3 +++ ci/noble-updates.yml | 6 ++++++ ci/scripts/start-bosh.sh | 1 + 3 files changed, 10 insertions(+) create mode 100644 ci/noble-updates.yml diff --git a/ci/Dockerfile b/ci/Dockerfile index 20cf3cb8..7ee83d43 100644 --- a/ci/Dockerfile +++ b/ci/Dockerfile @@ -9,6 +9,9 @@ RUN apt-get update && \ # Set bosh env at login RUN echo "source /tmp/local-bosh/director/env" >> /root/.bashrc +# Copy ops files +COPY noble-updates.yml /usr/local/bosh-deployment/noble-updates.yml + # Install Python libraries needed for scripts RUN python3 -m venv /opt/venv ENV PATH="/opt/venv/bin:${PATH}" diff --git a/ci/noble-updates.yml b/ci/noble-updates.yml new file mode 100644 index 00000000..beec1cbb --- /dev/null +++ b/ci/noble-updates.yml @@ -0,0 +1,6 @@ +- type: replace + path: /cloud_provider/properties/docker_cpi/start_containers_with_systemd? + value: true +- type: replace + path: /instance_groups/name=bosh/properties/docker_cpi/start_containers_with_systemd? + value: true diff --git a/ci/scripts/start-bosh.sh b/ci/scripts/start-bosh.sh index b61a3c08..26c109da 100755 --- a/ci/scripts/start-bosh.sh +++ b/ci/scripts/start-bosh.sh @@ -247,6 +247,7 @@ EOF -o docker/cpi.yml \ -o jumpbox-user.yml \ -o /usr/local/local-releases.yml \ + -o "$PWD/noble-updates.yml" \ -v director_name=docker \ -v internal_cidr=${docker_network_cidr} \ -v internal_gw=10.245.0.1 \ From 8c41b5aed095cb1aeb31471fd7e2efae2a903eff Mon Sep 17 00:00:00 2001 From: Mike Yeromko Date: Tue, 3 Mar 2026 11:14:44 +0100 Subject: [PATCH 10/28] POC: Noble and cgroupsv2, workaround --- ci/Dockerfile | 5 +++ ci/scripts/nft/monit-nft-watcher.service | 13 +++++++ ci/scripts/nft/update-monit-nft.sh | 46 ++++++++++++++++++++++++ ci/scripts/nft/watch-containers.sh | 29 +++++++++++++++ ci/scripts/start-bosh.sh | 3 ++ 5 files changed, 96 insertions(+) create mode 100644 ci/scripts/nft/monit-nft-watcher.service create mode 100644 ci/scripts/nft/update-monit-nft.sh create mode 100644 ci/scripts/nft/watch-containers.sh diff --git a/ci/Dockerfile b/ci/Dockerfile index 7ee83d43..19fd9af6 100644 --- a/ci/Dockerfile +++ b/ci/Dockerfile @@ -12,6 +12,11 @@ RUN echo "source /tmp/local-bosh/director/env" >> /root/.bashrc # Copy ops files COPY noble-updates.yml /usr/local/bosh-deployment/noble-updates.yml +# Copy scripts +COPY --chmod=755 scripts/nft/update-monit-nft.sh /usr/local/sbin/update-monit-nft.sh +COPY --chmod=755 scripts/nft/watch-containers.sh /usr/local/sbin/watch-containers.sh +COPY scripts/nft/monit-nft-watcher.service /etc/systemd/system/monit-nft-watcher.service + # Install Python libraries needed for scripts RUN python3 -m venv /opt/venv ENV PATH="/opt/venv/bin:${PATH}" diff --git a/ci/scripts/nft/monit-nft-watcher.service b/ci/scripts/nft/monit-nft-watcher.service new file mode 100644 index 00000000..54c989f4 --- /dev/null +++ b/ci/scripts/nft/monit-nft-watcher.service @@ -0,0 +1,13 @@ +[Unit] +Description=Watch Docker container start events and update monit nft +After=docker.service +Requires=docker.service + +[Service] +Type=simple +ExecStart=/usr/local/sbin/watch-containers.sh +Restart=always +RestartSec=5 + +[Install] +WantedBy=multi-user.target \ No newline at end of file diff --git a/ci/scripts/nft/update-monit-nft.sh b/ci/scripts/nft/update-monit-nft.sh new file mode 100644 index 00000000..a43cb24f --- /dev/null +++ b/ci/scripts/nft/update-monit-nft.sh @@ -0,0 +1,46 @@ +#!/bin/sh +set -eu + +DATE=$(date +%s) +NFT_FILE=/etc/nftables/monit.nft +BACKUP="${NFT_FILE}.bak.${DATE}" +TMP="$(mktemp /tmp/monit.nft.${DATE})" + +# Get ControlGroup value for bosh-agent.service +cg=$(systemctl show -p ControlGroup --value bosh-agent.service 2>/dev/null || true) +if [ -z "$cg" ]; then + echo "bosh-agent.service ControlGroup not found" >&2 + exit 1 +fi +cg=${cg#/} # remove leading slash if present +echo "Found ControlGroup for bosh-agent.service: $cg" + +# Replace the quoted cgroup path in the socket rule that matches the ip/tcp part +pattern='(^[[:space:]]*socket[[:space:]]+cgroupv2[[:space:]]+level[[:space:]]+[0-9]+[[:space:]]+")[^"]+("[[:space:]]+ip[[:space:]]+daddr[[:space:]]+127\.0\.0\.1[[:space:]]+tcp[[:space:]]+dport[[:space:]]+2822)' +esc=$(printf '%s' "$cg" | sed 's@[/&]@\&@g') # escape slashes and ampersands for sed +sed -E "s@$pattern@\1${esc}\2@" "$NFT_FILE" > "$TMP" +if cmp -s "$NFT_FILE" "$TMP"; then + rm -f "$TMP" + echo "monit.nft already up-to-date (using cgroup: $cg)" + exit 0 +else + echo "monit.nft needs update (new cgroup: $cg)" +fi + +# Backup & atomically replace and try to reload nft +cp -p "$NFT_FILE" "$BACKUP" +if mv "$TMP" "$NFT_FILE"; then + if nft -f "$NFT_FILE"; then + echo "Updated monit.nft to cgroup: $cg" + exit 0 + else + echo "nft load failed, restoring backup" >&2 + mv "$BACKUP" "$NFT_FILE" + nft -f "$NFT_FILE" || echo "failed to restore nft rules; check $NFT_FILE and $BACKUP" >&2 + exit 1 + fi +else + echo "failed to replace $NFT_FILE" >&2 + rm -f "$TMP" + exit 1 +fi \ No newline at end of file diff --git a/ci/scripts/nft/watch-containers.sh b/ci/scripts/nft/watch-containers.sh new file mode 100644 index 00000000..d6d8c21b --- /dev/null +++ b/ci/scripts/nft/watch-containers.sh @@ -0,0 +1,29 @@ +#!/bin/sh +set -eu + +SCRIPT_PATH=/usr/local/sbin/update-monit-nft.sh + +run_update_in_container() { + cid="$1" + if [ ! -r "$SCRIPT_PATH" ]; then + echo "missing host script: $SCRIPT_PATH" >&2 + return + fi + if ! docker exec -i "$cid" /bin/sh -s -- < "$SCRIPT_PATH"; then + echo "failed to run update-monit-nft.sh inside container $cid" >&2 + fi +} + +# initial update for any already-running containers +for cid in $(docker ps -q 2>/dev/null); do + run_update_in_container "$cid" +done + +# listen for docker start events and update when they occur +docker events --filter 'event=start' --format '{{.ID}} {{.Type}} {{.Action}}' | while read -r id type action; do + echo "Received docker event: ID=$id Type=$type Action=$action" + run_update_in_container "$id" +done || { + echo "docker events stream ended or failed; continuing without event watch" >&2 + true +} diff --git a/ci/scripts/start-bosh.sh b/ci/scripts/start-bosh.sh index 26c109da..532b110f 100755 --- a/ci/scripts/start-bosh.sh +++ b/ci/scripts/start-bosh.sh @@ -257,6 +257,9 @@ EOF -v docker_tls="$(cat "${local_bosh_dir}/docker_tls.json")" \ "${@}" > "${local_bosh_dir}/bosh-director.yml" + echo "Starting monit-nft-watcher service to correct NF table for any starting container..." >&2 + systemctl enable --now monit-nft-watcher.service + echo "Creating BOSH director environment..." >&2 bosh create-env "${local_bosh_dir}/bosh-director.yml" \ --vars-store="${local_bosh_dir}/creds.yml" \ From 28e27a09ad3d2ad3e6a8d0daa211ad944240f604 Mon Sep 17 00:00:00 2001 From: Mike Yeromko Date: Tue, 3 Mar 2026 11:18:13 +0100 Subject: [PATCH 11/28] POC: Noble and cgroupsv2, fix --- ci/Dockerfile | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/ci/Dockerfile b/ci/Dockerfile index 19fd9af6..fa830ba5 100644 --- a/ci/Dockerfile +++ b/ci/Dockerfile @@ -13,9 +13,10 @@ RUN echo "source /tmp/local-bosh/director/env" >> /root/.bashrc COPY noble-updates.yml /usr/local/bosh-deployment/noble-updates.yml # Copy scripts -COPY --chmod=755 scripts/nft/update-monit-nft.sh /usr/local/sbin/update-monit-nft.sh -COPY --chmod=755 scripts/nft/watch-containers.sh /usr/local/sbin/watch-containers.sh +COPY scripts/nft/update-monit-nft.sh /usr/local/sbin/update-monit-nft.sh +COPY scripts/nft/watch-containers.sh /usr/local/sbin/watch-containers.sh COPY scripts/nft/monit-nft-watcher.service /etc/systemd/system/monit-nft-watcher.service +RUN chmod 755 /usr/local/sbin/update-monit-nft.sh /usr/local/sbin/watch-containers.sh # Install Python libraries needed for scripts RUN python3 -m venv /opt/venv From 1e33816a6695e4b7eb1270aa54e19c953d7338e7 Mon Sep 17 00:00:00 2001 From: Mike Yeromko Date: Tue, 3 Mar 2026 11:33:14 +0100 Subject: [PATCH 12/28] POC: Noble and cgroupsv2, watch without systemd --- ci/Dockerfile | 1 - ci/scripts/nft/monit-nft-watcher.service | 13 ------------- ci/scripts/nft/watch-containers.sh | 19 +++++++++++-------- ci/scripts/start-bosh.sh | 4 ++-- 4 files changed, 13 insertions(+), 24 deletions(-) delete mode 100644 ci/scripts/nft/monit-nft-watcher.service diff --git a/ci/Dockerfile b/ci/Dockerfile index fa830ba5..c6a17f12 100644 --- a/ci/Dockerfile +++ b/ci/Dockerfile @@ -15,7 +15,6 @@ COPY noble-updates.yml /usr/local/bosh-deployment/noble-updates.yml # Copy scripts COPY scripts/nft/update-monit-nft.sh /usr/local/sbin/update-monit-nft.sh COPY scripts/nft/watch-containers.sh /usr/local/sbin/watch-containers.sh -COPY scripts/nft/monit-nft-watcher.service /etc/systemd/system/monit-nft-watcher.service RUN chmod 755 /usr/local/sbin/update-monit-nft.sh /usr/local/sbin/watch-containers.sh # Install Python libraries needed for scripts diff --git a/ci/scripts/nft/monit-nft-watcher.service b/ci/scripts/nft/monit-nft-watcher.service deleted file mode 100644 index 54c989f4..00000000 --- a/ci/scripts/nft/monit-nft-watcher.service +++ /dev/null @@ -1,13 +0,0 @@ -[Unit] -Description=Watch Docker container start events and update monit nft -After=docker.service -Requires=docker.service - -[Service] -Type=simple -ExecStart=/usr/local/sbin/watch-containers.sh -Restart=always -RestartSec=5 - -[Install] -WantedBy=multi-user.target \ No newline at end of file diff --git a/ci/scripts/nft/watch-containers.sh b/ci/scripts/nft/watch-containers.sh index d6d8c21b..6016408b 100644 --- a/ci/scripts/nft/watch-containers.sh +++ b/ci/scripts/nft/watch-containers.sh @@ -3,6 +3,8 @@ set -eu SCRIPT_PATH=/usr/local/sbin/update-monit-nft.sh +trap 'echo "monit-nft watcher interrupted; exiting" >&2; exit 0' INT TERM + run_update_in_container() { cid="$1" if [ ! -r "$SCRIPT_PATH" ]; then @@ -19,11 +21,12 @@ for cid in $(docker ps -q 2>/dev/null); do run_update_in_container "$cid" done -# listen for docker start events and update when they occur -docker events --filter 'event=start' --format '{{.ID}} {{.Type}} {{.Action}}' | while read -r id type action; do - echo "Received docker event: ID=$id Type=$type Action=$action" - run_update_in_container "$id" -done || { - echo "docker events stream ended or failed; continuing without event watch" >&2 - true -} +# listen for docker start events and update when they occur forever +while true; do + docker events --filter 'event=start' --format '{{.ID}} {{.Type}} {{.Action}}' | while read -r id type action; do + echo "Received docker event: ID=$id Type=$type Action=$action" + run_update_in_container "$id" + done || true + echo "docker events stream ended or failed; retrying after 1s" >&2 + sleep 1 +done diff --git a/ci/scripts/start-bosh.sh b/ci/scripts/start-bosh.sh index 532b110f..cb026054 100755 --- a/ci/scripts/start-bosh.sh +++ b/ci/scripts/start-bosh.sh @@ -257,8 +257,8 @@ EOF -v docker_tls="$(cat "${local_bosh_dir}/docker_tls.json")" \ "${@}" > "${local_bosh_dir}/bosh-director.yml" - echo "Starting monit-nft-watcher service to correct NF table for any starting container..." >&2 - systemctl enable --now monit-nft-watcher.service + echo "Starting monit-nft-watcher to correct NF table for any starting container..." >&2 + nohup /usr/local/sbin/watch-containers.sh >/var/log/monit-nft-watcher.log 2>&1 & echo "Creating BOSH director environment..." >&2 bosh create-env "${local_bosh_dir}/bosh-director.yml" \ From 30a5ca9d47d72065ab889398104d23b3f56edbb0 Mon Sep 17 00:00:00 2001 From: Mike Yeromko Date: Tue, 3 Mar 2026 11:53:34 +0100 Subject: [PATCH 13/28] POC: Noble and cgroupsv2, fix watch scripts --- ci/scripts/nft/update-monit-nft.sh | 5 +- ci/scripts/nft/watch-containers.sh | 11 +- ci/scripts/start-bosh.sh | 181 ++++++++++++++++++++++++++++- 3 files changed, 193 insertions(+), 4 deletions(-) diff --git a/ci/scripts/nft/update-monit-nft.sh b/ci/scripts/nft/update-monit-nft.sh index a43cb24f..a032a412 100644 --- a/ci/scripts/nft/update-monit-nft.sh +++ b/ci/scripts/nft/update-monit-nft.sh @@ -4,7 +4,7 @@ set -eu DATE=$(date +%s) NFT_FILE=/etc/nftables/monit.nft BACKUP="${NFT_FILE}.bak.${DATE}" -TMP="$(mktemp /tmp/monit.nft.${DATE})" +TMP="$(mktemp /tmp/monit.nft.XXXXXX)" # Get ControlGroup value for bosh-agent.service cg=$(systemctl show -p ControlGroup --value bosh-agent.service 2>/dev/null || true) @@ -16,7 +16,8 @@ cg=${cg#/} # remove leading slash if present echo "Found ControlGroup for bosh-agent.service: $cg" # Replace the quoted cgroup path in the socket rule that matches the ip/tcp part -pattern='(^[[:space:]]*socket[[:space:]]+cgroupv2[[:space:]]+level[[:space:]]+[0-9]+[[:space:]]+")[^"]+("[[:space:]]+ip[[:space:]]+daddr[[:space:]]+127\.0\.0\.1[[:space:]]+tcp[[:space:]]+dport[[:space:]]+2822)' +# The expected nft rule begins with: socket cgroupv2 level "" ip daddr 127.0.0.1 ... +pattern='(^[[:space:]]*socket[[:space:]]+cgroupv2[[:space:]]+level[[:space:]]+[0-9]+[[:space:]]+")[^"]+("[[:space:]]+ip[[:space:]]+daddr[[:space:]]+127\.0\.0\.1[[:space:]].*)' esc=$(printf '%s' "$cg" | sed 's@[/&]@\&@g') # escape slashes and ampersands for sed sed -E "s@$pattern@\1${esc}\2@" "$NFT_FILE" > "$TMP" if cmp -s "$NFT_FILE" "$TMP"; then diff --git a/ci/scripts/nft/watch-containers.sh b/ci/scripts/nft/watch-containers.sh index 6016408b..743ca891 100644 --- a/ci/scripts/nft/watch-containers.sh +++ b/ci/scripts/nft/watch-containers.sh @@ -1,6 +1,15 @@ #!/bin/sh set -eu +CERT_DIR=$(find /tmp -maxdepth 1 -type d -regex '/tmp/tmp\.[A-Za-z0-9][A-Za-z0-9]*' -print | head -n 1) +if [ -z "$CERT_DIR" ]; then + echo "DOCKER_CERT_PATH not found (no /tmp/tmp.* directory)" >&2 + exit 1 +fi +export DOCKER_HOST=tcp://172.17.0.2:4243 +export DOCKER_TLS_VERIFY=1 +export DOCKER_CERT_PATH="$CERT_DIR" + SCRIPT_PATH=/usr/local/sbin/update-monit-nft.sh trap 'echo "monit-nft watcher interrupted; exiting" >&2; exit 0' INT TERM @@ -23,7 +32,7 @@ done # listen for docker start events and update when they occur forever while true; do - docker events --filter 'event=start' --format '{{.ID}} {{.Type}} {{.Action}}' | while read -r id type action; do + docker events --filter 'event=start' --format '{{.Actor.ID}} {{.Type}} {{.Action}}' | while read -r id type action; do echo "Received docker event: ID=$id Type=$type Action=$action" run_update_in_container "$id" done || true diff --git a/ci/scripts/start-bosh.sh b/ci/scripts/start-bosh.sh index cb026054..7cc3be37 100755 --- a/ci/scripts/start-bosh.sh +++ b/ci/scripts/start-bosh.sh @@ -257,13 +257,110 @@ EOF -v docker_tls="$(cat "${local_bosh_dir}/docker_tls.json")" \ "${@}" > "${local_bosh_dir}/bosh-director.yml" + # #region agent log — monitor containers during create-env for systemd exit diagnostics + ( + while true; do + for cid in $(docker ps -a -q 2>/dev/null); do + cstatus=$(docker inspect --format '{{.State.Status}}' "$cid" 2>/dev/null) + if [ "$cstatus" = "exited" ] || [ "$cstatus" = "dead" ]; then + cname=$(docker inspect --format '{{.Name}}' "$cid" 2>/dev/null | sed 's|^/||') + exitcode=$(docker inspect --format '{{.State.ExitCode}}' "$cid" 2>/dev/null) + echo "=== DEBUG[58375b] $(date -u +%H:%M:%S) DEAD container: ${cname} (${cid}) exit=${exitcode} ===" + echo "=== DEBUG[58375b] container Cmd ===" + docker inspect --format '{{.Config.Cmd}}' "$cid" 2>/dev/null | head -5 || true + echo "=== DEBUG[58375b] container logs ===" + docker logs "$cid" 2>&1 | tail -30 || true + echo "=== DEBUG[58375b] dmesg last 20 ===" + dmesg 2>/dev/null | tail -20 || true + echo "=== DEBUG[58375b] container journal ===" + docker cp "$cid":/var/log/journal /tmp/journal-diag-"$cid" 2>/dev/null && \ + find /tmp/journal-diag-"$cid" -name '*.journal' -exec journalctl --file '{}' --no-pager \; 2>/dev/null | tail -50 || \ + echo "(no journal)" + echo "=== DEBUG[58375b] end dead container $cid ===" + fi + done + sleep 2 + done + ) & + CREATE_ENV_MONITOR_PID=$! + # #endregion agent log + echo "Starting monit-nft-watcher to correct NF table for any starting container..." >&2 nohup /usr/local/sbin/watch-containers.sh >/var/log/monit-nft-watcher.log 2>&1 & echo "Creating BOSH director environment..." >&2 + create_env_exit=0 bosh create-env "${local_bosh_dir}/bosh-director.yml" \ --vars-store="${local_bosh_dir}/creds.yml" \ - --state="${local_bosh_dir}/state.json" + --state="${local_bosh_dir}/state.json" || create_env_exit=$? + + kill $CREATE_ENV_MONITOR_PID 2>/dev/null || true + wait $CREATE_ENV_MONITOR_PID 2>/dev/null || true + + if [ "$create_env_exit" -ne 0 ]; then + echo "=== DEBUG[58375b] create-env failed (exit=${create_env_exit}) ===" + echo "=== DEBUG[58375b] all containers ===" + docker ps -a --format 'table {{.ID}}\t{{.Names}}\t{{.Status}}' || true + for cid in $(docker ps -a -q 2>/dev/null); do + cname=$(docker inspect --format '{{.Name}}' "$cid" 2>/dev/null | sed 's|^/||') + cstatus=$(docker inspect --format '{{.State.Status}}' "$cid" 2>/dev/null) + exitcode=$(docker inspect --format '{{.State.ExitCode}}' "$cid" 2>/dev/null) + echo "=== DEBUG[58375b] container ${cname} (${cid}): status=${cstatus} exit=${exitcode} ===" + echo "=== DEBUG[58375b] Cmd ===" + docker inspect --format '{{.Config.Cmd}}' "$cid" 2>/dev/null | head -3 || true + echo "=== DEBUG[58375b] HostConfig ===" + docker inspect --format 'Privileged={{.HostConfig.Privileged}} CgroupnsMode={{.HostConfig.CgroupnsMode}} Binds={{.HostConfig.Binds}}' "$cid" 2>/dev/null || true + echo "=== DEBUG[58375b] logs ===" + docker logs "$cid" 2>&1 | tail -30 || true + echo "=== DEBUG[58375b] cgroup info from inside container ===" + docker exec "$cid" bash -c 'cat /proc/self/cgroup 2>/dev/null; echo "---"; ls -la /sys/fs/cgroup/ 2>/dev/null; echo "---"; cat /sys/fs/cgroup/cgroup.controllers 2>/dev/null; echo "---"; cat /sys/fs/cgroup/cgroup.subtree_control 2>/dev/null' 2>/dev/null || true + echo "=== DEBUG[58375b] journal from container ===" + docker cp "$cid":/var/log/journal /tmp/journal-post-"$cid" 2>/dev/null && \ + find /tmp/journal-post-"$cid" -name '*.journal' -exec journalctl --file '{}' --no-pager \; 2>/dev/null | tail -100 || echo "(no journal)" + done + echo "=== DEBUG[58375b] dmesg (last 40) ===" + dmesg 2>/dev/null | tail -40 || true + + echo "=== DEBUG[58375b] reproducing with verbose startup to find failure point ===" + local failed_image + failed_image=$(docker inspect --format '{{.Config.Image}}' "$(docker ps -a -q | head -1)" 2>/dev/null) || true + if [ -n "$failed_image" ]; then + echo "=== DEBUG[58375b] test: running pre-start commands step by step ===" + docker run --rm --privileged --cgroupns=host \ + -v /sys/fs/cgroup:/sys/fs/cgroup:rw \ + -v /lib/modules:/usr/lib/modules \ + "$failed_image" bash -c ' + set -x + echo "step1: umount resolv.conf" && umount /etc/resolv.conf 2>&1; echo "exit=$?" + echo "step2: write resolv.conf" && printf "%s\n" "nameserver 8.8.8.8" > /etc/resolv.conf 2>&1; echo "exit=$?" + echo "step3: umount hosts" && umount /etc/hosts 2>&1; echo "exit=$?" + echo "step4: umount hostname" && umount /etc/hostname 2>&1; echo "exit=$?" + echo "step5: mkdir data/sys" && rm -rf /var/vcap/data/sys && mkdir -p /var/vcap/data/sys 2>&1; echo "exit=$?" + echo "step6: mkdir store" && mkdir -p /var/vcap/store 2>&1; echo "exit=$?" + echo "step7: sed chronyc" && sed -i "s/chronyc/# chronyc/g" /var/vcap/bosh/bin/sync-time 2>&1; echo "exit=$?" + echo "step8: rm sv" && rm -rf /etc/sv/{ssh,cron} && rm -rf /etc/service/{ssh,cron} 2>&1; echo "exit=$?" + echo "step9: find/delete units" && find /etc/systemd/system /lib/systemd/system -path "*.wants/*" \ + -not -name "*bosh-agent*" -not -name "*journald*" -not -name "*logrotate*" \ + -not -name "*runit*" -not -name "*ssh*" -not -name "*systemd-user-sessions*" \ + -not -name "*systemd-tmpfiles*" -exec rm {} \; 2>&1; echo "exit=$?" + echo "step10: cgroup state before init" + cat /proc/self/cgroup 2>&1 + ls /sys/fs/cgroup/ 2>&1 + cat /sys/fs/cgroup/cgroup.controllers 2>&1 || true + cat /sys/fs/cgroup/cgroup.subtree_control 2>&1 || true + MYCG=$(grep "^0::" /proc/self/cgroup | cut -d: -f3) + echo "my cgroup path: ${MYCG}" + ls "/sys/fs/cgroup${MYCG}/" 2>&1 || true + cat "/sys/fs/cgroup${MYCG}/cgroup.controllers" 2>&1 || true + cat "/sys/fs/cgroup${MYCG}/cgroup.subtree_control" 2>&1 || true + cat "/sys/fs/cgroup${MYCG}/cgroup.procs" 2>&1 || true + echo "step11: attempting /sbin/init with timeout" + timeout 5 /sbin/init --log-level=debug --log-target=console 2>&1 || echo "init exited with $?" + ' 2>&1 || echo "DEBUG[58375b] test container exited with $?" + fi + + exit "$create_env_exit" + fi echo "Extracting BOSH director credentials and CA certificate..." >&2 bosh int "${local_bosh_dir}/creds.yml" --path /director_ssl/ca > "${local_bosh_dir}/ca.crt" @@ -285,6 +382,88 @@ EOF echo "Updating BOSH cloud config with Docker network..." >&2 bosh -n update-cloud-config docker/cloud-config.yml -v network="${docker_network_name}" + # #region agent log — Hypothesis A: check if runsvdir-start exists in stemcell image + local stemcell_image + stemcell_image=$(docker images --format '{{.Repository}}:{{.Tag}}' | grep -v '' | head -1) + echo "=== DEBUG[58375b] stemcell image: ${stemcell_image} ===" + echo "=== DEBUG[58375b] checking runsvdir-start and /sbin/init in stemcell ===" + docker run --rm --entrypoint "" "${stemcell_image}" bash -c \ + 'echo "runsvdir-start exists: $(test -f /usr/sbin/runsvdir-start && echo YES || echo NO)"; \ + echo "sbin/init exists: $(test -f /sbin/init && echo YES || echo NO)"; \ + echo "systemd exists: $(test -f /lib/systemd/systemd && echo YES || echo NO)"; \ + ls -la /usr/sbin/runsvdir-start /sbin/init /lib/systemd/systemd 2>&1 || true' \ + || echo "DEBUG[58375b] failed to inspect stemcell image" + # #endregion agent log + + # #region agent log — Hypothesis B/D: monitor new containers during deploy + echo "=== DEBUG[58375b] pre-deploy container list ===" + docker ps -a --format 'table {{.ID}}\t{{.Names}}\t{{.Status}}' + + local director_cid_pre + director_cid_pre=$(docker ps -q --filter "expose=25555" | head -1) + echo "=== DEBUG[58375b] director container id: ${director_cid_pre} ===" + + ( + seen_containers="" + while true; do + for cid in $(docker ps -a -q); do + if [ "$cid" = "$director_cid_pre" ]; then + continue + fi + cname=$(docker inspect --format '{{.Name}}' "$cid" 2>/dev/null | sed 's|^/||') + cstatus=$(docker inspect --format '{{.State.Status}}' "$cid" 2>/dev/null) + if [[ "$cname" == c-* ]]; then + if ! echo "$seen_containers" | grep -q "$cid"; then + seen_containers="${seen_containers} ${cid}" + echo "=== DEBUG[58375b] $(date -u +%H:%M:%S) NEW non-director container: ${cname} (${cid}) status=${cstatus} ===" + echo "=== DEBUG[58375b] container cmd ===" + docker inspect --format '{{.Config.Cmd}}' "$cid" 2>/dev/null || true + echo "=== DEBUG[58375b] container hostconfig ===" + docker inspect --format 'Privileged={{.HostConfig.Privileged}} CgroupnsMode={{.HostConfig.CgroupnsMode}} Binds={{.HostConfig.Binds}}' "$cid" 2>/dev/null || true + fi + if [ "$cstatus" = "exited" ] || [ "$cstatus" = "dead" ]; then + echo "=== DEBUG[58375b] $(date -u +%H:%M:%S) CONTAINER DIED: ${cname} (${cid}) ===" + docker inspect --format 'ExitCode={{.State.ExitCode}} Error={{.State.Error}}' "$cid" 2>/dev/null || true + echo "=== DEBUG[58375b] container logs ===" + docker logs "$cid" 2>&1 | tail -80 || true + fi + fi + done + sleep 2 + done + ) & + MONITOR_PID=$! + # #endregion agent log + + deployment_name="haproxy" + deploy_exit=0 + bosh deploy --non-interactive \ + --deployment "${deployment_name}" \ + "${REPO_ROOT}/manifests/haproxy.yml" \ + --var haproxy-backend-port=12000 --var haproxy-backend-servers=["127.0.0.1"] || deploy_exit=$? + + # #region agent log — post-deploy diagnostics + echo "=== DEBUG[58375b] post-deploy container list ===" + docker ps -a --format 'table {{.ID}}\t{{.Names}}\t{{.Status}}\t{{.Ports}}' + + if [ "$deploy_exit" -ne 0 ]; then + echo "=== DEBUG[58375b] deploy failed (exit=${deploy_exit}), capturing CPI config ===" + docker exec "$(docker ps -q --filter name=c-)" bash -c \ + 'cat /var/vcap/jobs/docker_cpi/config/cpi.json 2>/dev/null' || echo "DEBUG[58375b] could not read cpi.json" + + echo "=== DEBUG[58375b] CPI debug log ===" + docker exec "$(docker ps -q --filter name=c-)" bash -c \ + 'find /var/vcap -name "cpi.log" -o -name "docker_cpi*" 2>/dev/null | while read f; do echo "--- $f ---"; tail -100 "$f"; done' || true + + echo "=== DEBUG[58375b] task debug log (last 200 lines) ===" + docker exec "$(docker ps -q --filter name=c-)" bash -c \ + 'find /var/vcap/data/director/tasks -name "debug" 2>/dev/null | sort -V | tail -1 | xargs tail -200 2>/dev/null' || true + fi + + kill $MONITOR_PID 2>/dev/null || true + wait $MONITOR_PID 2>/dev/null || true + # #endregion agent log + popd > /dev/null } From 84f2a123d1848b045cf19ebdcb1925d87ee2ce99 Mon Sep 17 00:00:00 2001 From: Mike Yeromko Date: Tue, 3 Mar 2026 13:01:54 +0100 Subject: [PATCH 14/28] POC: Noble and cgroupsv2, fix DNS --- ci/scripts/nft/update-monit-nft.sh | 33 ++++++++++++++++++++++-------- ci/scripts/nft/watch-containers.sh | 12 +++++++---- ci/scripts/start-bosh.sh | 1 + 3 files changed, 34 insertions(+), 12 deletions(-) diff --git a/ci/scripts/nft/update-monit-nft.sh b/ci/scripts/nft/update-monit-nft.sh index a032a412..8b326361 100644 --- a/ci/scripts/nft/update-monit-nft.sh +++ b/ci/scripts/nft/update-monit-nft.sh @@ -1,6 +1,14 @@ #!/bin/sh set -eu +# Enable DNS +tee /etc/resolv.conf >/dev/null <<'EOF' +nameserver 1.1.1.1 +nameserver 8.8.8.8 +options timeout:2 attempts:2 +EOF + +# Update NF tables rules for monit to use the correct cgroup path for the agent DATE=$(date +%s) NFT_FILE=/etc/nftables/monit.nft BACKUP="${NFT_FILE}.bak.${DATE}" @@ -17,15 +25,24 @@ echo "Found ControlGroup for bosh-agent.service: $cg" # Replace the quoted cgroup path in the socket rule that matches the ip/tcp part # The expected nft rule begins with: socket cgroupv2 level "" ip daddr 127.0.0.1 ... -pattern='(^[[:space:]]*socket[[:space:]]+cgroupv2[[:space:]]+level[[:space:]]+[0-9]+[[:space:]]+")[^"]+("[[:space:]]+ip[[:space:]]+daddr[[:space:]]+127\.0\.0\.1[[:space:]].*)' -esc=$(printf '%s' "$cg" | sed 's@[/&]@\&@g') # escape slashes and ampersands for sed -sed -E "s@$pattern@\1${esc}\2@" "$NFT_FILE" > "$TMP" -if cmp -s "$NFT_FILE" "$TMP"; then +awk_status=0 +awk -v new="$cg" ' +BEGIN { replaced = 0 } +/^[[:space:]]*socket[[:space:]]+cgroupv2[[:space:]]+level[[:space:]]+[0-9]+[[:space:]]+"[^"]+"[[:space:]]+ip[[:space:]]+daddr[[:space:]]+127\.0\.0\.1/ { + sub(/"[^"]+"/, "\"" new "\"", $0) + replaced = 1 +} +{ print } +END { if (replaced == 0) exit 3 } +' "$NFT_FILE" > "$TMP" || awk_status=$? +if [ "$awk_status" -eq 3 ]; then + echo "monit.nft socket rule not found; no changes made" >&2 rm -f "$TMP" - echo "monit.nft already up-to-date (using cgroup: $cg)" - exit 0 -else - echo "monit.nft needs update (new cgroup: $cg)" + exit 1 +elif [ "$awk_status" -ne 0 ]; then + echo "failed to update monit.nft (awk error $awk_status)" >&2 + rm -f "$TMP" + exit 1 fi # Backup & atomically replace and try to reload nft diff --git a/ci/scripts/nft/watch-containers.sh b/ci/scripts/nft/watch-containers.sh index 743ca891..682d3906 100644 --- a/ci/scripts/nft/watch-containers.sh +++ b/ci/scripts/nft/watch-containers.sh @@ -18,11 +18,15 @@ run_update_in_container() { cid="$1" if [ ! -r "$SCRIPT_PATH" ]; then echo "missing host script: $SCRIPT_PATH" >&2 - return - fi - if ! docker exec -i "$cid" /bin/sh -s -- < "$SCRIPT_PATH"; then - echo "failed to run update-monit-nft.sh inside container $cid" >&2 + return 1 fi + while true; do + if docker exec -i "$cid" /bin/sh -s -- < "$SCRIPT_PATH"; then + return 0 + fi + echo "failed to run update-monit-nft.sh inside container $cid; retrying in 1s" >&2 + sleep 1 + done } # initial update for any already-running containers diff --git a/ci/scripts/start-bosh.sh b/ci/scripts/start-bosh.sh index 7cc3be37..b17d6c2c 100755 --- a/ci/scripts/start-bosh.sh +++ b/ci/scripts/start-bosh.sh @@ -244,6 +244,7 @@ EOF echo "Interpolating BOSH deployment manifest with Docker CPI and TLS configuration..." >&2 bosh int bosh.yml \ + -o runtime-configs/dns.yml \ -o docker/cpi.yml \ -o jumpbox-user.yml \ -o /usr/local/local-releases.yml \ From 036113f3af17873daeab7bde95049af1deee94d9 Mon Sep 17 00:00:00 2001 From: Mike Yeromko Date: Tue, 3 Mar 2026 13:34:27 +0100 Subject: [PATCH 15/28] POC: Noble and cgroupsv2, fix DNS ops file --- ci/noble-updates.yml | 98 ++++++++++++++++++++++++++++++++++++++++ ci/scripts/start-bosh.sh | 1 - 2 files changed, 98 insertions(+), 1 deletion(-) diff --git a/ci/noble-updates.yml b/ci/noble-updates.yml index beec1cbb..17b01a86 100644 --- a/ci/noble-updates.yml +++ b/ci/noble-updates.yml @@ -4,3 +4,101 @@ - type: replace path: /instance_groups/name=bosh/properties/docker_cpi/start_containers_with_systemd? value: true +- type: replace + path: /addons/name=bosh-dns-systemd? + value: + include: + stemcell: + - os: ubuntu-noble + jobs: + - name: bosh-dns + properties: + api: + client: + tls: ((/dns_api_client_tls)) + server: + tls: ((/dns_api_server_tls)) + cache: + enabled: true + configure_systemd_resolved: true + disable_recursors: true + health: + client: + tls: ((/dns_healthcheck_client_tls)) + enabled: true + server: + tls: ((/dns_healthcheck_server_tls)) + override_nameserver: false + release: bosh-dns + name: bosh-dns-systemd +- type: replace + path: /releases/name=bosh-dns? + value: + name: bosh-dns + sha1: 494d9e6ff68909a3aaddf146464dd4599f9f16a8 + url: https://bosh.io/d/github.com/cloudfoundry/bosh-dns-release?v=1.39.21 + version: 1.39.21 +- type: replace + path: /variables/name=/dns_healthcheck_tls_ca? + value: + name: /dns_healthcheck_tls_ca + options: + common_name: dns-healthcheck-tls-ca + is_ca: true + type: certificate +- type: replace + path: /variables/name=/dns_healthcheck_server_tls? + value: + name: /dns_healthcheck_server_tls + options: + alternative_names: + - health.bosh-dns + ca: /dns_healthcheck_tls_ca + common_name: health.bosh-dns + extended_key_usage: + - server_auth + type: certificate +- type: replace + path: /variables/name=/dns_healthcheck_client_tls? + value: + name: /dns_healthcheck_client_tls + options: + alternative_names: + - health.bosh-dns + ca: /dns_healthcheck_tls_ca + common_name: health.bosh-dns + extended_key_usage: + - client_auth + type: certificate +- type: replace + path: /variables/name=/dns_api_tls_ca? + value: + name: /dns_api_tls_ca + options: + common_name: dns-api-tls-ca + is_ca: true + type: certificate +- type: replace + path: /variables/name=/dns_api_server_tls? + value: + name: /dns_api_server_tls + options: + alternative_names: + - api.bosh-dns + ca: /dns_api_tls_ca + common_name: api.bosh-dns + extended_key_usage: + - server_auth + type: certificate +- type: replace + path: /variables/name=/dns_api_client_tls? + value: + name: /dns_api_client_tls + options: + alternative_names: + - api.bosh-dns + ca: /dns_api_tls_ca + common_name: api.bosh-dns + extended_key_usage: + - client_auth + type: certificate \ No newline at end of file diff --git a/ci/scripts/start-bosh.sh b/ci/scripts/start-bosh.sh index b17d6c2c..7cc3be37 100755 --- a/ci/scripts/start-bosh.sh +++ b/ci/scripts/start-bosh.sh @@ -244,7 +244,6 @@ EOF echo "Interpolating BOSH deployment manifest with Docker CPI and TLS configuration..." >&2 bosh int bosh.yml \ - -o runtime-configs/dns.yml \ -o docker/cpi.yml \ -o jumpbox-user.yml \ -o /usr/local/local-releases.yml \ From 6b27c22ae58228307c94f43bf83ce51d2d643518 Mon Sep 17 00:00:00 2001 From: Mike Yeromko Date: Tue, 3 Mar 2026 13:47:26 +0100 Subject: [PATCH 16/28] POC: Noble and cgroupsv2, fix DNS ops file --- ci/noble-updates.yml | 42 +++++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/ci/noble-updates.yml b/ci/noble-updates.yml index 17b01a86..c9ec337b 100644 --- a/ci/noble-updates.yml +++ b/ci/noble-updates.yml @@ -5,7 +5,7 @@ path: /instance_groups/name=bosh/properties/docker_cpi/start_containers_with_systemd? value: true - type: replace - path: /addons/name=bosh-dns-systemd? + path: /addons?/name=bosh-dns-systemd? value: include: stemcell: @@ -15,19 +15,19 @@ properties: api: client: - tls: ((/dns_api_client_tls)) + tls: ((dns_api_client_tls)) server: - tls: ((/dns_api_server_tls)) + tls: ((dns_api_server_tls)) cache: enabled: true configure_systemd_resolved: true disable_recursors: true health: client: - tls: ((/dns_healthcheck_client_tls)) + tls: ((dns_healthcheck_client_tls)) enabled: true server: - tls: ((/dns_healthcheck_server_tls)) + tls: ((dns_healthcheck_server_tls)) override_nameserver: false release: bosh-dns name: bosh-dns-systemd @@ -39,65 +39,65 @@ url: https://bosh.io/d/github.com/cloudfoundry/bosh-dns-release?v=1.39.21 version: 1.39.21 - type: replace - path: /variables/name=/dns_healthcheck_tls_ca? + path: /variables/name=dns_healthcheck_tls_ca? value: - name: /dns_healthcheck_tls_ca + name: dns_healthcheck_tls_ca options: common_name: dns-healthcheck-tls-ca is_ca: true type: certificate - type: replace - path: /variables/name=/dns_healthcheck_server_tls? + path: /variables/name=dns_healthcheck_server_tls? value: - name: /dns_healthcheck_server_tls + name: dns_healthcheck_server_tls options: alternative_names: - health.bosh-dns - ca: /dns_healthcheck_tls_ca + ca: dns_healthcheck_tls_ca common_name: health.bosh-dns extended_key_usage: - server_auth type: certificate - type: replace - path: /variables/name=/dns_healthcheck_client_tls? + path: /variables/name=dns_healthcheck_client_tls? value: - name: /dns_healthcheck_client_tls + name: dns_healthcheck_client_tls options: alternative_names: - health.bosh-dns - ca: /dns_healthcheck_tls_ca + ca: dns_healthcheck_tls_ca common_name: health.bosh-dns extended_key_usage: - client_auth type: certificate - type: replace - path: /variables/name=/dns_api_tls_ca? + path: /variables/name=dns_api_tls_ca? value: - name: /dns_api_tls_ca + name: dns_api_tls_ca options: common_name: dns-api-tls-ca is_ca: true type: certificate - type: replace - path: /variables/name=/dns_api_server_tls? + path: /variables/name=dns_api_server_tls? value: - name: /dns_api_server_tls + name: dns_api_server_tls options: alternative_names: - api.bosh-dns - ca: /dns_api_tls_ca + ca: dns_api_tls_ca common_name: api.bosh-dns extended_key_usage: - server_auth type: certificate - type: replace - path: /variables/name=/dns_api_client_tls? + path: /variables/name=dns_api_client_tls? value: - name: /dns_api_client_tls + name: dns_api_client_tls options: alternative_names: - api.bosh-dns - ca: /dns_api_tls_ca + ca: dns_api_tls_ca common_name: api.bosh-dns extended_key_usage: - client_auth From ae06706bb2e01c1b6a0ee425d498c3c3533989f3 Mon Sep 17 00:00:00 2001 From: Mike Yeromko Date: Tue, 3 Mar 2026 14:30:55 +0100 Subject: [PATCH 17/28] POC: Noble and cgroupsv2, fix workaround --- ci/scripts/nft/update-monit-nft.sh | 8 -- ci/scripts/nft/watch-containers.sh | 48 ++++++-- ci/scripts/start-bosh.sh | 182 +---------------------------- 3 files changed, 41 insertions(+), 197 deletions(-) diff --git a/ci/scripts/nft/update-monit-nft.sh b/ci/scripts/nft/update-monit-nft.sh index 8b326361..e09f560b 100644 --- a/ci/scripts/nft/update-monit-nft.sh +++ b/ci/scripts/nft/update-monit-nft.sh @@ -1,14 +1,6 @@ #!/bin/sh set -eu -# Enable DNS -tee /etc/resolv.conf >/dev/null <<'EOF' -nameserver 1.1.1.1 -nameserver 8.8.8.8 -options timeout:2 attempts:2 -EOF - -# Update NF tables rules for monit to use the correct cgroup path for the agent DATE=$(date +%s) NFT_FILE=/etc/nftables/monit.nft BACKUP="${NFT_FILE}.bak.${DATE}" diff --git a/ci/scripts/nft/watch-containers.sh b/ci/scripts/nft/watch-containers.sh index 682d3906..c97055ba 100644 --- a/ci/scripts/nft/watch-containers.sh +++ b/ci/scripts/nft/watch-containers.sh @@ -1,32 +1,62 @@ #!/bin/sh set -eu +# Find the first /tmp/tmp.* directory to use as DOCKER_CERT_PATH CERT_DIR=$(find /tmp -maxdepth 1 -type d -regex '/tmp/tmp\.[A-Za-z0-9][A-Za-z0-9]*' -print | head -n 1) if [ -z "$CERT_DIR" ]; then echo "DOCKER_CERT_PATH not found (no /tmp/tmp.* directory)" >&2 exit 1 fi + +# Setup environment variables to talk to the host's Docker daemon export DOCKER_HOST=tcp://172.17.0.2:4243 export DOCKER_TLS_VERIFY=1 export DOCKER_CERT_PATH="$CERT_DIR" SCRIPT_PATH=/usr/local/sbin/update-monit-nft.sh +BACKGROUND_PIDS="" -trap 'echo "monit-nft watcher interrupted; exiting" >&2; exit 0' INT TERM +# Kill any background update processes on exit +cleanup() { + for pid in $BACKGROUND_PIDS; do + kill "$pid" 2>/dev/null || true + done + echo "monit-nft watcher interrupted; exiting" >&2 + exit 0 +} +trap cleanup INT TERM +# Run the update script inside the container, retrying if it fails due to the container not being ready yet run_update_in_container() { cid="$1" if [ ! -r "$SCRIPT_PATH" ]; then echo "missing host script: $SCRIPT_PATH" >&2 return 1 fi - while true; do - if docker exec -i "$cid" /bin/sh -s -- < "$SCRIPT_PATH"; then - return 0 - fi - echo "failed to run update-monit-nft.sh inside container $cid; retrying in 1s" >&2 - sleep 1 - done + ( + while true; do + if output=$(docker exec -i "$cid" /bin/sh -s -- < "$SCRIPT_PATH" 2>&1); then + exit 0 + fi + case "$output" in + *"No such container"*) + echo "container $cid no longer exists; stop retrying" >&2 + exit 0 + ;; + *"is not running"*) + echo "container $cid is not running; stop retrying" >&2 + exit 0 + ;; + *) + echo "failed to run update-monit-nft.sh inside container $cid; retrying in 1s" >&2 + sleep 1 + ;; + esac + done + ) & + pid=$! + BACKGROUND_PIDS="$BACKGROUND_PIDS $pid" + echo "started background update for container $cid (pid $pid)" >&2 } # initial update for any already-running containers @@ -34,7 +64,7 @@ for cid in $(docker ps -q 2>/dev/null); do run_update_in_container "$cid" done -# listen for docker start events and update when they occur forever +# watch for new containers and run update inside them while true; do docker events --filter 'event=start' --format '{{.Actor.ID}} {{.Type}} {{.Action}}' | while read -r id type action; do echo "Received docker event: ID=$id Type=$type Action=$action" diff --git a/ci/scripts/start-bosh.sh b/ci/scripts/start-bosh.sh index 7cc3be37..7ab8dae3 100755 --- a/ci/scripts/start-bosh.sh +++ b/ci/scripts/start-bosh.sh @@ -257,110 +257,14 @@ EOF -v docker_tls="$(cat "${local_bosh_dir}/docker_tls.json")" \ "${@}" > "${local_bosh_dir}/bosh-director.yml" - # #region agent log — monitor containers during create-env for systemd exit diagnostics - ( - while true; do - for cid in $(docker ps -a -q 2>/dev/null); do - cstatus=$(docker inspect --format '{{.State.Status}}' "$cid" 2>/dev/null) - if [ "$cstatus" = "exited" ] || [ "$cstatus" = "dead" ]; then - cname=$(docker inspect --format '{{.Name}}' "$cid" 2>/dev/null | sed 's|^/||') - exitcode=$(docker inspect --format '{{.State.ExitCode}}' "$cid" 2>/dev/null) - echo "=== DEBUG[58375b] $(date -u +%H:%M:%S) DEAD container: ${cname} (${cid}) exit=${exitcode} ===" - echo "=== DEBUG[58375b] container Cmd ===" - docker inspect --format '{{.Config.Cmd}}' "$cid" 2>/dev/null | head -5 || true - echo "=== DEBUG[58375b] container logs ===" - docker logs "$cid" 2>&1 | tail -30 || true - echo "=== DEBUG[58375b] dmesg last 20 ===" - dmesg 2>/dev/null | tail -20 || true - echo "=== DEBUG[58375b] container journal ===" - docker cp "$cid":/var/log/journal /tmp/journal-diag-"$cid" 2>/dev/null && \ - find /tmp/journal-diag-"$cid" -name '*.journal' -exec journalctl --file '{}' --no-pager \; 2>/dev/null | tail -50 || \ - echo "(no journal)" - echo "=== DEBUG[58375b] end dead container $cid ===" - fi - done - sleep 2 - done - ) & - CREATE_ENV_MONITOR_PID=$! - # #endregion agent log - + # TODO: remove this workaround once we have a fix for the underlying issue with nftables echo "Starting monit-nft-watcher to correct NF table for any starting container..." >&2 nohup /usr/local/sbin/watch-containers.sh >/var/log/monit-nft-watcher.log 2>&1 & echo "Creating BOSH director environment..." >&2 - create_env_exit=0 bosh create-env "${local_bosh_dir}/bosh-director.yml" \ --vars-store="${local_bosh_dir}/creds.yml" \ - --state="${local_bosh_dir}/state.json" || create_env_exit=$? - - kill $CREATE_ENV_MONITOR_PID 2>/dev/null || true - wait $CREATE_ENV_MONITOR_PID 2>/dev/null || true - - if [ "$create_env_exit" -ne 0 ]; then - echo "=== DEBUG[58375b] create-env failed (exit=${create_env_exit}) ===" - echo "=== DEBUG[58375b] all containers ===" - docker ps -a --format 'table {{.ID}}\t{{.Names}}\t{{.Status}}' || true - for cid in $(docker ps -a -q 2>/dev/null); do - cname=$(docker inspect --format '{{.Name}}' "$cid" 2>/dev/null | sed 's|^/||') - cstatus=$(docker inspect --format '{{.State.Status}}' "$cid" 2>/dev/null) - exitcode=$(docker inspect --format '{{.State.ExitCode}}' "$cid" 2>/dev/null) - echo "=== DEBUG[58375b] container ${cname} (${cid}): status=${cstatus} exit=${exitcode} ===" - echo "=== DEBUG[58375b] Cmd ===" - docker inspect --format '{{.Config.Cmd}}' "$cid" 2>/dev/null | head -3 || true - echo "=== DEBUG[58375b] HostConfig ===" - docker inspect --format 'Privileged={{.HostConfig.Privileged}} CgroupnsMode={{.HostConfig.CgroupnsMode}} Binds={{.HostConfig.Binds}}' "$cid" 2>/dev/null || true - echo "=== DEBUG[58375b] logs ===" - docker logs "$cid" 2>&1 | tail -30 || true - echo "=== DEBUG[58375b] cgroup info from inside container ===" - docker exec "$cid" bash -c 'cat /proc/self/cgroup 2>/dev/null; echo "---"; ls -la /sys/fs/cgroup/ 2>/dev/null; echo "---"; cat /sys/fs/cgroup/cgroup.controllers 2>/dev/null; echo "---"; cat /sys/fs/cgroup/cgroup.subtree_control 2>/dev/null' 2>/dev/null || true - echo "=== DEBUG[58375b] journal from container ===" - docker cp "$cid":/var/log/journal /tmp/journal-post-"$cid" 2>/dev/null && \ - find /tmp/journal-post-"$cid" -name '*.journal' -exec journalctl --file '{}' --no-pager \; 2>/dev/null | tail -100 || echo "(no journal)" - done - echo "=== DEBUG[58375b] dmesg (last 40) ===" - dmesg 2>/dev/null | tail -40 || true - - echo "=== DEBUG[58375b] reproducing with verbose startup to find failure point ===" - local failed_image - failed_image=$(docker inspect --format '{{.Config.Image}}' "$(docker ps -a -q | head -1)" 2>/dev/null) || true - if [ -n "$failed_image" ]; then - echo "=== DEBUG[58375b] test: running pre-start commands step by step ===" - docker run --rm --privileged --cgroupns=host \ - -v /sys/fs/cgroup:/sys/fs/cgroup:rw \ - -v /lib/modules:/usr/lib/modules \ - "$failed_image" bash -c ' - set -x - echo "step1: umount resolv.conf" && umount /etc/resolv.conf 2>&1; echo "exit=$?" - echo "step2: write resolv.conf" && printf "%s\n" "nameserver 8.8.8.8" > /etc/resolv.conf 2>&1; echo "exit=$?" - echo "step3: umount hosts" && umount /etc/hosts 2>&1; echo "exit=$?" - echo "step4: umount hostname" && umount /etc/hostname 2>&1; echo "exit=$?" - echo "step5: mkdir data/sys" && rm -rf /var/vcap/data/sys && mkdir -p /var/vcap/data/sys 2>&1; echo "exit=$?" - echo "step6: mkdir store" && mkdir -p /var/vcap/store 2>&1; echo "exit=$?" - echo "step7: sed chronyc" && sed -i "s/chronyc/# chronyc/g" /var/vcap/bosh/bin/sync-time 2>&1; echo "exit=$?" - echo "step8: rm sv" && rm -rf /etc/sv/{ssh,cron} && rm -rf /etc/service/{ssh,cron} 2>&1; echo "exit=$?" - echo "step9: find/delete units" && find /etc/systemd/system /lib/systemd/system -path "*.wants/*" \ - -not -name "*bosh-agent*" -not -name "*journald*" -not -name "*logrotate*" \ - -not -name "*runit*" -not -name "*ssh*" -not -name "*systemd-user-sessions*" \ - -not -name "*systemd-tmpfiles*" -exec rm {} \; 2>&1; echo "exit=$?" - echo "step10: cgroup state before init" - cat /proc/self/cgroup 2>&1 - ls /sys/fs/cgroup/ 2>&1 - cat /sys/fs/cgroup/cgroup.controllers 2>&1 || true - cat /sys/fs/cgroup/cgroup.subtree_control 2>&1 || true - MYCG=$(grep "^0::" /proc/self/cgroup | cut -d: -f3) - echo "my cgroup path: ${MYCG}" - ls "/sys/fs/cgroup${MYCG}/" 2>&1 || true - cat "/sys/fs/cgroup${MYCG}/cgroup.controllers" 2>&1 || true - cat "/sys/fs/cgroup${MYCG}/cgroup.subtree_control" 2>&1 || true - cat "/sys/fs/cgroup${MYCG}/cgroup.procs" 2>&1 || true - echo "step11: attempting /sbin/init with timeout" - timeout 5 /sbin/init --log-level=debug --log-target=console 2>&1 || echo "init exited with $?" - ' 2>&1 || echo "DEBUG[58375b] test container exited with $?" - fi - - exit "$create_env_exit" - fi + --state="${local_bosh_dir}/state.json" echo "Extracting BOSH director credentials and CA certificate..." >&2 bosh int "${local_bosh_dir}/creds.yml" --path /director_ssl/ca > "${local_bosh_dir}/ca.crt" @@ -382,88 +286,6 @@ EOF echo "Updating BOSH cloud config with Docker network..." >&2 bosh -n update-cloud-config docker/cloud-config.yml -v network="${docker_network_name}" - # #region agent log — Hypothesis A: check if runsvdir-start exists in stemcell image - local stemcell_image - stemcell_image=$(docker images --format '{{.Repository}}:{{.Tag}}' | grep -v '' | head -1) - echo "=== DEBUG[58375b] stemcell image: ${stemcell_image} ===" - echo "=== DEBUG[58375b] checking runsvdir-start and /sbin/init in stemcell ===" - docker run --rm --entrypoint "" "${stemcell_image}" bash -c \ - 'echo "runsvdir-start exists: $(test -f /usr/sbin/runsvdir-start && echo YES || echo NO)"; \ - echo "sbin/init exists: $(test -f /sbin/init && echo YES || echo NO)"; \ - echo "systemd exists: $(test -f /lib/systemd/systemd && echo YES || echo NO)"; \ - ls -la /usr/sbin/runsvdir-start /sbin/init /lib/systemd/systemd 2>&1 || true' \ - || echo "DEBUG[58375b] failed to inspect stemcell image" - # #endregion agent log - - # #region agent log — Hypothesis B/D: monitor new containers during deploy - echo "=== DEBUG[58375b] pre-deploy container list ===" - docker ps -a --format 'table {{.ID}}\t{{.Names}}\t{{.Status}}' - - local director_cid_pre - director_cid_pre=$(docker ps -q --filter "expose=25555" | head -1) - echo "=== DEBUG[58375b] director container id: ${director_cid_pre} ===" - - ( - seen_containers="" - while true; do - for cid in $(docker ps -a -q); do - if [ "$cid" = "$director_cid_pre" ]; then - continue - fi - cname=$(docker inspect --format '{{.Name}}' "$cid" 2>/dev/null | sed 's|^/||') - cstatus=$(docker inspect --format '{{.State.Status}}' "$cid" 2>/dev/null) - if [[ "$cname" == c-* ]]; then - if ! echo "$seen_containers" | grep -q "$cid"; then - seen_containers="${seen_containers} ${cid}" - echo "=== DEBUG[58375b] $(date -u +%H:%M:%S) NEW non-director container: ${cname} (${cid}) status=${cstatus} ===" - echo "=== DEBUG[58375b] container cmd ===" - docker inspect --format '{{.Config.Cmd}}' "$cid" 2>/dev/null || true - echo "=== DEBUG[58375b] container hostconfig ===" - docker inspect --format 'Privileged={{.HostConfig.Privileged}} CgroupnsMode={{.HostConfig.CgroupnsMode}} Binds={{.HostConfig.Binds}}' "$cid" 2>/dev/null || true - fi - if [ "$cstatus" = "exited" ] || [ "$cstatus" = "dead" ]; then - echo "=== DEBUG[58375b] $(date -u +%H:%M:%S) CONTAINER DIED: ${cname} (${cid}) ===" - docker inspect --format 'ExitCode={{.State.ExitCode}} Error={{.State.Error}}' "$cid" 2>/dev/null || true - echo "=== DEBUG[58375b] container logs ===" - docker logs "$cid" 2>&1 | tail -80 || true - fi - fi - done - sleep 2 - done - ) & - MONITOR_PID=$! - # #endregion agent log - - deployment_name="haproxy" - deploy_exit=0 - bosh deploy --non-interactive \ - --deployment "${deployment_name}" \ - "${REPO_ROOT}/manifests/haproxy.yml" \ - --var haproxy-backend-port=12000 --var haproxy-backend-servers=["127.0.0.1"] || deploy_exit=$? - - # #region agent log — post-deploy diagnostics - echo "=== DEBUG[58375b] post-deploy container list ===" - docker ps -a --format 'table {{.ID}}\t{{.Names}}\t{{.Status}}\t{{.Ports}}' - - if [ "$deploy_exit" -ne 0 ]; then - echo "=== DEBUG[58375b] deploy failed (exit=${deploy_exit}), capturing CPI config ===" - docker exec "$(docker ps -q --filter name=c-)" bash -c \ - 'cat /var/vcap/jobs/docker_cpi/config/cpi.json 2>/dev/null' || echo "DEBUG[58375b] could not read cpi.json" - - echo "=== DEBUG[58375b] CPI debug log ===" - docker exec "$(docker ps -q --filter name=c-)" bash -c \ - 'find /var/vcap -name "cpi.log" -o -name "docker_cpi*" 2>/dev/null | while read f; do echo "--- $f ---"; tail -100 "$f"; done' || true - - echo "=== DEBUG[58375b] task debug log (last 200 lines) ===" - docker exec "$(docker ps -q --filter name=c-)" bash -c \ - 'find /var/vcap/data/director/tasks -name "debug" 2>/dev/null | sort -V | tail -1 | xargs tail -200 2>/dev/null' || true - fi - - kill $MONITOR_PID 2>/dev/null || true - wait $MONITOR_PID 2>/dev/null || true - # #endregion agent log - popd > /dev/null } From 78015417e3efee692e5e3a96bb7f9a1b59469838 Mon Sep 17 00:00:00 2001 From: Mike Yeromko Date: Tue, 3 Mar 2026 14:42:47 +0100 Subject: [PATCH 18/28] POC: Noble and cgroupsv2, rollback change --- ci/scripts/nft/update-monit-nft.sh | 8 ++++++++ ci/scripts/start-bosh.sh | 4 +++- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/ci/scripts/nft/update-monit-nft.sh b/ci/scripts/nft/update-monit-nft.sh index e09f560b..8b326361 100644 --- a/ci/scripts/nft/update-monit-nft.sh +++ b/ci/scripts/nft/update-monit-nft.sh @@ -1,6 +1,14 @@ #!/bin/sh set -eu +# Enable DNS +tee /etc/resolv.conf >/dev/null <<'EOF' +nameserver 1.1.1.1 +nameserver 8.8.8.8 +options timeout:2 attempts:2 +EOF + +# Update NF tables rules for monit to use the correct cgroup path for the agent DATE=$(date +%s) NFT_FILE=/etc/nftables/monit.nft BACKUP="${NFT_FILE}.bak.${DATE}" diff --git a/ci/scripts/start-bosh.sh b/ci/scripts/start-bosh.sh index 7ab8dae3..691f9033 100755 --- a/ci/scripts/start-bosh.sh +++ b/ci/scripts/start-bosh.sh @@ -259,7 +259,9 @@ EOF # TODO: remove this workaround once we have a fix for the underlying issue with nftables echo "Starting monit-nft-watcher to correct NF table for any starting container..." >&2 - nohup /usr/local/sbin/watch-containers.sh >/var/log/monit-nft-watcher.log 2>&1 & + if ! pgrep -f '/usr/local/sbin/watch-containers.sh' >/dev/null; then + nohup /usr/local/sbin/watch-containers.sh >/var/log/monit-nft-watcher.log 2>&1 & + fi echo "Creating BOSH director environment..." >&2 bosh create-env "${local_bosh_dir}/bosh-director.yml" \ From 6e1d8abf37576e78f95f854c535bd6992bf6a6a5 Mon Sep 17 00:00:00 2001 From: Mike Yeromko Date: Tue, 3 Mar 2026 15:52:50 +0100 Subject: [PATCH 19/28] POC: Noble and cgroupsv2, scaling Bosh workers out --- acceptance-tests/run-local.sh | 9 +++++++-- ci/Dockerfile | 1 + ci/bosh-scaled-out.yml | 3 +++ ci/scripts/acceptance-tests | 12 ++++++++++-- ci/scripts/nft/watch-containers.sh | 12 ++++++++++-- ci/scripts/start-bosh.sh | 1 + 6 files changed, 32 insertions(+), 6 deletions(-) create mode 100644 ci/bosh-scaled-out.yml diff --git a/acceptance-tests/run-local.sh b/acceptance-tests/run-local.sh index 9ca9c5d2..51cc2045 100755 --- a/acceptance-tests/run-local.sh +++ b/acceptance-tests/run-local.sh @@ -4,12 +4,14 @@ set -eu REPO_DIR="$(cd "$(dirname "$0")/.." && pwd)" source "${REPO_DIR}/ci/scripts/functions-ci.sh" FOCUS="" +PARALLELISM="" KEEP_RUNNING="" usage() { echo -e "Usage: $0 [-F ] [-k] -F Focus on a particular test. Expects a Ginkgo test name. Keep bosh running afterwards. + -P Set Ginkgo parallel node count. Default is '-p' (smart parallelism). -k Keep bosh container running. Useful for debug." 1>&2; exit 1; } @@ -19,6 +21,9 @@ while getopts ":F:k" o; do FOCUS=${OPTARG} KEEP_RUNNING=true ;; + P) + PARALLELISM=${OPTARG} + ;; k) KEEP_RUNNING=true ;; @@ -81,9 +86,9 @@ if [ -n "$KEEP_RUNNING" ] ; then echo echo "*** KEEP_RUNNING enabled. Please clean up docker scratch after removing containers: ${DOCKER_SCRATCH}" echo - docker run --privileged -v "$REPO_DIR":/repo -v "${DOCKER_SCRATCH}":/scratch/docker -e REPO_ROOT=/repo -e FOCUS="${FOCUS}" -e KEEP_RUNNING="${KEEP_RUNNING}" haproxy-boshrelease-testflight bash -c "cd /repo/ci/scripts && ./acceptance-tests ; sleep infinity" + docker run --privileged -v "$REPO_DIR":/repo -v "${DOCKER_SCRATCH}":/scratch/docker -e REPO_ROOT=/repo -e FOCUS="${FOCUS}" -e PARALLELISM="${PARALLELISM}" -e KEEP_RUNNING="${KEEP_RUNNING}" haproxy-boshrelease-testflight bash -c "cd /repo/ci/scripts && ./acceptance-tests ; sleep infinity" else - docker run --rm --privileged -v "$REPO_DIR":/repo -v "${DOCKER_SCRATCH}":/scratch/docker -e REPO_ROOT=/repo -e KEEP_RUNNING="" haproxy-boshrelease-testflight bash -c "cd /repo/ci/scripts && ./acceptance-tests" + docker run --rm --privileged -v "$REPO_DIR":/repo -v "${DOCKER_SCRATCH}":/scratch/docker -e REPO_ROOT=/repo -e KEEP_RUNNING="" -e PARALLELISM="${PARALLELISM}" haproxy-boshrelease-testflight bash -c "cd /repo/ci/scripts && ./acceptance-tests" echo "Cleaning up docker scratch: ${DOCKER_SCRATCH}" sudo rm -rf "${DOCKER_SCRATCH}" fi diff --git a/ci/Dockerfile b/ci/Dockerfile index c6a17f12..ed63fdc5 100644 --- a/ci/Dockerfile +++ b/ci/Dockerfile @@ -11,6 +11,7 @@ RUN echo "source /tmp/local-bosh/director/env" >> /root/.bashrc # Copy ops files COPY noble-updates.yml /usr/local/bosh-deployment/noble-updates.yml +COPY bosh-scaled-out.yml /usr/local/bosh-deployment/bosh-scaled-out.yml # Copy scripts COPY scripts/nft/update-monit-nft.sh /usr/local/sbin/update-monit-nft.sh diff --git a/ci/bosh-scaled-out.yml b/ci/bosh-scaled-out.yml new file mode 100644 index 00000000..93937df3 --- /dev/null +++ b/ci/bosh-scaled-out.yml @@ -0,0 +1,3 @@ +- type: replace + path: /instance_groups/name=bosh/properties/director/workers? + value: 12 \ No newline at end of file diff --git a/ci/scripts/acceptance-tests b/ci/scripts/acceptance-tests index 9cc17e83..4c98010a 100755 --- a/ci/scripts/acceptance-tests +++ b/ci/scripts/acceptance-tests @@ -35,10 +35,18 @@ echo "----- Running tests" export PATH=$PATH:$GOPATH/bin ginkgo version -PARALLELISM="-p" -if [ -n "$FOCUS" ]; then +echo "------------------------------------------------------------------" +if [ -n "${FOCUS:-}" ]; then PARALLELISM="--nodes=1" + echo "FOCUS is set, thus PARALLELISM is set to '$PARALLELISM'" +elif [ -n "${PARALLELISM:-}" ]; then + PARALLELISM="--nodes=$PARALLELISM" + echo "PARALLELISM is set. Will run ginkgo with '$PARALLELISM'" +else + PARALLELISM="-p" + echo "PARALLELISM is not set. Using default '$PARALLELISM'" fi +echo "------------------------------------------------------------------" ginkgo -v "$PARALLELISM" -r --trace --show-node-events --randomize-all --flake-attempts 5 "${ADDITIONAL_ARGS[@]}" diff --git a/ci/scripts/nft/watch-containers.sh b/ci/scripts/nft/watch-containers.sh index c97055ba..82875071 100644 --- a/ci/scripts/nft/watch-containers.sh +++ b/ci/scripts/nft/watch-containers.sh @@ -34,8 +34,12 @@ run_update_in_container() { return 1 fi ( - while true; do + attempt=0 + max_attempts=100 + while [ "$attempt" -lt "$max_attempts" ]; do + attempt=$((attempt + 1)) if output=$(docker exec -i "$cid" /bin/sh -s -- < "$SCRIPT_PATH" 2>&1); then + echo "successfully ran update-monit-nft.sh inside container $cid" >&2 exit 0 fi case "$output" in @@ -48,7 +52,11 @@ run_update_in_container() { exit 0 ;; *) - echo "failed to run update-monit-nft.sh inside container $cid; retrying in 1s" >&2 + if [ "$attempt" -ge "$max_attempts" ]; then + echo "failed to run update-monit-nft.sh inside container $cid after ${max_attempts} attempts: $output" >&2 + exit 1 + fi + echo "failed to run update-monit-nft.sh inside container $cid (attempt ${attempt}/${max_attempts}): $output; retrying in 1s" >&2 sleep 1 ;; esac diff --git a/ci/scripts/start-bosh.sh b/ci/scripts/start-bosh.sh index 691f9033..b9ac091d 100755 --- a/ci/scripts/start-bosh.sh +++ b/ci/scripts/start-bosh.sh @@ -248,6 +248,7 @@ EOF -o jumpbox-user.yml \ -o /usr/local/local-releases.yml \ -o "$PWD/noble-updates.yml" \ + -o "$PWD/bosh-scaled-out.yml" \ -v director_name=docker \ -v internal_cidr=${docker_network_cidr} \ -v internal_gw=10.245.0.1 \ From 11487a06cbe5df8288f878403f4fcd6064537c2e Mon Sep 17 00:00:00 2001 From: Mike Yeromko Date: Tue, 3 Mar 2026 15:55:35 +0100 Subject: [PATCH 20/28] POC: Noble and cgroupsv2, fix --- acceptance-tests/run-local.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/acceptance-tests/run-local.sh b/acceptance-tests/run-local.sh index 51cc2045..3ec74f89 100755 --- a/acceptance-tests/run-local.sh +++ b/acceptance-tests/run-local.sh @@ -8,14 +8,14 @@ PARALLELISM="" KEEP_RUNNING="" usage() { - echo -e "Usage: $0 [-F ] [-k] + echo -e "Usage: $0 [-F ] [-P ] [-k] -F Focus on a particular test. Expects a Ginkgo test name. Keep bosh running afterwards. -P Set Ginkgo parallel node count. Default is '-p' (smart parallelism). -k Keep bosh container running. Useful for debug." 1>&2; exit 1; } -while getopts ":F:k" o; do +while getopts ":F:P:k" o; do case "${o}" in F) FOCUS=${OPTARG} From aec0b1ba5e3be26230430d53258506cb24758351 Mon Sep 17 00:00:00 2001 From: Mike Yeromko Date: Wed, 4 Mar 2026 09:11:59 +0100 Subject: [PATCH 21/28] POC: Noble and cgroupsv2, give new Docker CPI release a try --- ci/noble-updates.yml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/ci/noble-updates.yml b/ci/noble-updates.yml index c9ec337b..7031cd6c 100644 --- a/ci/noble-updates.yml +++ b/ci/noble-updates.yml @@ -4,6 +4,12 @@ - type: replace path: /instance_groups/name=bosh/properties/docker_cpi/start_containers_with_systemd? value: true +- type: replace + path: /stemcells/name=default? + value: + alias: default + os: ubuntu-noble + version: latest - type: replace path: /addons?/name=bosh-dns-systemd? value: From b6e80c1e662e2ad55dc84e7d45941897655cad6a Mon Sep 17 00:00:00 2001 From: Mike Yeromko Date: Wed, 4 Mar 2026 09:12:09 +0100 Subject: [PATCH 22/28] POC: Noble and cgroupsv2, give new Docker CPI release a try --- ci/scripts/start-bosh.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/ci/scripts/start-bosh.sh b/ci/scripts/start-bosh.sh index b9ac091d..1aa99167 100755 --- a/ci/scripts/start-bosh.sh +++ b/ci/scripts/start-bosh.sh @@ -259,10 +259,10 @@ EOF "${@}" > "${local_bosh_dir}/bosh-director.yml" # TODO: remove this workaround once we have a fix for the underlying issue with nftables - echo "Starting monit-nft-watcher to correct NF table for any starting container..." >&2 - if ! pgrep -f '/usr/local/sbin/watch-containers.sh' >/dev/null; then - nohup /usr/local/sbin/watch-containers.sh >/var/log/monit-nft-watcher.log 2>&1 & - fi + #echo "Starting monit-nft-watcher to correct NF table for any starting container..." >&2 + #if ! pgrep -f '/usr/local/sbin/watch-containers.sh' >/dev/null; then + # nohup /usr/local/sbin/watch-containers.sh >/var/log/monit-nft-watcher.log 2>&1 & + #fi echo "Creating BOSH director environment..." >&2 bosh create-env "${local_bosh_dir}/bosh-director.yml" \ From 03355888249d5cd8a75c93f2e047db1f9dbeb168 Mon Sep 17 00:00:00 2001 From: Mike Yeromko Date: Wed, 4 Mar 2026 09:20:51 +0100 Subject: [PATCH 23/28] POC: Noble and cgroupsv2, give new Docker CPI release a try --- ci/noble-updates.yml | 6 ------ 1 file changed, 6 deletions(-) diff --git a/ci/noble-updates.yml b/ci/noble-updates.yml index 7031cd6c..c9ec337b 100644 --- a/ci/noble-updates.yml +++ b/ci/noble-updates.yml @@ -4,12 +4,6 @@ - type: replace path: /instance_groups/name=bosh/properties/docker_cpi/start_containers_with_systemd? value: true -- type: replace - path: /stemcells/name=default? - value: - alias: default - os: ubuntu-noble - version: latest - type: replace path: /addons?/name=bosh-dns-systemd? value: From 1f32e942e7c438cff6c037ac567405dbe2b44b4f Mon Sep 17 00:00:00 2001 From: Mike Yeromko Date: Wed, 4 Mar 2026 10:35:49 +0100 Subject: [PATCH 24/28] POC: Noble and cgroupsv2, give new Docker CPI release a try --- ci/scripts/start-bosh.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/ci/scripts/start-bosh.sh b/ci/scripts/start-bosh.sh index 1aa99167..0748451a 100755 --- a/ci/scripts/start-bosh.sh +++ b/ci/scripts/start-bosh.sh @@ -247,7 +247,6 @@ EOF -o docker/cpi.yml \ -o jumpbox-user.yml \ -o /usr/local/local-releases.yml \ - -o "$PWD/noble-updates.yml" \ -o "$PWD/bosh-scaled-out.yml" \ -v director_name=docker \ -v internal_cidr=${docker_network_cidr} \ From 411582f56c5e783f0a7ec8a6803313f2d5002f64 Mon Sep 17 00:00:00 2001 From: Mike Yeromko Date: Wed, 4 Mar 2026 11:12:27 +0100 Subject: [PATCH 25/28] POC: Noble and cgroupsv2 with new Docker CPI release --- acceptance-tests/run-local.sh | 11 ++- ci/Dockerfile | 6 -- ci/noble-updates.yml | 104 ----------------------------- ci/scripts/nft/update-monit-nft.sh | 64 ------------------ ci/scripts/nft/watch-containers.sh | 83 ----------------------- ci/scripts/start-bosh.sh | 6 -- 6 files changed, 4 insertions(+), 270 deletions(-) delete mode 100644 ci/noble-updates.yml delete mode 100644 ci/scripts/nft/update-monit-nft.sh delete mode 100644 ci/scripts/nft/watch-containers.sh diff --git a/acceptance-tests/run-local.sh b/acceptance-tests/run-local.sh index 3ec74f89..cefae2fb 100755 --- a/acceptance-tests/run-local.sh +++ b/acceptance-tests/run-local.sh @@ -37,14 +37,14 @@ shift $((OPTIND-1)) check_required_files() { PIDS="" REQUIRED_FILE_PATTERNS=( - ci/scripts/stemcell/bosh-stemcell-*-ubuntu-noble.tgz!https://storage.googleapis.com/bosh-core-stemcells/1.238/bosh-stemcell-1.238-warden-boshlite-ubuntu-noble.tgz!no - ci/scripts/stemcell-jammy/bosh-stemcell-*-ubuntu-jammy-*.tgz!https://bosh.io/d/stemcells/bosh-warden-boshlite-ubuntu-jammy-go_agent!yes + bosh upload-stemcell --sha1 fa990a329742e4be8a5ac1402d3ad9c726835f90 \ + ci/scripts/stemcell/bosh-stemcell-*-ubuntu-noble.tgz!https://bosh.io/d/stemcells/bosh-warden-boshlite-ubuntu-noble?v=1.267 + ci/scripts/stemcell-jammy/bosh-stemcell-*-ubuntu-jammy-*.tgz!https://bosh.io/d/stemcells/bosh-warden-boshlite-ubuntu-jammy-go_agent ) for entry in "${REQUIRED_FILE_PATTERNS[@]}"; do pattern=$(cut -f1 -d! <<<"$entry") url=$(cut -f2 -d! <<<"$entry") - to_resolve=$(cut -f3 -d! <<<"$entry") folder=$(realpath "$(dirname "$REPO_DIR/$pattern")") filepattern=$(basename "$pattern") pattern=$folder/$filepattern @@ -58,10 +58,7 @@ check_required_files() { ( echo "$filepattern not found, downloading." cd "$folder" - resolved="$url" - if [ "$to_resolve" == "yes" ]; then - resolved=$(curl -s --write-out '\n%{redirect_url}' "$url" | tail -n1 | tr -d '\n') - fi + resolved=$(curl -s --write-out '\n%{redirect_url}' "$url" | tail -n1 | tr -d '\n') echo "Resolved URL: $resolved" curl -s --remote-name --remote-header-name --location "$resolved" echo "Downloaded '$url' successfully." diff --git a/ci/Dockerfile b/ci/Dockerfile index ed63fdc5..cf690bc1 100644 --- a/ci/Dockerfile +++ b/ci/Dockerfile @@ -10,14 +10,8 @@ RUN apt-get update && \ RUN echo "source /tmp/local-bosh/director/env" >> /root/.bashrc # Copy ops files -COPY noble-updates.yml /usr/local/bosh-deployment/noble-updates.yml COPY bosh-scaled-out.yml /usr/local/bosh-deployment/bosh-scaled-out.yml -# Copy scripts -COPY scripts/nft/update-monit-nft.sh /usr/local/sbin/update-monit-nft.sh -COPY scripts/nft/watch-containers.sh /usr/local/sbin/watch-containers.sh -RUN chmod 755 /usr/local/sbin/update-monit-nft.sh /usr/local/sbin/watch-containers.sh - # Install Python libraries needed for scripts RUN python3 -m venv /opt/venv ENV PATH="/opt/venv/bin:${PATH}" diff --git a/ci/noble-updates.yml b/ci/noble-updates.yml deleted file mode 100644 index c9ec337b..00000000 --- a/ci/noble-updates.yml +++ /dev/null @@ -1,104 +0,0 @@ -- type: replace - path: /cloud_provider/properties/docker_cpi/start_containers_with_systemd? - value: true -- type: replace - path: /instance_groups/name=bosh/properties/docker_cpi/start_containers_with_systemd? - value: true -- type: replace - path: /addons?/name=bosh-dns-systemd? - value: - include: - stemcell: - - os: ubuntu-noble - jobs: - - name: bosh-dns - properties: - api: - client: - tls: ((dns_api_client_tls)) - server: - tls: ((dns_api_server_tls)) - cache: - enabled: true - configure_systemd_resolved: true - disable_recursors: true - health: - client: - tls: ((dns_healthcheck_client_tls)) - enabled: true - server: - tls: ((dns_healthcheck_server_tls)) - override_nameserver: false - release: bosh-dns - name: bosh-dns-systemd -- type: replace - path: /releases/name=bosh-dns? - value: - name: bosh-dns - sha1: 494d9e6ff68909a3aaddf146464dd4599f9f16a8 - url: https://bosh.io/d/github.com/cloudfoundry/bosh-dns-release?v=1.39.21 - version: 1.39.21 -- type: replace - path: /variables/name=dns_healthcheck_tls_ca? - value: - name: dns_healthcheck_tls_ca - options: - common_name: dns-healthcheck-tls-ca - is_ca: true - type: certificate -- type: replace - path: /variables/name=dns_healthcheck_server_tls? - value: - name: dns_healthcheck_server_tls - options: - alternative_names: - - health.bosh-dns - ca: dns_healthcheck_tls_ca - common_name: health.bosh-dns - extended_key_usage: - - server_auth - type: certificate -- type: replace - path: /variables/name=dns_healthcheck_client_tls? - value: - name: dns_healthcheck_client_tls - options: - alternative_names: - - health.bosh-dns - ca: dns_healthcheck_tls_ca - common_name: health.bosh-dns - extended_key_usage: - - client_auth - type: certificate -- type: replace - path: /variables/name=dns_api_tls_ca? - value: - name: dns_api_tls_ca - options: - common_name: dns-api-tls-ca - is_ca: true - type: certificate -- type: replace - path: /variables/name=dns_api_server_tls? - value: - name: dns_api_server_tls - options: - alternative_names: - - api.bosh-dns - ca: dns_api_tls_ca - common_name: api.bosh-dns - extended_key_usage: - - server_auth - type: certificate -- type: replace - path: /variables/name=dns_api_client_tls? - value: - name: dns_api_client_tls - options: - alternative_names: - - api.bosh-dns - ca: dns_api_tls_ca - common_name: api.bosh-dns - extended_key_usage: - - client_auth - type: certificate \ No newline at end of file diff --git a/ci/scripts/nft/update-monit-nft.sh b/ci/scripts/nft/update-monit-nft.sh deleted file mode 100644 index 8b326361..00000000 --- a/ci/scripts/nft/update-monit-nft.sh +++ /dev/null @@ -1,64 +0,0 @@ -#!/bin/sh -set -eu - -# Enable DNS -tee /etc/resolv.conf >/dev/null <<'EOF' -nameserver 1.1.1.1 -nameserver 8.8.8.8 -options timeout:2 attempts:2 -EOF - -# Update NF tables rules for monit to use the correct cgroup path for the agent -DATE=$(date +%s) -NFT_FILE=/etc/nftables/monit.nft -BACKUP="${NFT_FILE}.bak.${DATE}" -TMP="$(mktemp /tmp/monit.nft.XXXXXX)" - -# Get ControlGroup value for bosh-agent.service -cg=$(systemctl show -p ControlGroup --value bosh-agent.service 2>/dev/null || true) -if [ -z "$cg" ]; then - echo "bosh-agent.service ControlGroup not found" >&2 - exit 1 -fi -cg=${cg#/} # remove leading slash if present -echo "Found ControlGroup for bosh-agent.service: $cg" - -# Replace the quoted cgroup path in the socket rule that matches the ip/tcp part -# The expected nft rule begins with: socket cgroupv2 level "" ip daddr 127.0.0.1 ... -awk_status=0 -awk -v new="$cg" ' -BEGIN { replaced = 0 } -/^[[:space:]]*socket[[:space:]]+cgroupv2[[:space:]]+level[[:space:]]+[0-9]+[[:space:]]+"[^"]+"[[:space:]]+ip[[:space:]]+daddr[[:space:]]+127\.0\.0\.1/ { - sub(/"[^"]+"/, "\"" new "\"", $0) - replaced = 1 -} -{ print } -END { if (replaced == 0) exit 3 } -' "$NFT_FILE" > "$TMP" || awk_status=$? -if [ "$awk_status" -eq 3 ]; then - echo "monit.nft socket rule not found; no changes made" >&2 - rm -f "$TMP" - exit 1 -elif [ "$awk_status" -ne 0 ]; then - echo "failed to update monit.nft (awk error $awk_status)" >&2 - rm -f "$TMP" - exit 1 -fi - -# Backup & atomically replace and try to reload nft -cp -p "$NFT_FILE" "$BACKUP" -if mv "$TMP" "$NFT_FILE"; then - if nft -f "$NFT_FILE"; then - echo "Updated monit.nft to cgroup: $cg" - exit 0 - else - echo "nft load failed, restoring backup" >&2 - mv "$BACKUP" "$NFT_FILE" - nft -f "$NFT_FILE" || echo "failed to restore nft rules; check $NFT_FILE and $BACKUP" >&2 - exit 1 - fi -else - echo "failed to replace $NFT_FILE" >&2 - rm -f "$TMP" - exit 1 -fi \ No newline at end of file diff --git a/ci/scripts/nft/watch-containers.sh b/ci/scripts/nft/watch-containers.sh deleted file mode 100644 index 82875071..00000000 --- a/ci/scripts/nft/watch-containers.sh +++ /dev/null @@ -1,83 +0,0 @@ -#!/bin/sh -set -eu - -# Find the first /tmp/tmp.* directory to use as DOCKER_CERT_PATH -CERT_DIR=$(find /tmp -maxdepth 1 -type d -regex '/tmp/tmp\.[A-Za-z0-9][A-Za-z0-9]*' -print | head -n 1) -if [ -z "$CERT_DIR" ]; then - echo "DOCKER_CERT_PATH not found (no /tmp/tmp.* directory)" >&2 - exit 1 -fi - -# Setup environment variables to talk to the host's Docker daemon -export DOCKER_HOST=tcp://172.17.0.2:4243 -export DOCKER_TLS_VERIFY=1 -export DOCKER_CERT_PATH="$CERT_DIR" - -SCRIPT_PATH=/usr/local/sbin/update-monit-nft.sh -BACKGROUND_PIDS="" - -# Kill any background update processes on exit -cleanup() { - for pid in $BACKGROUND_PIDS; do - kill "$pid" 2>/dev/null || true - done - echo "monit-nft watcher interrupted; exiting" >&2 - exit 0 -} -trap cleanup INT TERM - -# Run the update script inside the container, retrying if it fails due to the container not being ready yet -run_update_in_container() { - cid="$1" - if [ ! -r "$SCRIPT_PATH" ]; then - echo "missing host script: $SCRIPT_PATH" >&2 - return 1 - fi - ( - attempt=0 - max_attempts=100 - while [ "$attempt" -lt "$max_attempts" ]; do - attempt=$((attempt + 1)) - if output=$(docker exec -i "$cid" /bin/sh -s -- < "$SCRIPT_PATH" 2>&1); then - echo "successfully ran update-monit-nft.sh inside container $cid" >&2 - exit 0 - fi - case "$output" in - *"No such container"*) - echo "container $cid no longer exists; stop retrying" >&2 - exit 0 - ;; - *"is not running"*) - echo "container $cid is not running; stop retrying" >&2 - exit 0 - ;; - *) - if [ "$attempt" -ge "$max_attempts" ]; then - echo "failed to run update-monit-nft.sh inside container $cid after ${max_attempts} attempts: $output" >&2 - exit 1 - fi - echo "failed to run update-monit-nft.sh inside container $cid (attempt ${attempt}/${max_attempts}): $output; retrying in 1s" >&2 - sleep 1 - ;; - esac - done - ) & - pid=$! - BACKGROUND_PIDS="$BACKGROUND_PIDS $pid" - echo "started background update for container $cid (pid $pid)" >&2 -} - -# initial update for any already-running containers -for cid in $(docker ps -q 2>/dev/null); do - run_update_in_container "$cid" -done - -# watch for new containers and run update inside them -while true; do - docker events --filter 'event=start' --format '{{.Actor.ID}} {{.Type}} {{.Action}}' | while read -r id type action; do - echo "Received docker event: ID=$id Type=$type Action=$action" - run_update_in_container "$id" - done || true - echo "docker events stream ended or failed; retrying after 1s" >&2 - sleep 1 -done diff --git a/ci/scripts/start-bosh.sh b/ci/scripts/start-bosh.sh index 0748451a..be9bffd7 100755 --- a/ci/scripts/start-bosh.sh +++ b/ci/scripts/start-bosh.sh @@ -257,12 +257,6 @@ EOF -v docker_tls="$(cat "${local_bosh_dir}/docker_tls.json")" \ "${@}" > "${local_bosh_dir}/bosh-director.yml" - # TODO: remove this workaround once we have a fix for the underlying issue with nftables - #echo "Starting monit-nft-watcher to correct NF table for any starting container..." >&2 - #if ! pgrep -f '/usr/local/sbin/watch-containers.sh' >/dev/null; then - # nohup /usr/local/sbin/watch-containers.sh >/var/log/monit-nft-watcher.log 2>&1 & - #fi - echo "Creating BOSH director environment..." >&2 bosh create-env "${local_bosh_dir}/bosh-director.yml" \ --vars-store="${local_bosh_dir}/creds.yml" \ From b404255cdc192bdb26bc7d4c002713f2a43d0b22 Mon Sep 17 00:00:00 2001 From: Mike Yeromko Date: Wed, 4 Mar 2026 11:16:03 +0100 Subject: [PATCH 26/28] POC: Noble and cgroupsv2 with new Docker CPI release, remove trash --- acceptance-tests/run-local.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/acceptance-tests/run-local.sh b/acceptance-tests/run-local.sh index cefae2fb..8444a1cc 100755 --- a/acceptance-tests/run-local.sh +++ b/acceptance-tests/run-local.sh @@ -37,7 +37,6 @@ shift $((OPTIND-1)) check_required_files() { PIDS="" REQUIRED_FILE_PATTERNS=( - bosh upload-stemcell --sha1 fa990a329742e4be8a5ac1402d3ad9c726835f90 \ ci/scripts/stemcell/bosh-stemcell-*-ubuntu-noble.tgz!https://bosh.io/d/stemcells/bosh-warden-boshlite-ubuntu-noble?v=1.267 ci/scripts/stemcell-jammy/bosh-stemcell-*-ubuntu-jammy-*.tgz!https://bosh.io/d/stemcells/bosh-warden-boshlite-ubuntu-jammy-go_agent ) From fbb2551584b0b784f0609b889db1d16d31fb4765 Mon Sep 17 00:00:00 2001 From: Mike Yeromko Date: Wed, 4 Mar 2026 13:24:39 +0100 Subject: [PATCH 27/28] cleanup: removed unused code --- .gitignore | 1 - acceptance-tests/run-local.sh | 10 +++++----- ci/bosh-scaled-out.yml | 2 +- ci/scripts/start-bosh.sh | 4 ---- 4 files changed, 6 insertions(+), 11 deletions(-) diff --git a/.gitignore b/.gitignore index 54fa5535..881ead69 100644 --- a/.gitignore +++ b/.gitignore @@ -5,7 +5,6 @@ releases/*.tgz releases/**/*.tgz ci/scripts/stemcell/*.tgz ci/scripts/stemcell-jammy/*.tgz -ci/scripts/stemcell-bionic/*.tgz dev_releases blobs/* .blobs diff --git a/acceptance-tests/run-local.sh b/acceptance-tests/run-local.sh index 8444a1cc..6a8cb411 100755 --- a/acceptance-tests/run-local.sh +++ b/acceptance-tests/run-local.sh @@ -56,11 +56,11 @@ check_required_files() { ( echo "$filepattern not found, downloading." - cd "$folder" - resolved=$(curl -s --write-out '\n%{redirect_url}' "$url" | tail -n1 | tr -d '\n') - echo "Resolved URL: $resolved" - curl -s --remote-name --remote-header-name --location "$resolved" - echo "Downloaded '$url' successfully." + cd "$folder" && \ + resolved=$(curl -s --write-out '\n%{redirect_url}' "$url" | tail -n1 | tr -d '\n') && \ + echo "Resolved URL: $resolved" && \ + curl -s --remote-name --remote-header-name --location "$resolved" && \ + echo "Downloaded '$url' successfully." && \ ls -1lh "$folder/"$filepattern )& diff --git a/ci/bosh-scaled-out.yml b/ci/bosh-scaled-out.yml index 93937df3..8c8d9b62 100644 --- a/ci/bosh-scaled-out.yml +++ b/ci/bosh-scaled-out.yml @@ -1,3 +1,3 @@ - type: replace path: /instance_groups/name=bosh/properties/director/workers? - value: 12 \ No newline at end of file + value: 12 diff --git a/ci/scripts/start-bosh.sh b/ci/scripts/start-bosh.sh index be9bffd7..482fbed8 100755 --- a/ci/scripts/start-bosh.sh +++ b/ci/scripts/start-bosh.sh @@ -104,10 +104,6 @@ function sanitize_cgroups() { done } -function stop_docker() { - service docker stop -} - function start_docker() { local certs_dir certs_dir="${1}" From f7ed91da05f89b58de566f860684ac519061a49f Mon Sep 17 00:00:00 2001 From: Mike Yeromko Date: Wed, 4 Mar 2026 13:29:35 +0100 Subject: [PATCH 28/28] refact: usage of link to the latest Noble stemcell --- acceptance-tests/run-local.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/acceptance-tests/run-local.sh b/acceptance-tests/run-local.sh index 6a8cb411..01d402f8 100755 --- a/acceptance-tests/run-local.sh +++ b/acceptance-tests/run-local.sh @@ -37,7 +37,7 @@ shift $((OPTIND-1)) check_required_files() { PIDS="" REQUIRED_FILE_PATTERNS=( - ci/scripts/stemcell/bosh-stemcell-*-ubuntu-noble.tgz!https://bosh.io/d/stemcells/bosh-warden-boshlite-ubuntu-noble?v=1.267 + ci/scripts/stemcell/bosh-stemcell-*-ubuntu-noble.tgz!https://bosh.io/d/stemcells/bosh-warden-boshlite-ubuntu-noble ci/scripts/stemcell-jammy/bosh-stemcell-*-ubuntu-jammy-*.tgz!https://bosh.io/d/stemcells/bosh-warden-boshlite-ubuntu-jammy-go_agent )