From e779ed41323e5a399e77f29ad786144014f33bf4 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Mon, 9 Feb 2026 20:54:32 -0500 Subject: [PATCH 01/11] Switch Phoenix GPU jobs to H200 nodes for faster scheduling Co-Authored-By: Claude Opus 4.6 --- .github/workflows/phoenix/submit-bench.sh | 5 ++--- .github/workflows/phoenix/submit.sh | 5 ++--- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/.github/workflows/phoenix/submit-bench.sh b/.github/workflows/phoenix/submit-bench.sh index 7ae85e66fe..fc28b3046b 100644 --- a/.github/workflows/phoenix/submit-bench.sh +++ b/.github/workflows/phoenix/submit-bench.sh @@ -20,9 +20,8 @@ sbatch_cpu_opts="\ " sbatch_gpu_opts="\ -#SBATCH -CL40S -#SBATCH --ntasks-per-node=4 # Number of cores per node required -#SBATCH -G2\ +#SBATCH --gres=gpu:H200:2 +#SBATCH --ntasks-per-node=8 # Number of cores per node required\ " if [ "$2" = "cpu" ]; then diff --git a/.github/workflows/phoenix/submit.sh b/.github/workflows/phoenix/submit.sh index 06a03e465a..5747c839f0 100755 --- a/.github/workflows/phoenix/submit.sh +++ b/.github/workflows/phoenix/submit.sh @@ -23,9 +23,8 @@ sbatch_cpu_opts="\ " sbatch_gpu_opts="\ -#SBATCH -p gpu-v100,gpu-a100,gpu-h100,gpu-l40s -#SBATCH --ntasks-per-node=4 # Number of cores per node required -#SBATCH -G2\ +#SBATCH --gres=gpu:H200:2 +#SBATCH --ntasks-per-node=8 # Number of cores per node required\ " if [ "$2" = "cpu" ]; then From 9cf00d3ee1f479a8902a90012fde4488602308b8 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Wed, 11 Feb 2026 21:34:12 -0500 Subject: [PATCH 02/11] Fix bash segfault in monitor_slurm_job.sh from fractional read timeout read -t 0.1 (sub-second timeout) in a loop with process substitution file descriptors triggers a bash internal error (unwind_frame_run: read_builtin: frame not found) leading to a segfault. Use integer timeout (read -t 1) instead. Co-Authored-By: Claude Opus 4.6 --- .github/scripts/monitor_slurm_job.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/scripts/monitor_slurm_job.sh b/.github/scripts/monitor_slurm_job.sh index 27472e01ef..232a894f8a 100755 --- a/.github/scripts/monitor_slurm_job.sh +++ b/.github/scripts/monitor_slurm_job.sh @@ -64,7 +64,7 @@ while true; do # Try to read from tail output (non-blocking via timeout) # Read multiple lines if available to avoid falling behind lines_read=0 - while IFS= read -r -t 0.1 line <&3 2>/dev/null; do + while IFS= read -r -t 1 line <&3 2>/dev/null; do echo "$line" lines_read=$((lines_read + 1)) last_heartbeat=$(date +%s) @@ -115,7 +115,7 @@ done # Drain any remaining output from tail after job completes echo "Draining remaining output..." drain_count=0 -while IFS= read -r -t 0.5 line <&3 2>/dev/null; do +while IFS= read -r -t 1 line <&3 2>/dev/null; do echo "$line" drain_count=$((drain_count + 1)) # Safety limit to avoid infinite loop From a59db02c83acbab62874c75c978b8ddb033d7323 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Thu, 12 Feb 2026 13:56:50 -0500 Subject: [PATCH 03/11] Restore pull_request_review trigger for benchmark workflow PR #1124 changed bench.yml to use workflow_run (triggered after Test Suite completes), which broke the approve-to-run flow for fork PRs. Revert to the original pull_request + pull_request_review triggers while keeping improvements (frontier_amd matrix, concurrency group, timeout, run_parallel_benchmarks.sh). 
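For readability, the dense one-line `if:` gate in the diff below expands to the following (same logic, reformatted; not an additional change):

    if: >
      github.repository == 'MFlowCode/MFC' &&
      needs.file-changes.outputs.checkall == 'true' &&
      (
        (github.event_name == 'pull_request_review' &&
         github.event.review.state == 'approved') ||
        (github.event_name == 'pull_request' &&
         (github.event.pull_request.user.login == 'sbryngelson' ||
          github.event.pull_request.user.login == 'wilfonba')) ||
        github.event_name == 'workflow_dispatch'
      )

In words: fork PRs run benchmarks once a maintainer approves, the two trusted authors run on every push, and manual dispatch always runs.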
Co-Authored-By: Claude Opus 4.6 --- .github/workflows/bench.yml | 65 ++++--------------------------------- 1 file changed, 7 insertions(+), 58 deletions(-) diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml index 6279f5f578..fd240b7a11 100644 --- a/.github/workflows/bench.yml +++ b/.github/workflows/bench.yml @@ -1,32 +1,24 @@ name: 'Benchmark' on: - # Trigger when Test Suite completes (no polling needed) - workflow_run: - workflows: ["Test Suite"] - types: [completed] + pull_request: + pull_request_review: + types: [submitted] workflow_dispatch: concurrency: - group: ${{ github.workflow }}-${{ github.event.workflow_run.head_branch || github.ref }} + group: ${{ github.workflow }}-${{ github.ref }} cancel-in-progress: true jobs: file-changes: name: Detect File Changes - # Only run if Test Suite passed (or manual dispatch) - if: github.event_name == 'workflow_dispatch' || github.event.workflow_run.conclusion == 'success' runs-on: 'ubuntu-latest' outputs: checkall: ${{ steps.changes.outputs.checkall }} - pr_number: ${{ steps.pr-info.outputs.pr_number }} - pr_approved: ${{ steps.pr-info.outputs.approved }} - pr_author: ${{ steps.pr-info.outputs.author }} steps: - name: Clone uses: actions/checkout@v4 - with: - ref: ${{ github.event.workflow_run.head_sha || github.sha }} - name: Detect Changes uses: dorny/paths-filter@v3 @@ -34,52 +26,10 @@ jobs: with: filters: ".github/file-filter.yml" - - name: Get PR Info - id: pr-info - env: - GH_TOKEN: ${{ github.token }} - run: | - if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then - echo "pr_number=" >> $GITHUB_OUTPUT - echo "approved=true" >> $GITHUB_OUTPUT - echo "author=${{ github.actor }}" >> $GITHUB_OUTPUT - else - # Get PR number from workflow_run - PR_NUMBER="${{ github.event.workflow_run.pull_requests[0].number }}" - if [ -n "$PR_NUMBER" ]; then - echo "pr_number=$PR_NUMBER" >> $GITHUB_OUTPUT - - # Fetch actual PR author from API (workflow_run.actor is the re-runner, not PR author) - PR_AUTHOR=$(gh api repos/${{ github.repository }}/pulls/$PR_NUMBER --jq '.user.login') - echo "author=$PR_AUTHOR" >> $GITHUB_OUTPUT - - # Check if PR is approved - APPROVED=$(gh api repos/${{ github.repository }}/pulls/$PR_NUMBER/reviews \ - --jq '[.[] | select(.state == "APPROVED")] | length') - if [ "$APPROVED" -gt 0 ]; then - echo "approved=true" >> $GITHUB_OUTPUT - else - echo "approved=false" >> $GITHUB_OUTPUT - fi - else - echo "pr_number=" >> $GITHUB_OUTPUT - echo "approved=false" >> $GITHUB_OUTPUT - echo "author=" >> $GITHUB_OUTPUT - fi - fi - self: name: "${{ matrix.name }} (${{ matrix.device }}${{ matrix.interface != 'none' && format('-{0}', matrix.interface) || '' }})" - if: > - github.repository == 'MFlowCode/MFC' && - needs.file-changes.outputs.checkall == 'true' && - ( - github.event_name == 'workflow_dispatch' || - needs.file-changes.outputs.pr_approved == 'true' || - needs.file-changes.outputs.pr_author == 'sbryngelson' || - needs.file-changes.outputs.pr_author == 'wilfonba' - ) - needs: [file-changes] + if: ${{ github.repository=='MFlowCode/MFC' && needs.file-changes.outputs.checkall=='true' && ((github.event_name=='pull_request_review' && github.event.review.state=='approved') || (github.event_name=='pull_request' && (github.event.pull_request.user.login=='sbryngelson' || github.event.pull_request.user.login=='wilfonba')) || github.event_name=='workflow_dispatch') }} + needs: file-changes strategy: fail-fast: false matrix: @@ -143,7 +93,6 @@ jobs: - name: Clone - PR uses: actions/checkout@v4 with: - ref: ${{ 
github.event.workflow_run.head_sha || github.sha }} path: pr - name: Clone - Master @@ -155,7 +104,7 @@ jobs: - name: Setup & Build if: matrix.build_script != '' - run: | + run: | (cd pr && ${{ matrix.build_script }}) & (cd master && ${{ matrix.build_script }}) & wait %1 && wait %2 From 2efc61e1eb98a8ea1287275c407cff5539a153f1 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Thu, 12 Feb 2026 17:28:56 -0500 Subject: [PATCH 04/11] Auto-retry sporadic test failures in CI Write failed test UUIDs to tests/failed_uuids.txt after a test run. In CI, if 1-5 tests fail, automatically re-run just those tests. If 6+ fail, treat it as a real issue and fail immediately. Co-Authored-By: Claude Opus 4.6 --- .github/workflows/test.yml | 19 +++++++++++++++++-- toolchain/mfc/test/test.py | 9 +++++++++ 2 files changed, 26 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 0be51076ec..3a5a0e33d7 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -134,8 +134,23 @@ jobs: TEST_ALL: ${{ matrix.mpi == 'mpi' && '--test-all' || '' }} - name: Test - run: | - /bin/bash mfc.sh test -v --max-attempts 3 -j $(nproc) $TEST_ALL $TEST_PCT + run: | + /bin/bash mfc.sh test -v --max-attempts 3 -j $(nproc) $TEST_ALL $TEST_PCT || true + + # Retry only if a small number of tests failed (sporadic failures) + if [ -f tests/failed_uuids.txt ]; then + NUM_FAILED=$(wc -l < tests/failed_uuids.txt) + if [ "$NUM_FAILED" -le 5 ]; then + FAILED=$(cat tests/failed_uuids.txt | tr '\n' ' ') + echo "" + echo "=== Retrying $NUM_FAILED failed test(s): $FAILED ===" + echo "" + /bin/bash mfc.sh test -v --max-attempts 3 -j $(nproc) --only $FAILED $TEST_ALL + else + echo "Too many failures ($NUM_FAILED) to retry — likely a real issue." + exit 1 + fi + fi env: TEST_ALL: ${{ matrix.mpi == 'mpi' && '--test-all' || '' }} TEST_PCT: ${{ matrix.debug == 'debug' && '-% 20' || '' }} diff --git a/toolchain/mfc/test/test.py b/toolchain/mfc/test/test.py index 31a3771cb9..d6dce92436 100644 --- a/toolchain/mfc/test/test.py +++ b/toolchain/mfc/test/test.py @@ -206,6 +206,15 @@ def test(): # Build the summary report _print_test_summary(nPASS, nFAIL, nSKIP, minutes, seconds, failed_tests, skipped_cases) + # Write failed UUIDs to file for CI retry logic + failed_uuids_path = os.path.join(common.MFC_TEST_DIR, "failed_uuids.txt") + if failed_tests: + with open(failed_uuids_path, "w") as f: + for test_info in failed_tests: + f.write(test_info['uuid'] + "\n") + elif os.path.exists(failed_uuids_path): + os.remove(failed_uuids_path) + exit(nFAIL) From 0658bd348512de9f051b8ca4c7adbbb9a19f576b Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Thu, 12 Feb 2026 17:40:49 -0500 Subject: [PATCH 05/11] Preserve exit code for catastrophic test failures Don't mask non-zero exit codes when tests crash before writing failed_uuids.txt. Only suppress the exit code when the file exists (meaning the test framework ran to completion and we can retry). Co-Authored-By: Claude Opus 4.6 --- .github/workflows/test.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 3a5a0e33d7..eec9d19fd0 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -135,7 +135,8 @@ jobs: - name: Test run: | - /bin/bash mfc.sh test -v --max-attempts 3 -j $(nproc) $TEST_ALL $TEST_PCT || true + TEST_EXIT=0 + /bin/bash mfc.sh test -v --max-attempts 3 -j $(nproc) $TEST_ALL $TEST_PCT || TEST_EXIT=$? 
# Retry only if a small number of tests failed (sporadic failures) if [ -f tests/failed_uuids.txt ]; then @@ -150,6 +151,8 @@ jobs: echo "Too many failures ($NUM_FAILED) to retry — likely a real issue." exit 1 fi + elif [ "$TEST_EXIT" -ne 0 ]; then + exit $TEST_EXIT fi env: TEST_ALL: ${{ matrix.mpi == 'mpi' && '--test-all' || '' }} From c6b6f8134409a0f99a375327752e4a5eee0c834d Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Thu, 12 Feb 2026 21:48:39 -0500 Subject: [PATCH 06/11] Harden SLURM monitor: robust state checks, orphan cleanup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace squeue exit-code polling with get_job_state() that parses the actual state string (squeue + sacct fallback). Never give up on UNKNOWN state — CI timeout is the backstop. Cancel orphaned SLURM jobs on abnormal monitor exit. Include job state in heartbeats. Incorporates changes from PR #1140. Co-Authored-By: Claude Opus 4.6 --- .github/scripts/monitor_slurm_job.sh | 138 +++++++++++++++++---------- 1 file changed, 85 insertions(+), 53 deletions(-) diff --git a/.github/scripts/monitor_slurm_job.sh b/.github/scripts/monitor_slurm_job.sh index 232a894f8a..408d205aab 100755 --- a/.github/scripts/monitor_slurm_job.sh +++ b/.github/scripts/monitor_slurm_job.sh @@ -4,11 +4,17 @@ set -euo pipefail -# Cleanup handler to prevent orphaned tail processes +# Cleanup handler to prevent orphaned tail processes and cancel orphaned jobs cleanup() { if [ -n "${tail_pid:-}" ]; then kill "${tail_pid}" 2>/dev/null || true fi + # Cancel the SLURM job if the monitor is exiting due to an error + # (e.g., the CI runner is being killed). Don't cancel on success. + if [ "${monitor_success:-0}" -ne 1 ] && [ -n "${job_id:-}" ]; then + echo "Monitor exiting abnormally — cancelling SLURM job $job_id" + scancel "$job_id" 2>/dev/null || true + fi } trap cleanup EXIT @@ -23,30 +29,78 @@ output_file="$2" echo "Submitted batch job $job_id" echo "Monitoring output file: $output_file" -# Wait for file to appear with retry logic for transient squeue failures +# Robustly check SLURM job state using squeue with sacct fallback. +# Returns the state string (PENDING, RUNNING, COMPLETED, FAILED, etc.) +# or "UNKNOWN" if both commands fail. +get_job_state() { + local jid="$1" + local state + + # Try squeue first (fast, works for active jobs) + state=$(squeue -j "$jid" -h -o '%T' 2>/dev/null | head -n1 | tr -d ' ') + if [ -n "$state" ]; then + echo "$state" + return + fi + + # Fallback to sacct (works for completed/historical jobs) + if command -v sacct >/dev/null 2>&1; then + state=$(sacct -j "$jid" --format=State --noheader 2>/dev/null | head -n1 | awk '{print $1}') + if [ -n "$state" ]; then + echo "$state" + return + fi + fi + + echo "UNKNOWN" +} + +# Check if a state is terminal (job is done, for better or worse) +is_terminal_state() { + case "$1" in + COMPLETED|FAILED|CANCELLED|CANCELLED+|TIMEOUT|OUT_OF_MEMORY|NODE_FAIL|PREEMPTED|BOOT_FAIL|DEADLINE) + return 0 ;; + *) + return 1 ;; + esac +} + +# Wait for file to appear, using robust state checking. +# Never give up due to transient squeue/sacct failures — the CI job timeout +# is the ultimate backstop. echo "Waiting for job to start..." -squeue_retries=0 -max_squeue_retries=5 +unknown_count=0 while [ ! 
-f "$output_file" ]; do - # Check if job is still queued/running - if squeue -j "$job_id" &>/dev/null; then - squeue_retries=0 # Reset on success - sleep 5 - else - squeue_retries=$((squeue_retries + 1)) - if [ $squeue_retries -ge $max_squeue_retries ]; then - # Job not in queue and output file doesn't exist - if [ ! -f "$output_file" ]; then - echo "ERROR: Job $job_id not in queue and output file not created" + state=$(get_job_state "$job_id") + + case "$state" in + PENDING|CONFIGURING) + unknown_count=0 + sleep 5 + ;; + RUNNING|COMPLETING) + unknown_count=0 + # Job is running but output file not yet visible (NFS delay) + sleep 2 + ;; + UNKNOWN) + unknown_count=$((unknown_count + 1)) + # Only print warning periodically to avoid log spam + if [ $((unknown_count % 12)) -eq 1 ]; then + echo "Warning: Could not query job $job_id state (SLURM may be temporarily unavailable)..." + fi + sleep 5 + ;; + *) + # Terminal state — job finished without creating output + if is_terminal_state "$state"; then + echo "ERROR: Job $job_id reached terminal state ($state) without creating output file" exit 1 fi - break - fi - # Exponential backoff - sleep_time=$((2 ** squeue_retries)) - echo "Warning: squeue check failed, retrying in ${sleep_time}s..." - sleep $sleep_time - fi + # Unrecognized state, keep waiting + sleep 5 + ;; + esac done echo "=== Streaming output for job $job_id ===" @@ -57,7 +111,6 @@ exec 3< <(stdbuf -oL -eL tail -f "$output_file" 2>&1) tail_pid=$! # Monitor job status and stream output simultaneously -squeue_failures=0 last_heartbeat=$(date +%s) while true; do @@ -73,41 +126,22 @@ while true; do break fi done - + # Check job status current_time=$(date +%s) - if ! squeue -j "$job_id" &>/dev/null; then - squeue_failures=$((squeue_failures + 1)) - # Check if job actually completed using sacct (if available) - if [ $squeue_failures -ge 3 ]; then - if command -v sacct >/dev/null 2>&1; then - state=$(sacct -j "$job_id" --format=State --noheader 2>/dev/null | head -n1 | awk '{print $1}') - # Consider job done only if it reached a terminal state - case "$state" in - COMPLETED|FAILED|CANCELLED|TIMEOUT|OUT_OF_MEMORY) - echo "[$(date +%H:%M:%S)] Job $job_id reached terminal state: $state" - break - ;; - *) - # treat as transient failure, reset failures and continue polling - squeue_failures=0 - ;; - esac - else - # No sacct: assume job completed after 3 failures - echo "[$(date +%H:%M:%S)] Job $job_id no longer in queue" - break - fi - fi + state=$(get_job_state "$job_id") + + if is_terminal_state "$state"; then + echo "[$(date +%H:%M:%S)] Job $job_id reached terminal state: $state" + break else - squeue_failures=0 # Print heartbeat if no output for 60 seconds if [ $((current_time - last_heartbeat)) -ge 60 ]; then - echo "[$(date +%H:%M:%S)] Job $job_id still running (no new output for 60s)..." + echo "[$(date +%H:%M:%S)] Job $job_id state=$state (no new output for 60s)..." 
last_heartbeat=$current_time fi fi - + # Sleep briefly between status checks sleep 1 done @@ -128,6 +162,7 @@ done # Close the file descriptor and kill tail exec 3<&- kill "${tail_pid}" 2>/dev/null || true +tail_pid="" # Wait for output file to finish growing (stabilize) before stopping tail if [ -f "$output_file" ]; then @@ -149,9 +184,6 @@ if [ -f "$output_file" ]; then done fi -# Stop tailing (trap will also handle this on exit) -kill "${tail_pid}" 2>/dev/null || true - echo "" echo "=== Final output ===" cat "$output_file" @@ -187,6 +219,6 @@ if [ "$exit_code" != "0:0" ]; then exit 1 fi +monitor_success=1 echo "Job $job_id completed successfully" exit 0 - From a82959e1e793bfba5983cad3f0c4c84c85da795c Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Thu, 12 Feb 2026 22:26:38 -0500 Subject: [PATCH 07/11] Use parsable sacct flags for robust state parsing Use -n -X -P flags with sacct: -X restricts to job allocation (not steps), -P gives pipe-delimited output for reliable parsing. Co-Authored-By: Claude Opus 4.6 --- .github/scripts/monitor_slurm_job.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/scripts/monitor_slurm_job.sh b/.github/scripts/monitor_slurm_job.sh index 408d205aab..d9f2237032 100755 --- a/.github/scripts/monitor_slurm_job.sh +++ b/.github/scripts/monitor_slurm_job.sh @@ -45,7 +45,7 @@ get_job_state() { # Fallback to sacct (works for completed/historical jobs) if command -v sacct >/dev/null 2>&1; then - state=$(sacct -j "$jid" --format=State --noheader 2>/dev/null | head -n1 | awk '{print $1}') + state=$(sacct -j "$jid" -n -X -P -o State 2>/dev/null | head -n1 | cut -d'|' -f1) if [ -n "$state" ]; then echo "$state" return From 80229694a2283a5a5b9eac5ad5c5ef123934c669 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Thu, 12 Feb 2026 22:49:45 -0500 Subject: [PATCH 08/11] Guard squeue/sacct pipelines against set -euo pipefail With pipefail, a transient squeue failure would exit the script instead of falling through to return UNKNOWN. Add || true to both pipelines. Also fix stale comment about tail stopping. 
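A minimal, self-contained illustration of the hazard (not part of the patch; `false` stands in for a failing squeue/sacct invocation):

    #!/bin/bash
    set -euo pipefail
    # Unguarded: under pipefail the failing first stage makes the whole
    # pipeline non-zero, the assignment inherits that status, and errexit
    # aborts the script at this line, so "never reached" is never printed.
    state=$(false | head -n1)
    echo "never reached"

With the guard, the same query degrades gracefully:

    state=$(false | head -n1 || true)  # status forced to 0; $state is empty
    # An empty $state falls through to the sacct fallback and, if that also
    # fails, get_job_state returns UNKNOWN as intended.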
Co-Authored-By: Claude Opus 4.6 --- .github/scripts/monitor_slurm_job.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/scripts/monitor_slurm_job.sh b/.github/scripts/monitor_slurm_job.sh index d9f2237032..4981e5e607 100755 --- a/.github/scripts/monitor_slurm_job.sh +++ b/.github/scripts/monitor_slurm_job.sh @@ -37,7 +37,7 @@ get_job_state() { local state # Try squeue first (fast, works for active jobs) - state=$(squeue -j "$jid" -h -o '%T' 2>/dev/null | head -n1 | tr -d ' ') + state=$(squeue -j "$jid" -h -o '%T' 2>/dev/null | head -n1 | tr -d ' ' || true) if [ -n "$state" ]; then echo "$state" return @@ -45,7 +45,7 @@ get_job_state() { # Fallback to sacct (works for completed/historical jobs) if command -v sacct >/dev/null 2>&1; then - state=$(sacct -j "$jid" -n -X -P -o State 2>/dev/null | head -n1 | cut -d'|' -f1) + state=$(sacct -j "$jid" -n -X -P -o State 2>/dev/null | head -n1 | cut -d'|' -f1 || true) if [ -n "$state" ]; then echo "$state" return @@ -164,7 +164,7 @@ exec 3<&- kill "${tail_pid}" 2>/dev/null || true tail_pid="" -# Wait for output file to finish growing (stabilize) before stopping tail +# Wait for output file to stabilize (NFS flush) before final read if [ -f "$output_file" ]; then last_size=-1 same_count=0 From 88d19ce4b562f14c9abf832c6cae19e4fc0851ea Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Fri, 13 Feb 2026 09:29:42 -0500 Subject: [PATCH 09/11] Retry delete_directory on Lustre ENOTEMPTY race shutil.rmtree can fail with "Directory not empty" on networked filesystems (Lustre) due to metadata propagation delays. Retry up to 5 times with 1s backoff before raising. Co-Authored-By: Claude Opus 4.6 --- toolchain/mfc/common.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/toolchain/mfc/common.py b/toolchain/mfc/common.py index ce02e8251c..e56c6a9eb4 100644 --- a/toolchain/mfc/common.py +++ b/toolchain/mfc/common.py @@ -1,4 +1,4 @@ -import os, yaml, typing, shutil, subprocess, logging +import os, yaml, typing, shutil, subprocess, logging, time from os.path import join, abspath, normpath, dirname, realpath @@ -122,8 +122,16 @@ def create_directory(dirpath: str) -> None: def delete_directory(dirpath: str) -> None: - if os.path.isdir(dirpath): - shutil.rmtree(dirpath) + for attempt in range(5): + if not os.path.isdir(dirpath): + return + try: + shutil.rmtree(dirpath) + return + except OSError: + if attempt == 4: + raise + time.sleep(1) def get_program_output(arguments: typing.List[str] = None, cwd=None): From 05d28f37bb01c099d40b365ab2857c169e6954ab Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Fri, 13 Feb 2026 09:44:16 -0500 Subject: [PATCH 10/11] Remove stale failed_uuids.txt before test run On self-hosted runners the workspace persists between runs, so a leftover file could trigger spurious retries. Co-Authored-By: Claude Opus 4.6 --- .github/workflows/test.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index eec9d19fd0..21e52d5a5e 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -135,6 +135,7 @@ jobs: - name: Test run: | + rm -f tests/failed_uuids.txt TEST_EXIT=0 /bin/bash mfc.sh test -v --max-attempts 3 -j $(nproc) $TEST_ALL $TEST_PCT || TEST_EXIT=$? 
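For reference, patches 04, 05, and this one combine into the following Test step body (consolidated sketch of the three diffs; TEST_ALL and TEST_PCT come from the step's env block):

    rm -f tests/failed_uuids.txt   # patch 10: clear stale state on persistent runners
    TEST_EXIT=0
    /bin/bash mfc.sh test -v --max-attempts 3 -j $(nproc) $TEST_ALL $TEST_PCT || TEST_EXIT=$?

    # Retry only if a small number of tests failed (sporadic failures)
    if [ -f tests/failed_uuids.txt ]; then
        NUM_FAILED=$(wc -l < tests/failed_uuids.txt)
        if [ "$NUM_FAILED" -le 5 ]; then
            FAILED=$(cat tests/failed_uuids.txt | tr '\n' ' ')
            echo "=== Retrying $NUM_FAILED failed test(s): $FAILED ==="
            /bin/bash mfc.sh test -v --max-attempts 3 -j $(nproc) --only $FAILED $TEST_ALL
        else
            echo "Too many failures ($NUM_FAILED) to retry - likely a real issue."
            exit 1
        fi
    elif [ "$TEST_EXIT" -ne 0 ]; then
        # Tests crashed before failed_uuids.txt was written (patch 05)
        exit $TEST_EXIT
    fi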
From 9eed0c65d2d114cc7aa8134fc3d3cf809b18c0b5 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Fri, 13 Feb 2026 10:08:33 -0500 Subject: [PATCH 11/11] Split benchmark concurrency group by event type Bot review events (pull_request_review) were racing against and cancelling legitimate push-triggered (pull_request) benchmark runs via the shared concurrency group. Co-Authored-By: Claude Opus 4.6 --- .github/workflows/bench.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml index fd240b7a11..53efac21ed 100644 --- a/.github/workflows/bench.yml +++ b/.github/workflows/bench.yml @@ -7,7 +7,7 @@ on: workflow_dispatch: concurrency: - group: ${{ github.workflow }}-${{ github.ref }} + group: ${{ github.workflow }}-${{ github.ref }}-${{ github.event_name }} cancel-in-progress: true jobs:
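Illustration (hypothetical PR number; assumes both event types resolve github.ref to the PR merge ref): with the event name appended, a push and a subsequent review on the same PR now land in distinct groups,

    Benchmark-refs/pull/1234/merge-pull_request
    Benchmark-refs/pull/1234/merge-pull_request_review

so an approval review no longer cancels an in-flight push-triggered benchmark run.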