From e779ed41323e5a399e77f29ad786144014f33bf4 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Mon, 9 Feb 2026 20:54:32 -0500 Subject: [PATCH 01/11] Switch Phoenix GPU jobs to H200 nodes for faster scheduling Co-Authored-By: Claude Opus 4.6 --- .github/workflows/phoenix/submit-bench.sh | 5 ++--- .github/workflows/phoenix/submit.sh | 5 ++--- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/.github/workflows/phoenix/submit-bench.sh b/.github/workflows/phoenix/submit-bench.sh index 7ae85e66fe..fc28b3046b 100644 --- a/.github/workflows/phoenix/submit-bench.sh +++ b/.github/workflows/phoenix/submit-bench.sh @@ -20,9 +20,8 @@ sbatch_cpu_opts="\ " sbatch_gpu_opts="\ -#SBATCH -CL40S -#SBATCH --ntasks-per-node=4 # Number of cores per node required -#SBATCH -G2\ +#SBATCH --gres=gpu:H200:2 +#SBATCH --ntasks-per-node=8 # Number of cores per node required\ " if [ "$2" = "cpu" ]; then diff --git a/.github/workflows/phoenix/submit.sh b/.github/workflows/phoenix/submit.sh index 06a03e465a..5747c839f0 100755 --- a/.github/workflows/phoenix/submit.sh +++ b/.github/workflows/phoenix/submit.sh @@ -23,9 +23,8 @@ sbatch_cpu_opts="\ " sbatch_gpu_opts="\ -#SBATCH -p gpu-v100,gpu-a100,gpu-h100,gpu-l40s -#SBATCH --ntasks-per-node=4 # Number of cores per node required -#SBATCH -G2\ +#SBATCH --gres=gpu:H200:2 +#SBATCH --ntasks-per-node=8 # Number of cores per node required\ " if [ "$2" = "cpu" ]; then From 9cf00d3ee1f479a8902a90012fde4488602308b8 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Wed, 11 Feb 2026 21:34:12 -0500 Subject: [PATCH 02/11] Fix bash segfault in monitor_slurm_job.sh from fractional read timeout read -t 0.1 (sub-second timeout) in a loop with process substitution file descriptors triggers a bash internal error (unwind_frame_run: read_builtin: frame not found) leading to a segfault. Use integer timeout (read -t 1) instead. Co-Authored-By: Claude Opus 4.6 --- .github/scripts/monitor_slurm_job.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/scripts/monitor_slurm_job.sh b/.github/scripts/monitor_slurm_job.sh index 27472e01ef..232a894f8a 100755 --- a/.github/scripts/monitor_slurm_job.sh +++ b/.github/scripts/monitor_slurm_job.sh @@ -64,7 +64,7 @@ while true; do # Try to read from tail output (non-blocking via timeout) # Read multiple lines if available to avoid falling behind lines_read=0 - while IFS= read -r -t 0.1 line <&3 2>/dev/null; do + while IFS= read -r -t 1 line <&3 2>/dev/null; do echo "$line" lines_read=$((lines_read + 1)) last_heartbeat=$(date +%s) @@ -115,7 +115,7 @@ done # Drain any remaining output from tail after job completes echo "Draining remaining output..." drain_count=0 -while IFS= read -r -t 0.5 line <&3 2>/dev/null; do +while IFS= read -r -t 1 line <&3 2>/dev/null; do echo "$line" drain_count=$((drain_count + 1)) # Safety limit to avoid infinite loop From a59db02c83acbab62874c75c978b8ddb033d7323 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Thu, 12 Feb 2026 13:56:50 -0500 Subject: [PATCH 03/11] Restore pull_request_review trigger for benchmark workflow PR #1124 changed bench.yml to use workflow_run (triggered after Test Suite completes), which broke the approve-to-run flow for fork PRs. Revert to the original pull_request + pull_request_review triggers while keeping improvements (frontier_amd matrix, concurrency group, timeout, run_parallel_benchmarks.sh). 
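For readability, the dense one-line `if:` gate in the diff below expands to the following (same logic, reformatted; not an additional change):

    if: >
      github.repository == 'MFlowCode/MFC' &&
      needs.file-changes.outputs.checkall == 'true' &&
      (
        (github.event_name == 'pull_request_review' &&
         github.event.review.state == 'approved') ||
        (github.event_name == 'pull_request' &&
         (github.event.pull_request.user.login == 'sbryngelson' ||
          github.event.pull_request.user.login == 'wilfonba')) ||
        github.event_name == 'workflow_dispatch'
      )

In words: fork PRs run benchmarks once a maintainer approves, the two trusted authors run on every push, and manual dispatch always runs.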
Co-Authored-By: Claude Opus 4.6 --- .github/workflows/bench.yml | 65 ++++--------------------------------- 1 file changed, 7 insertions(+), 58 deletions(-) diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml index 6279f5f578..fd240b7a11 100644 --- a/.github/workflows/bench.yml +++ b/.github/workflows/bench.yml @@ -1,32 +1,24 @@ name: 'Benchmark' on: - # Trigger when Test Suite completes (no polling needed) - workflow_run: - workflows: ["Test Suite"] - types: [completed] + pull_request: + pull_request_review: + types: [submitted] workflow_dispatch: concurrency: - group: ${{ github.workflow }}-${{ github.event.workflow_run.head_branch || github.ref }} + group: ${{ github.workflow }}-${{ github.ref }} cancel-in-progress: true jobs: file-changes: name: Detect File Changes - # Only run if Test Suite passed (or manual dispatch) - if: github.event_name == 'workflow_dispatch' || github.event.workflow_run.conclusion == 'success' runs-on: 'ubuntu-latest' outputs: checkall: ${{ steps.changes.outputs.checkall }} - pr_number: ${{ steps.pr-info.outputs.pr_number }} - pr_approved: ${{ steps.pr-info.outputs.approved }} - pr_author: ${{ steps.pr-info.outputs.author }} steps: - name: Clone uses: actions/checkout@v4 - with: - ref: ${{ github.event.workflow_run.head_sha || github.sha }} - name: Detect Changes uses: dorny/paths-filter@v3 @@ -34,52 +26,10 @@ jobs: with: filters: ".github/file-filter.yml" - - name: Get PR Info - id: pr-info - env: - GH_TOKEN: ${{ github.token }} - run: | - if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then - echo "pr_number=" >> $GITHUB_OUTPUT - echo "approved=true" >> $GITHUB_OUTPUT - echo "author=${{ github.actor }}" >> $GITHUB_OUTPUT - else - # Get PR number from workflow_run - PR_NUMBER="${{ github.event.workflow_run.pull_requests[0].number }}" - if [ -n "$PR_NUMBER" ]; then - echo "pr_number=$PR_NUMBER" >> $GITHUB_OUTPUT - - # Fetch actual PR author from API (workflow_run.actor is the re-runner, not PR author) - PR_AUTHOR=$(gh api repos/${{ github.repository }}/pulls/$PR_NUMBER --jq '.user.login') - echo "author=$PR_AUTHOR" >> $GITHUB_OUTPUT - - # Check if PR is approved - APPROVED=$(gh api repos/${{ github.repository }}/pulls/$PR_NUMBER/reviews \ - --jq '[.[] | select(.state == "APPROVED")] | length') - if [ "$APPROVED" -gt 0 ]; then - echo "approved=true" >> $GITHUB_OUTPUT - else - echo "approved=false" >> $GITHUB_OUTPUT - fi - else - echo "pr_number=" >> $GITHUB_OUTPUT - echo "approved=false" >> $GITHUB_OUTPUT - echo "author=" >> $GITHUB_OUTPUT - fi - fi - self: name: "${{ matrix.name }} (${{ matrix.device }}${{ matrix.interface != 'none' && format('-{0}', matrix.interface) || '' }})" - if: > - github.repository == 'MFlowCode/MFC' && - needs.file-changes.outputs.checkall == 'true' && - ( - github.event_name == 'workflow_dispatch' || - needs.file-changes.outputs.pr_approved == 'true' || - needs.file-changes.outputs.pr_author == 'sbryngelson' || - needs.file-changes.outputs.pr_author == 'wilfonba' - ) - needs: [file-changes] + if: ${{ github.repository=='MFlowCode/MFC' && needs.file-changes.outputs.checkall=='true' && ((github.event_name=='pull_request_review' && github.event.review.state=='approved') || (github.event_name=='pull_request' && (github.event.pull_request.user.login=='sbryngelson' || github.event.pull_request.user.login=='wilfonba')) || github.event_name=='workflow_dispatch') }} + needs: file-changes strategy: fail-fast: false matrix: @@ -143,7 +93,6 @@ jobs: - name: Clone - PR uses: actions/checkout@v4 with: - ref: ${{ 
github.event.workflow_run.head_sha || github.sha }} path: pr - name: Clone - Master @@ -155,7 +104,7 @@ jobs: - name: Setup & Build if: matrix.build_script != '' - run: | + run: | (cd pr && ${{ matrix.build_script }}) & (cd master && ${{ matrix.build_script }}) & wait %1 && wait %2 From 2efc61e1eb98a8ea1287275c407cff5539a153f1 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Thu, 12 Feb 2026 17:28:56 -0500 Subject: [PATCH 04/11] Auto-retry sporadic test failures in CI Write failed test UUIDs to tests/failed_uuids.txt after a test run. In CI, if 1-5 tests fail, automatically re-run just those tests. If 6+ fail, treat it as a real issue and fail immediately. Co-Authored-By: Claude Opus 4.6 --- .github/workflows/test.yml | 19 +++++++++++++++++-- toolchain/mfc/test/test.py | 9 +++++++++ 2 files changed, 26 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 0be51076ec..3a5a0e33d7 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -134,8 +134,23 @@ jobs: TEST_ALL: ${{ matrix.mpi == 'mpi' && '--test-all' || '' }} - name: Test - run: | - /bin/bash mfc.sh test -v --max-attempts 3 -j $(nproc) $TEST_ALL $TEST_PCT + run: | + /bin/bash mfc.sh test -v --max-attempts 3 -j $(nproc) $TEST_ALL $TEST_PCT || true + + # Retry only if a small number of tests failed (sporadic failures) + if [ -f tests/failed_uuids.txt ]; then + NUM_FAILED=$(wc -l < tests/failed_uuids.txt) + if [ "$NUM_FAILED" -le 5 ]; then + FAILED=$(cat tests/failed_uuids.txt | tr '\n' ' ') + echo "" + echo "=== Retrying $NUM_FAILED failed test(s): $FAILED ===" + echo "" + /bin/bash mfc.sh test -v --max-attempts 3 -j $(nproc) --only $FAILED $TEST_ALL + else + echo "Too many failures ($NUM_FAILED) to retry — likely a real issue." + exit 1 + fi + fi env: TEST_ALL: ${{ matrix.mpi == 'mpi' && '--test-all' || '' }} TEST_PCT: ${{ matrix.debug == 'debug' && '-% 20' || '' }} diff --git a/toolchain/mfc/test/test.py b/toolchain/mfc/test/test.py index 31a3771cb9..d6dce92436 100644 --- a/toolchain/mfc/test/test.py +++ b/toolchain/mfc/test/test.py @@ -206,6 +206,15 @@ def test(): # Build the summary report _print_test_summary(nPASS, nFAIL, nSKIP, minutes, seconds, failed_tests, skipped_cases) + # Write failed UUIDs to file for CI retry logic + failed_uuids_path = os.path.join(common.MFC_TEST_DIR, "failed_uuids.txt") + if failed_tests: + with open(failed_uuids_path, "w") as f: + for test_info in failed_tests: + f.write(test_info['uuid'] + "\n") + elif os.path.exists(failed_uuids_path): + os.remove(failed_uuids_path) + exit(nFAIL) From 0658bd348512de9f051b8ca4c7adbbb9a19f576b Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Thu, 12 Feb 2026 17:40:49 -0500 Subject: [PATCH 05/11] Preserve exit code for catastrophic test failures Don't mask non-zero exit codes when tests crash before writing failed_uuids.txt. Only suppress the exit code when the file exists (meaning the test framework ran to completion and we can retry). Co-Authored-By: Claude Opus 4.6 --- .github/workflows/test.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 3a5a0e33d7..eec9d19fd0 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -135,7 +135,8 @@ jobs: - name: Test run: | - /bin/bash mfc.sh test -v --max-attempts 3 -j $(nproc) $TEST_ALL $TEST_PCT || true + TEST_EXIT=0 + /bin/bash mfc.sh test -v --max-attempts 3 -j $(nproc) $TEST_ALL $TEST_PCT || TEST_EXIT=$? 
# Retry only if a small number of tests failed (sporadic failures) if [ -f tests/failed_uuids.txt ]; then @@ -150,6 +151,8 @@ jobs: echo "Too many failures ($NUM_FAILED) to retry — likely a real issue." exit 1 fi + elif [ "$TEST_EXIT" -ne 0 ]; then + exit $TEST_EXIT fi env: TEST_ALL: ${{ matrix.mpi == 'mpi' && '--test-all' || '' }} From c6b6f8134409a0f99a375327752e4a5eee0c834d Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Thu, 12 Feb 2026 21:48:39 -0500 Subject: [PATCH 06/11] Harden SLURM monitor: robust state checks, orphan cleanup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace squeue exit-code polling with get_job_state() that parses the actual state string (squeue + sacct fallback). Never give up on UNKNOWN state — CI timeout is the backstop. Cancel orphaned SLURM jobs on abnormal monitor exit. Include job state in heartbeats. Incorporates changes from PR #1140. Co-Authored-By: Claude Opus 4.6 --- .github/scripts/monitor_slurm_job.sh | 138 +++++++++++++++++---------- 1 file changed, 85 insertions(+), 53 deletions(-) diff --git a/.github/scripts/monitor_slurm_job.sh b/.github/scripts/monitor_slurm_job.sh index 232a894f8a..408d205aab 100755 --- a/.github/scripts/monitor_slurm_job.sh +++ b/.github/scripts/monitor_slurm_job.sh @@ -4,11 +4,17 @@ set -euo pipefail -# Cleanup handler to prevent orphaned tail processes +# Cleanup handler to prevent orphaned tail processes and cancel orphaned jobs cleanup() { if [ -n "${tail_pid:-}" ]; then kill "${tail_pid}" 2>/dev/null || true fi + # Cancel the SLURM job if the monitor is exiting due to an error + # (e.g., the CI runner is being killed). Don't cancel on success. + if [ "${monitor_success:-0}" -ne 1 ] && [ -n "${job_id:-}" ]; then + echo "Monitor exiting abnormally — cancelling SLURM job $job_id" + scancel "$job_id" 2>/dev/null || true + fi } trap cleanup EXIT @@ -23,30 +29,78 @@ output_file="$2" echo "Submitted batch job $job_id" echo "Monitoring output file: $output_file" -# Wait for file to appear with retry logic for transient squeue failures +# Robustly check SLURM job state using squeue with sacct fallback. +# Returns the state string (PENDING, RUNNING, COMPLETED, FAILED, etc.) +# or "UNKNOWN" if both commands fail. +get_job_state() { + local jid="$1" + local state + + # Try squeue first (fast, works for active jobs) + state=$(squeue -j "$jid" -h -o '%T' 2>/dev/null | head -n1 | tr -d ' ') + if [ -n "$state" ]; then + echo "$state" + return + fi + + # Fallback to sacct (works for completed/historical jobs) + if command -v sacct >/dev/null 2>&1; then + state=$(sacct -j "$jid" --format=State --noheader 2>/dev/null | head -n1 | awk '{print $1}') + if [ -n "$state" ]; then + echo "$state" + return + fi + fi + + echo "UNKNOWN" +} + +# Check if a state is terminal (job is done, for better or worse) +is_terminal_state() { + case "$1" in + COMPLETED|FAILED|CANCELLED|CANCELLED+|TIMEOUT|OUT_OF_MEMORY|NODE_FAIL|PREEMPTED|BOOT_FAIL|DEADLINE) + return 0 ;; + *) + return 1 ;; + esac +} + +# Wait for file to appear, using robust state checking. +# Never give up due to transient squeue/sacct failures — the CI job timeout +# is the ultimate backstop. echo "Waiting for job to start..." -squeue_retries=0 -max_squeue_retries=5 +unknown_count=0 while [ ! 
-f "$output_file" ]; do - # Check if job is still queued/running - if squeue -j "$job_id" &>/dev/null; then - squeue_retries=0 # Reset on success - sleep 5 - else - squeue_retries=$((squeue_retries + 1)) - if [ $squeue_retries -ge $max_squeue_retries ]; then - # Job not in queue and output file doesn't exist - if [ ! -f "$output_file" ]; then - echo "ERROR: Job $job_id not in queue and output file not created" + state=$(get_job_state "$job_id") + + case "$state" in + PENDING|CONFIGURING) + unknown_count=0 + sleep 5 + ;; + RUNNING|COMPLETING) + unknown_count=0 + # Job is running but output file not yet visible (NFS delay) + sleep 2 + ;; + UNKNOWN) + unknown_count=$((unknown_count + 1)) + # Only print warning periodically to avoid log spam + if [ $((unknown_count % 12)) -eq 1 ]; then + echo "Warning: Could not query job $job_id state (SLURM may be temporarily unavailable)..." + fi + sleep 5 + ;; + *) + # Terminal state — job finished without creating output + if is_terminal_state "$state"; then + echo "ERROR: Job $job_id reached terminal state ($state) without creating output file" exit 1 fi - break - fi - # Exponential backoff - sleep_time=$((2 ** squeue_retries)) - echo "Warning: squeue check failed, retrying in ${sleep_time}s..." - sleep $sleep_time - fi + # Unrecognized state, keep waiting + sleep 5 + ;; + esac done echo "=== Streaming output for job $job_id ===" @@ -57,7 +111,6 @@ exec 3< <(stdbuf -oL -eL tail -f "$output_file" 2>&1) tail_pid=$! # Monitor job status and stream output simultaneously -squeue_failures=0 last_heartbeat=$(date +%s) while true; do @@ -73,41 +126,22 @@ while true; do break fi done - + # Check job status current_time=$(date +%s) - if ! squeue -j "$job_id" &>/dev/null; then - squeue_failures=$((squeue_failures + 1)) - # Check if job actually completed using sacct (if available) - if [ $squeue_failures -ge 3 ]; then - if command -v sacct >/dev/null 2>&1; then - state=$(sacct -j "$job_id" --format=State --noheader 2>/dev/null | head -n1 | awk '{print $1}') - # Consider job done only if it reached a terminal state - case "$state" in - COMPLETED|FAILED|CANCELLED|TIMEOUT|OUT_OF_MEMORY) - echo "[$(date +%H:%M:%S)] Job $job_id reached terminal state: $state" - break - ;; - *) - # treat as transient failure, reset failures and continue polling - squeue_failures=0 - ;; - esac - else - # No sacct: assume job completed after 3 failures - echo "[$(date +%H:%M:%S)] Job $job_id no longer in queue" - break - fi - fi + state=$(get_job_state "$job_id") + + if is_terminal_state "$state"; then + echo "[$(date +%H:%M:%S)] Job $job_id reached terminal state: $state" + break else - squeue_failures=0 # Print heartbeat if no output for 60 seconds if [ $((current_time - last_heartbeat)) -ge 60 ]; then - echo "[$(date +%H:%M:%S)] Job $job_id still running (no new output for 60s)..." + echo "[$(date +%H:%M:%S)] Job $job_id state=$state (no new output for 60s)..." 
last_heartbeat=$current_time fi fi - + # Sleep briefly between status checks sleep 1 done @@ -128,6 +162,7 @@ done # Close the file descriptor and kill tail exec 3<&- kill "${tail_pid}" 2>/dev/null || true +tail_pid="" # Wait for output file to finish growing (stabilize) before stopping tail if [ -f "$output_file" ]; then @@ -149,9 +184,6 @@ if [ -f "$output_file" ]; then done fi -# Stop tailing (trap will also handle this on exit) -kill "${tail_pid}" 2>/dev/null || true - echo "" echo "=== Final output ===" cat "$output_file" @@ -187,6 +219,6 @@ if [ "$exit_code" != "0:0" ]; then exit 1 fi +monitor_success=1 echo "Job $job_id completed successfully" exit 0 - From a82959e1e793bfba5983cad3f0c4c84c85da795c Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Thu, 12 Feb 2026 22:26:38 -0500 Subject: [PATCH 07/11] Use parsable sacct flags for robust state parsing Use -n -X -P flags with sacct: -X restricts to job allocation (not steps), -P gives pipe-delimited output for reliable parsing. Co-Authored-By: Claude Opus 4.6 --- .github/scripts/monitor_slurm_job.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/scripts/monitor_slurm_job.sh b/.github/scripts/monitor_slurm_job.sh index 408d205aab..d9f2237032 100755 --- a/.github/scripts/monitor_slurm_job.sh +++ b/.github/scripts/monitor_slurm_job.sh @@ -45,7 +45,7 @@ get_job_state() { # Fallback to sacct (works for completed/historical jobs) if command -v sacct >/dev/null 2>&1; then - state=$(sacct -j "$jid" --format=State --noheader 2>/dev/null | head -n1 | awk '{print $1}') + state=$(sacct -j "$jid" -n -X -P -o State 2>/dev/null | head -n1 | cut -d'|' -f1) if [ -n "$state" ]; then echo "$state" return From 80229694a2283a5a5b9eac5ad5c5ef123934c669 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Thu, 12 Feb 2026 22:49:45 -0500 Subject: [PATCH 08/11] Guard squeue/sacct pipelines against set -euo pipefail With pipefail, a transient squeue failure would exit the script instead of falling through to return UNKNOWN. Add || true to both pipelines. Also fix stale comment about tail stopping. 
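A minimal, self-contained illustration of the hazard (not part of the patch; `false` stands in for a failing squeue/sacct invocation):

    #!/bin/bash
    set -euo pipefail
    # Unguarded: under pipefail the failing first stage makes the whole
    # pipeline non-zero, the assignment inherits that status, and errexit
    # aborts the script at this line, so "never reached" is never printed.
    state=$(false | head -n1)
    echo "never reached"

With the guard, the same query degrades gracefully:

    state=$(false | head -n1 || true)  # status forced to 0; $state is empty
    # An empty $state falls through to the sacct fallback and, if that also
    # fails, get_job_state returns UNKNOWN as intended.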
Co-Authored-By: Claude Opus 4.6 --- .github/scripts/monitor_slurm_job.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/scripts/monitor_slurm_job.sh b/.github/scripts/monitor_slurm_job.sh index d9f2237032..4981e5e607 100755 --- a/.github/scripts/monitor_slurm_job.sh +++ b/.github/scripts/monitor_slurm_job.sh @@ -37,7 +37,7 @@ get_job_state() { local state # Try squeue first (fast, works for active jobs) - state=$(squeue -j "$jid" -h -o '%T' 2>/dev/null | head -n1 | tr -d ' ') + state=$(squeue -j "$jid" -h -o '%T' 2>/dev/null | head -n1 | tr -d ' ' || true) if [ -n "$state" ]; then echo "$state" return @@ -45,7 +45,7 @@ get_job_state() { # Fallback to sacct (works for completed/historical jobs) if command -v sacct >/dev/null 2>&1; then - state=$(sacct -j "$jid" -n -X -P -o State 2>/dev/null | head -n1 | cut -d'|' -f1) + state=$(sacct -j "$jid" -n -X -P -o State 2>/dev/null | head -n1 | cut -d'|' -f1 || true) if [ -n "$state" ]; then echo "$state" return @@ -164,7 +164,7 @@ exec 3<&- kill "${tail_pid}" 2>/dev/null || true tail_pid="" -# Wait for output file to finish growing (stabilize) before stopping tail +# Wait for output file to stabilize (NFS flush) before final read if [ -f "$output_file" ]; then last_size=-1 same_count=0 From 88d19ce4b562f14c9abf832c6cae19e4fc0851ea Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Fri, 13 Feb 2026 09:29:42 -0500 Subject: [PATCH 09/11] Retry delete_directory on Lustre ENOTEMPTY race shutil.rmtree can fail with "Directory not empty" on networked filesystems (Lustre) due to metadata propagation delays. Retry up to 5 times with 1s backoff before raising. Co-Authored-By: Claude Opus 4.6 --- toolchain/mfc/common.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/toolchain/mfc/common.py b/toolchain/mfc/common.py index ce02e8251c..e56c6a9eb4 100644 --- a/toolchain/mfc/common.py +++ b/toolchain/mfc/common.py @@ -1,4 +1,4 @@ -import os, yaml, typing, shutil, subprocess, logging +import os, yaml, typing, shutil, subprocess, logging, time from os.path import join, abspath, normpath, dirname, realpath @@ -122,8 +122,16 @@ def create_directory(dirpath: str) -> None: def delete_directory(dirpath: str) -> None: - if os.path.isdir(dirpath): - shutil.rmtree(dirpath) + for attempt in range(5): + if not os.path.isdir(dirpath): + return + try: + shutil.rmtree(dirpath) + return + except OSError: + if attempt == 4: + raise + time.sleep(1) def get_program_output(arguments: typing.List[str] = None, cwd=None): From 05d28f37bb01c099d40b365ab2857c169e6954ab Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Fri, 13 Feb 2026 09:44:16 -0500 Subject: [PATCH 10/11] Remove stale failed_uuids.txt before test run On self-hosted runners the workspace persists between runs, so a leftover file could trigger spurious retries. Co-Authored-By: Claude Opus 4.6 --- .github/workflows/test.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index eec9d19fd0..21e52d5a5e 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -135,6 +135,7 @@ jobs: - name: Test run: | + rm -f tests/failed_uuids.txt TEST_EXIT=0 /bin/bash mfc.sh test -v --max-attempts 3 -j $(nproc) $TEST_ALL $TEST_PCT || TEST_EXIT=$? 
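For reference, patches 04, 05, and this one combine into the following Test step body (consolidated sketch of the three diffs; TEST_ALL and TEST_PCT come from the step's env block):

    rm -f tests/failed_uuids.txt   # patch 10: clear stale state on persistent runners
    TEST_EXIT=0
    /bin/bash mfc.sh test -v --max-attempts 3 -j $(nproc) $TEST_ALL $TEST_PCT || TEST_EXIT=$?

    # Retry only if a small number of tests failed (sporadic failures)
    if [ -f tests/failed_uuids.txt ]; then
        NUM_FAILED=$(wc -l < tests/failed_uuids.txt)
        if [ "$NUM_FAILED" -le 5 ]; then
            FAILED=$(cat tests/failed_uuids.txt | tr '\n' ' ')
            echo "=== Retrying $NUM_FAILED failed test(s): $FAILED ==="
            /bin/bash mfc.sh test -v --max-attempts 3 -j $(nproc) --only $FAILED $TEST_ALL
        else
            echo "Too many failures ($NUM_FAILED) to retry - likely a real issue."
            exit 1
        fi
    elif [ "$TEST_EXIT" -ne 0 ]; then
        # Tests crashed before failed_uuids.txt was written (patch 05)
        exit $TEST_EXIT
    fi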
From 9eed0c65d2d114cc7aa8134fc3d3cf809b18c0b5 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Fri, 13 Feb 2026 10:08:33 -0500 Subject: [PATCH 11/11] Split benchmark concurrency group by event type Bot review events (pull_request_review) were racing against and cancelling legitimate push-triggered (pull_request) benchmark runs via the shared concurrency group. Co-Authored-By: Claude Opus 4.6 --- .github/workflows/bench.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml index fd240b7a11..53efac21ed 100644 --- a/.github/workflows/bench.yml +++ b/.github/workflows/bench.yml @@ -7,7 +7,7 @@ on: workflow_dispatch: concurrency: - group: ${{ github.workflow }}-${{ github.ref }} + group: ${{ github.workflow }}-${{ github.ref }}-${{ github.event_name }} cancel-in-progress: true jobs:
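Illustration (hypothetical PR number; assumes both event types resolve github.ref to the PR merge ref): with the event name appended, a push and a subsequent review on the same PR now land in distinct groups,

    Benchmark-refs/pull/1234/merge-pull_request
    Benchmark-refs/pull/1234/merge-pull_request_review

so an approval review no longer cancels an in-flight push-triggered benchmark run.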