144 changes: 88 additions & 56 deletions .github/scripts/monitor_slurm_job.sh
@@ -4,11 +4,17 @@

set -euo pipefail

# Cleanup handler to prevent orphaned tail processes
# Cleanup handler to prevent orphaned tail processes and cancel orphaned jobs
cleanup() {
if [ -n "${tail_pid:-}" ]; then
kill "${tail_pid}" 2>/dev/null || true
fi
# Cancel the SLURM job if the monitor is exiting due to an error
# (e.g., the CI runner is being killed). Don't cancel on success.
if [ "${monitor_success:-0}" -ne 1 ] && [ -n "${job_id:-}" ]; then
echo "Monitor exiting abnormally — cancelling SLURM job $job_id"
scancel "$job_id" 2>/dev/null || true
fi
}
trap cleanup EXIT
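# Illustrative sketch (not part of the script) of the flag-guarded EXIT trap
# pattern used above: any exit path that does not first set monitor_success=1
# lands in the scancel branch of cleanup().
#
#   monitor_success=0
#   trap cleanup EXIT
#   ...                  # an early exit here (e.g., a command failing under
#                        # set -e) leaves monitor_success=0, so cleanup()
#                        # cancels the SLURM job
#   monitor_success=1    # set only immediately before the successful exit 0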

@@ -23,30 +29,78 @@ output_file="$2"
echo "Submitted batch job $job_id"
echo "Monitoring output file: $output_file"

# Wait for file to appear with retry logic for transient squeue failures
# Robustly check SLURM job state using squeue with sacct fallback.
# Returns the state string (PENDING, RUNNING, COMPLETED, FAILED, etc.)
# or "UNKNOWN" if both commands fail.
get_job_state() {
local jid="$1"
local state

# Try squeue first (fast, works for active jobs)
state=$(squeue -j "$jid" -h -o '%T' 2>/dev/null | head -n1 | tr -d ' ' || true)
if [ -n "$state" ]; then
echo "$state"
return
fi

# Fallback to sacct (works for completed/historical jobs)
if command -v sacct >/dev/null 2>&1; then
state=$(sacct -j "$jid" -n -X -P -o State 2>/dev/null | head -n1 | cut -d'|' -f1 || true)
if [ -n "$state" ]; then
echo "$state"
return
fi
fi

echo "UNKNOWN"
}

# Check if a state is terminal (job is done, for better or worse)
is_terminal_state() {
case "$1" in
COMPLETED|FAILED|CANCELLED|CANCELLED+|TIMEOUT|OUT_OF_MEMORY|NODE_FAIL|PREEMPTED|BOOT_FAIL|DEADLINE)
return 0 ;;
*)
return 1 ;;
esac
}
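# Usage sketch (hypothetical job ID) showing how the two helpers compose:
#
#   state=$(get_job_state 1234567)
#   if is_terminal_state "$state"; then
#       echo "job finished in state $state"
#   fi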

# Wait for file to appear, using robust state checking.
# Never give up due to transient squeue/sacct failures — the CI job timeout
# is the ultimate backstop.
echo "Waiting for job to start..."
squeue_retries=0
max_squeue_retries=5
unknown_count=0
while [ ! -f "$output_file" ]; do
# Check if job is still queued/running
if squeue -j "$job_id" &>/dev/null; then
squeue_retries=0 # Reset on success
sleep 5
else
squeue_retries=$((squeue_retries + 1))
if [ $squeue_retries -ge $max_squeue_retries ]; then
# Job not in queue and output file doesn't exist
if [ ! -f "$output_file" ]; then
echo "ERROR: Job $job_id not in queue and output file not created"
state=$(get_job_state "$job_id")

case "$state" in
PENDING|CONFIGURING)
unknown_count=0
sleep 5
;;
RUNNING|COMPLETING)
unknown_count=0
# Job is running but output file not yet visible (NFS delay)
sleep 2
;;
UNKNOWN)
unknown_count=$((unknown_count + 1))
# Only print warning periodically to avoid log spam
if [ $((unknown_count % 12)) -eq 1 ]; then
echo "Warning: Could not query job $job_id state (SLURM may be temporarily unavailable)..."
fi
sleep 5
;;
*)
# Terminal state — job finished without creating output
if is_terminal_state "$state"; then
echo "ERROR: Job $job_id reached terminal state ($state) without creating output file"
exit 1
fi
break
fi
# Exponential backoff
sleep_time=$((2 ** squeue_retries))
echo "Warning: squeue check failed, retrying in ${sleep_time}s..."
sleep $sleep_time
fi
# Unrecognized state, keep waiting
sleep 5
;;
esac
done
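# In summary, the loop above polls PENDING/CONFIGURING and UNKNOWN every 5 s
# (UNKNOWN warns roughly once a minute), polls RUNNING/COMPLETING every 2 s
# while waiting for the output file to appear on the shared filesystem, treats
# a terminal state reached before the file exists as a hard error, and keeps
# polling every 5 s for any other state.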

echo "=== Streaming output for job $job_id ==="
@@ -57,14 +111,13 @@ exec 3< <(stdbuf -oL -eL tail -f "$output_file" 2>&1)
tail_pid=$!

# Monitor job status and stream output simultaneously
squeue_failures=0
last_heartbeat=$(date +%s)

while true; do
# Try to read from tail output (non-blocking via timeout)
# Read multiple lines if available to avoid falling behind
lines_read=0
while IFS= read -r -t 0.1 line <&3 2>/dev/null; do
while IFS= read -r -t 1 line <&3 2>/dev/null; do
echo "$line"
lines_read=$((lines_read + 1))
last_heartbeat=$(date +%s)
@@ -73,49 +126,30 @@ while true; do
break
fi
done
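# Minimal sketch (illustrative) of the timed-read pattern used above:
# read -t 1 returns a status greater than 128 when the one-second timeout
# expires with no complete line on fd 3, which ends the inner loop without
# blocking the status checks below.
#
#   exec 3< <(tail -f some.log)
#   while IFS= read -r -t 1 line <&3; do
#       printf '%s\n' "$line"
#   done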

# Check job status
current_time=$(date +%s)
if ! squeue -j "$job_id" &>/dev/null; then
squeue_failures=$((squeue_failures + 1))
# Check if job actually completed using sacct (if available)
if [ $squeue_failures -ge 3 ]; then
if command -v sacct >/dev/null 2>&1; then
state=$(sacct -j "$job_id" --format=State --noheader 2>/dev/null | head -n1 | awk '{print $1}')
# Consider job done only if it reached a terminal state
case "$state" in
COMPLETED|FAILED|CANCELLED|TIMEOUT|OUT_OF_MEMORY)
echo "[$(date +%H:%M:%S)] Job $job_id reached terminal state: $state"
break
;;
*)
# treat as transient failure, reset failures and continue polling
squeue_failures=0
;;
esac
else
# No sacct: assume job completed after 3 failures
echo "[$(date +%H:%M:%S)] Job $job_id no longer in queue"
break
fi
fi
state=$(get_job_state "$job_id")

if is_terminal_state "$state"; then
echo "[$(date +%H:%M:%S)] Job $job_id reached terminal state: $state"
break
else
squeue_failures=0
# Print heartbeat if no output for 60 seconds
if [ $((current_time - last_heartbeat)) -ge 60 ]; then
echo "[$(date +%H:%M:%S)] Job $job_id still running (no new output for 60s)..."
echo "[$(date +%H:%M:%S)] Job $job_id state=$state (no new output for 60s)..."
last_heartbeat=$current_time
fi
fi

# Sleep briefly between status checks
sleep 1
done

# Drain any remaining output from tail after job completes
echo "Draining remaining output..."
drain_count=0
while IFS= read -r -t 0.5 line <&3 2>/dev/null; do
while IFS= read -r -t 1 line <&3 2>/dev/null; do
echo "$line"
drain_count=$((drain_count + 1))
# Safety limit to avoid infinite loop
@@ -128,8 +162,9 @@ done
# Close the file descriptor and kill tail
exec 3<&-
kill "${tail_pid}" 2>/dev/null || true
tail_pid=""

# Wait for output file to finish growing (stabilize) before stopping tail
# Wait for output file to stabilize (NFS flush) before final read
if [ -f "$output_file" ]; then
last_size=-1
same_count=0
@@ -149,9 +184,6 @@ if [ -f "$output_file" ]; then
done
fi
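# The collapsed hunk above polls the file size until it stops changing. A
# minimal sketch of that kind of stabilization loop (illustrative only; the
# variable names match those initialised above, but this is not the actual
# hidden code):
#
#   while [ "$same_count" -lt 3 ]; do
#       size=$(stat -c%s "$output_file" 2>/dev/null || echo 0)
#       if [ "$size" -eq "$last_size" ]; then
#           same_count=$((same_count + 1))
#       else
#           same_count=0
#       fi
#       last_size=$size
#       sleep 1
#   done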

# Stop tailing (trap will also handle this on exit)
kill "${tail_pid}" 2>/dev/null || true

echo ""
echo "=== Final output ==="
cat "$output_file"
@@ -187,6 +219,6 @@ if [ "$exit_code" != "0:0" ]; then
exit 1
fi

monitor_success=1
echo "Job $job_id completed successfully"
exit 0
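# Invocation sketch (illustrative; the argument order is inferred from
# output_file="$2" above and the job ID is assumed to be "$1", which this diff
# does not show):
#
#   job_id=$(sbatch --parsable my_submit_script.sh)
#   bash .github/scripts/monitor_slurm_job.sh "$job_id" "slurm-${job_id}.out"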

65 changes: 7 additions & 58 deletions .github/workflows/bench.yml
@@ -1,85 +1,35 @@
name: 'Benchmark'

on:
# Trigger when Test Suite completes (no polling needed)
workflow_run:
workflows: ["Test Suite"]
types: [completed]
pull_request:
pull_request_review:
types: [submitted]
workflow_dispatch:

concurrency:
group: ${{ github.workflow }}-${{ github.event.workflow_run.head_branch || github.ref }}
group: ${{ github.workflow }}-${{ github.ref }}-${{ github.event_name }}
cancel-in-progress: true
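# For example (hypothetical PR number), the new key puts runs triggered by
# different events on the same ref into separate cancellation groups, so an
# approval-triggered run and a pull_request-triggered run do not cancel each
# other:
#
#   Benchmark-refs/pull/123/merge-pull_request
#   Benchmark-refs/pull/123/merge-pull_request_review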

jobs:
file-changes:
name: Detect File Changes
# Only run if Test Suite passed (or manual dispatch)
if: github.event_name == 'workflow_dispatch' || github.event.workflow_run.conclusion == 'success'
runs-on: 'ubuntu-latest'
outputs:
checkall: ${{ steps.changes.outputs.checkall }}
pr_number: ${{ steps.pr-info.outputs.pr_number }}
pr_approved: ${{ steps.pr-info.outputs.approved }}
pr_author: ${{ steps.pr-info.outputs.author }}
steps:
- name: Clone
uses: actions/checkout@v4
with:
ref: ${{ github.event.workflow_run.head_sha || github.sha }}

- name: Detect Changes
uses: dorny/paths-filter@v3
id: changes
with:
filters: ".github/file-filter.yml"

- name: Get PR Info
id: pr-info
env:
GH_TOKEN: ${{ github.token }}
run: |
if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
echo "pr_number=" >> $GITHUB_OUTPUT
echo "approved=true" >> $GITHUB_OUTPUT
echo "author=${{ github.actor }}" >> $GITHUB_OUTPUT
else
# Get PR number from workflow_run
PR_NUMBER="${{ github.event.workflow_run.pull_requests[0].number }}"
if [ -n "$PR_NUMBER" ]; then
echo "pr_number=$PR_NUMBER" >> $GITHUB_OUTPUT

# Fetch actual PR author from API (workflow_run.actor is the re-runner, not PR author)
PR_AUTHOR=$(gh api repos/${{ github.repository }}/pulls/$PR_NUMBER --jq '.user.login')
echo "author=$PR_AUTHOR" >> $GITHUB_OUTPUT

# Check if PR is approved
APPROVED=$(gh api repos/${{ github.repository }}/pulls/$PR_NUMBER/reviews \
--jq '[.[] | select(.state == "APPROVED")] | length')
if [ "$APPROVED" -gt 0 ]; then
echo "approved=true" >> $GITHUB_OUTPUT
else
echo "approved=false" >> $GITHUB_OUTPUT
fi
else
echo "pr_number=" >> $GITHUB_OUTPUT
echo "approved=false" >> $GITHUB_OUTPUT
echo "author=" >> $GITHUB_OUTPUT
fi
fi

self:
name: "${{ matrix.name }} (${{ matrix.device }}${{ matrix.interface != 'none' && format('-{0}', matrix.interface) || '' }})"
if: >
github.repository == 'MFlowCode/MFC' &&
needs.file-changes.outputs.checkall == 'true' &&
(
github.event_name == 'workflow_dispatch' ||
needs.file-changes.outputs.pr_approved == 'true' ||
needs.file-changes.outputs.pr_author == 'sbryngelson' ||
needs.file-changes.outputs.pr_author == 'wilfonba'
)
needs: [file-changes]
if: ${{ github.repository=='MFlowCode/MFC' && needs.file-changes.outputs.checkall=='true' && ((github.event_name=='pull_request_review' && github.event.review.state=='approved') || (github.event_name=='pull_request' && (github.event.pull_request.user.login=='sbryngelson' || github.event.pull_request.user.login=='wilfonba')) || github.event_name=='workflow_dispatch') }}
needs: file-changes
strategy:
fail-fast: false
matrix:
@@ -143,7 +93,6 @@ jobs:
- name: Clone - PR
uses: actions/checkout@v4
with:
ref: ${{ github.event.workflow_run.head_sha || github.sha }}
path: pr

- name: Clone - Master
@@ -155,7 +104,7 @@

- name: Setup & Build
if: matrix.build_script != ''
run: |
run: |
(cd pr && ${{ matrix.build_script }}) &
(cd master && ${{ matrix.build_script }}) &
wait %1 && wait %2
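# For readability, the single-line job gate added above is equivalent to this
# multi-line form (illustrative rewrite, not part of the diff):
#
#   if: >
#     github.repository == 'MFlowCode/MFC' &&
#     needs.file-changes.outputs.checkall == 'true' &&
#     (
#       (github.event_name == 'pull_request_review' && github.event.review.state == 'approved') ||
#       (github.event_name == 'pull_request' &&
#         (github.event.pull_request.user.login == 'sbryngelson' || github.event.pull_request.user.login == 'wilfonba')) ||
#       github.event_name == 'workflow_dispatch'
#     )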
5 changes: 2 additions & 3 deletions .github/workflows/phoenix/submit-bench.sh
@@ -20,9 +20,8 @@ sbatch_cpu_opts="\
"

sbatch_gpu_opts="\
#SBATCH -CL40S
#SBATCH --ntasks-per-node=4 # Number of cores per node required
#SBATCH -G2\
#SBATCH --gres=gpu:H200:2
#SBATCH --ntasks-per-node=8 # Number of cores per node required\
"

if [ "$2" = "cpu" ]; then
5 changes: 2 additions & 3 deletions .github/workflows/phoenix/submit.sh
@@ -23,9 +23,8 @@ sbatch_cpu_opts="\
"

sbatch_gpu_opts="\
#SBATCH -p gpu-v100,gpu-a100,gpu-h100,gpu-l40s
#SBATCH --ntasks-per-node=4 # Number of cores per node required
#SBATCH -G2\
#SBATCH --gres=gpu:H200:2
#SBATCH --ntasks-per-node=8 # Number of cores per node required\
"

if [ "$2" = "cpu" ]; then
23 changes: 21 additions & 2 deletions .github/workflows/test.yml
@@ -134,8 +134,27 @@ jobs:
TEST_ALL: ${{ matrix.mpi == 'mpi' && '--test-all' || '' }}

- name: Test
run: |
/bin/bash mfc.sh test -v --max-attempts 3 -j $(nproc) $TEST_ALL $TEST_PCT
run: |
rm -f tests/failed_uuids.txt
TEST_EXIT=0
/bin/bash mfc.sh test -v --max-attempts 3 -j $(nproc) $TEST_ALL $TEST_PCT || TEST_EXIT=$?

# Retry only if a small number of tests failed (sporadic failures)
if [ -f tests/failed_uuids.txt ]; then
NUM_FAILED=$(wc -l < tests/failed_uuids.txt)
if [ "$NUM_FAILED" -le 5 ]; then
FAILED=$(cat tests/failed_uuids.txt | tr '\n' ' ')
echo ""
echo "=== Retrying $NUM_FAILED failed test(s): $FAILED ==="
echo ""
/bin/bash mfc.sh test -v --max-attempts 3 -j $(nproc) --only $FAILED $TEST_ALL
else
echo "Too many failures ($NUM_FAILED) to retry — likely a real issue."
exit 1
fi
elif [ "$TEST_EXIT" -ne 0 ]; then
exit $TEST_EXIT
fi
env:
TEST_ALL: ${{ matrix.mpi == 'mpi' && '--test-all' || '' }}
TEST_PCT: ${{ matrix.debug == 'debug' && '-% 20' || '' }}
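# If, for instance, tests/failed_uuids.txt held two entries (hypothetical IDs),
# the unquoted $FAILED expands into separate arguments to --only:
#
#   FAILED="1A2B3C4D 5E6F7A8B "
#   /bin/bash mfc.sh test -v --max-attempts 3 -j $(nproc) --only 1A2B3C4D 5E6F7A8B --test-all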