diff --git a/.github/scripts/monitor_slurm_job.sh b/.github/scripts/monitor_slurm_job.sh
index 27472e01ef..4981e5e607 100755
--- a/.github/scripts/monitor_slurm_job.sh
+++ b/.github/scripts/monitor_slurm_job.sh
@@ -4,11 +4,17 @@

 set -euo pipefail

-# Cleanup handler to prevent orphaned tail processes
+# Cleanup handler to prevent orphaned tail processes and cancel orphaned jobs
 cleanup() {
     if [ -n "${tail_pid:-}" ]; then
         kill "${tail_pid}" 2>/dev/null || true
     fi
+    # Cancel the SLURM job if the monitor is exiting due to an error
+    # (e.g., the CI runner is being killed). Don't cancel on success.
+    if [ "${monitor_success:-0}" -ne 1 ] && [ -n "${job_id:-}" ]; then
+        echo "Monitor exiting abnormally — cancelling SLURM job $job_id"
+        scancel "$job_id" 2>/dev/null || true
+    fi
 }
 trap cleanup EXIT

@@ -23,30 +29,78 @@ output_file="$2"
 echo "Submitted batch job $job_id"
 echo "Monitoring output file: $output_file"

-# Wait for file to appear with retry logic for transient squeue failures
+# Robustly check SLURM job state using squeue with sacct fallback.
+# Returns the state string (PENDING, RUNNING, COMPLETED, FAILED, etc.)
+# or "UNKNOWN" if both commands fail.
+get_job_state() {
+    local jid="$1"
+    local state
+
+    # Try squeue first (fast, works for active jobs)
+    state=$(squeue -j "$jid" -h -o '%T' 2>/dev/null | head -n1 | tr -d ' ' || true)
+    if [ -n "$state" ]; then
+        echo "$state"
+        return
+    fi
+
+    # Fallback to sacct (works for completed/historical jobs)
+    if command -v sacct >/dev/null 2>&1; then
+        state=$(sacct -j "$jid" -n -X -P -o State 2>/dev/null | head -n1 | cut -d'|' -f1 || true)
+        if [ -n "$state" ]; then
+            echo "$state"
+            return
+        fi
+    fi
+
+    echo "UNKNOWN"
+}
+
+# Check if a state is terminal (job is done, for better or worse)
+is_terminal_state() {
+    case "$1" in
+        COMPLETED|FAILED|CANCELLED|CANCELLED+|TIMEOUT|OUT_OF_MEMORY|NODE_FAIL|PREEMPTED|BOOT_FAIL|DEADLINE)
+            return 0 ;;
+        *)
+            return 1 ;;
+    esac
+}
+
+# Wait for file to appear, using robust state checking.
+# Never give up due to transient squeue/sacct failures — the CI job timeout
+# is the ultimate backstop.
 echo "Waiting for job to start..."
-squeue_retries=0
-max_squeue_retries=5
+unknown_count=0
 while [ ! -f "$output_file" ]; do
-    # Check if job is still queued/running
-    if squeue -j "$job_id" &>/dev/null; then
-        squeue_retries=0  # Reset on success
-        sleep 5
-    else
-        squeue_retries=$((squeue_retries + 1))
-        if [ $squeue_retries -ge $max_squeue_retries ]; then
-            # Job not in queue and output file doesn't exist
-            if [ ! -f "$output_file" ]; then
-                echo "ERROR: Job $job_id not in queue and output file not created"
+    state=$(get_job_state "$job_id")
+
+    case "$state" in
+        PENDING|CONFIGURING)
+            unknown_count=0
+            sleep 5
+            ;;
+        RUNNING|COMPLETING)
+            unknown_count=0
+            # Job is running but output file not yet visible (NFS delay)
+            sleep 2
+            ;;
+        UNKNOWN)
+            unknown_count=$((unknown_count + 1))
+            # Only print warning periodically to avoid log spam
+            if [ $((unknown_count % 12)) -eq 1 ]; then
+                echo "Warning: Could not query job $job_id state (SLURM may be temporarily unavailable)..."
+            fi
+            sleep 5
+            ;;
+        *)
+            # Terminal state — job finished without creating output
+            if is_terminal_state "$state"; then
+                echo "ERROR: Job $job_id reached terminal state ($state) without creating output file"
                 exit 1
             fi
-            break
-        fi
-        # Exponential backoff
-        sleep_time=$((2 ** squeue_retries))
-        echo "Warning: squeue check failed, retrying in ${sleep_time}s..."
-        sleep $sleep_time
-    fi
+            # Unrecognized state, keep waiting
+            sleep 5
+            ;;
+    esac
 done

 echo "=== Streaming output for job $job_id ==="
@@ -57,14 +111,13 @@ exec 3< <(stdbuf -oL -eL tail -f "$output_file" 2>&1)
 tail_pid=$!

 # Monitor job status and stream output simultaneously
-squeue_failures=0
 last_heartbeat=$(date +%s)

 while true; do
     # Try to read from tail output (non-blocking via timeout)
     # Read multiple lines if available to avoid falling behind
     lines_read=0
-    while IFS= read -r -t 0.1 line <&3 2>/dev/null; do
+    while IFS= read -r -t 1 line <&3 2>/dev/null; do
         echo "$line"
         lines_read=$((lines_read + 1))
         last_heartbeat=$(date +%s)
@@ -73,41 +126,22 @@ while true; do
             break
         fi
     done
-
+
     # Check job status
     current_time=$(date +%s)
-    if ! squeue -j "$job_id" &>/dev/null; then
-        squeue_failures=$((squeue_failures + 1))
-        # Check if job actually completed using sacct (if available)
-        if [ $squeue_failures -ge 3 ]; then
-            if command -v sacct >/dev/null 2>&1; then
-                state=$(sacct -j "$job_id" --format=State --noheader 2>/dev/null | head -n1 | awk '{print $1}')
-                # Consider job done only if it reached a terminal state
-                case "$state" in
-                    COMPLETED|FAILED|CANCELLED|TIMEOUT|OUT_OF_MEMORY)
-                        echo "[$(date +%H:%M:%S)] Job $job_id reached terminal state: $state"
-                        break
-                        ;;
-                    *)
-                        # treat as transient failure, reset failures and continue polling
-                        squeue_failures=0
-                        ;;
-                esac
-            else
-                # No sacct: assume job completed after 3 failures
-                echo "[$(date +%H:%M:%S)] Job $job_id no longer in queue"
-                break
-            fi
-        fi
+    state=$(get_job_state "$job_id")
+
+    if is_terminal_state "$state"; then
+        echo "[$(date +%H:%M:%S)] Job $job_id reached terminal state: $state"
+        break
     else
-        squeue_failures=0
         # Print heartbeat if no output for 60 seconds
         if [ $((current_time - last_heartbeat)) -ge 60 ]; then
-            echo "[$(date +%H:%M:%S)] Job $job_id still running (no new output for 60s)..."
+            echo "[$(date +%H:%M:%S)] Job $job_id state=$state (no new output for 60s)..."
            last_heartbeat=$current_time
        fi
    fi
-
+
    # Sleep briefly between status checks
    sleep 1
 done
@@ -115,7 +149,7 @@ done
 # Drain any remaining output from tail after job completes
 echo "Draining remaining output..."
 drain_count=0
-while IFS= read -r -t 0.5 line <&3 2>/dev/null; do
+while IFS= read -r -t 1 line <&3 2>/dev/null; do
     echo "$line"
     drain_count=$((drain_count + 1))
     # Safety limit to avoid infinite loop
@@ -128,8 +162,9 @@ done
 # Close the file descriptor and kill tail
 exec 3<&-
 kill "${tail_pid}" 2>/dev/null || true
+tail_pid=""

-# Wait for output file to finish growing (stabilize) before stopping tail
+# Wait for output file to stabilize (NFS flush) before final read
 if [ -f "$output_file" ]; then
     last_size=-1
     same_count=0
@@ -149,9 +184,6 @@ if [ -f "$output_file" ]; then
     done
 fi

-# Stop tailing (trap will also handle this on exit)
-kill "${tail_pid}" 2>/dev/null || true
-
 echo ""
 echo "=== Final output ==="
 cat "$output_file"
@@ -187,6 +219,6 @@ if [ "$exit_code" != "0:0" ]; then
     exit 1
 fi

+monitor_success=1
 echo "Job $job_id completed successfully"
 exit 0
-
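Usage note (outside the patch): a minimal sketch of how a submit script might drive the updated monitor. The actual invocation lives in the Phoenix submit scripts and is not shown in this patch, so the "--parsable" submission and the output file name below are assumptions for illustration only.

    #!/usr/bin/env bash
    set -euo pipefail

    # Submit the batch script and capture the job id; sbatch --parsable
    # prints just the id, so no "Submitted batch job ..." parsing is needed.
    job_id=$(sbatch --parsable bench.sbatch)   # bench.sbatch is a hypothetical name

    # Hypothetical output path; it must match the #SBATCH --output setting
    # used by the batch script.
    output_file="bench-${job_id}.out"

    # The monitor streams the file, polls squeue/sacct via get_job_state(),
    # and scancels the job from its EXIT trap if it exits before success.
    .github/scripts/monitor_slurm_job.sh "$job_id" "$output_file"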
diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml
index 6279f5f578..53efac21ed 100644
--- a/.github/workflows/bench.yml
+++ b/.github/workflows/bench.yml
@@ -1,32 +1,24 @@
 name: 'Benchmark'

 on:
-  # Trigger when Test Suite completes (no polling needed)
-  workflow_run:
-    workflows: ["Test Suite"]
-    types: [completed]
+  pull_request:
+  pull_request_review:
+    types: [submitted]
   workflow_dispatch:

 concurrency:
-  group: ${{ github.workflow }}-${{ github.event.workflow_run.head_branch || github.ref }}
+  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.event_name }}
   cancel-in-progress: true

 jobs:
   file-changes:
     name: Detect File Changes
-    # Only run if Test Suite passed (or manual dispatch)
-    if: github.event_name == 'workflow_dispatch' || github.event.workflow_run.conclusion == 'success'
     runs-on: 'ubuntu-latest'
     outputs:
       checkall: ${{ steps.changes.outputs.checkall }}
-      pr_number: ${{ steps.pr-info.outputs.pr_number }}
-      pr_approved: ${{ steps.pr-info.outputs.approved }}
-      pr_author: ${{ steps.pr-info.outputs.author }}
     steps:
       - name: Clone
         uses: actions/checkout@v4
-        with:
-          ref: ${{ github.event.workflow_run.head_sha || github.sha }}

       - name: Detect Changes
         uses: dorny/paths-filter@v3
@@ -34,52 +26,10 @@ jobs:
         with:
           filters: ".github/file-filter.yml"

-      - name: Get PR Info
-        id: pr-info
-        env:
-          GH_TOKEN: ${{ github.token }}
-        run: |
-          if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
-            echo "pr_number=" >> $GITHUB_OUTPUT
-            echo "approved=true" >> $GITHUB_OUTPUT
-            echo "author=${{ github.actor }}" >> $GITHUB_OUTPUT
-          else
-            # Get PR number from workflow_run
-            PR_NUMBER="${{ github.event.workflow_run.pull_requests[0].number }}"
-            if [ -n "$PR_NUMBER" ]; then
-              echo "pr_number=$PR_NUMBER" >> $GITHUB_OUTPUT
-
-              # Fetch actual PR author from API (workflow_run.actor is the re-runner, not PR author)
-              PR_AUTHOR=$(gh api repos/${{ github.repository }}/pulls/$PR_NUMBER --jq '.user.login')
-              echo "author=$PR_AUTHOR" >> $GITHUB_OUTPUT
-
-              # Check if PR is approved
-              APPROVED=$(gh api repos/${{ github.repository }}/pulls/$PR_NUMBER/reviews \
-                --jq '[.[] | select(.state == "APPROVED")] | length')
-              if [ "$APPROVED" -gt 0 ]; then
-                echo "approved=true" >> $GITHUB_OUTPUT
-              else
-                echo "approved=false" >> $GITHUB_OUTPUT
-              fi
-            else
-              echo "pr_number=" >> $GITHUB_OUTPUT
-              echo "approved=false" >> $GITHUB_OUTPUT
-              echo "author=" >> $GITHUB_OUTPUT
-            fi
-          fi
-
   self:
     name: "${{ matrix.name }} (${{ matrix.device }}${{ matrix.interface != 'none' && format('-{0}', matrix.interface) || '' }})"
-    if: >
-      github.repository == 'MFlowCode/MFC' &&
-      needs.file-changes.outputs.checkall == 'true' &&
-      (
-        github.event_name == 'workflow_dispatch' ||
-        needs.file-changes.outputs.pr_approved == 'true' ||
-        needs.file-changes.outputs.pr_author == 'sbryngelson' ||
-        needs.file-changes.outputs.pr_author == 'wilfonba'
-      )
-    needs: [file-changes]
+    if: ${{ github.repository=='MFlowCode/MFC' && needs.file-changes.outputs.checkall=='true' && ((github.event_name=='pull_request_review' && github.event.review.state=='approved') || (github.event_name=='pull_request' && (github.event.pull_request.user.login=='sbryngelson' || github.event.pull_request.user.login=='wilfonba')) || github.event_name=='workflow_dispatch') }}
+    needs: file-changes
     strategy:
       fail-fast: false
       matrix:
@@ -143,7 +93,6 @@ jobs:
       - name: Clone - PR
         uses: actions/checkout@v4
         with:
-          ref: ${{ github.event.workflow_run.head_sha || github.sha }}
           path: pr

       - name: Clone - Master
@@ -155,7 +104,7 @@ jobs:

       - name: Setup & Build
         if: matrix.build_script != ''
-        run: |
+        run: |
          (cd pr && ${{ matrix.build_script }}) &
          (cd master && ${{ matrix.build_script }}) &
          wait %1 && wait %2
diff --git a/.github/workflows/phoenix/submit-bench.sh b/.github/workflows/phoenix/submit-bench.sh
index 7ae85e66fe..fc28b3046b 100644
--- a/.github/workflows/phoenix/submit-bench.sh
+++ b/.github/workflows/phoenix/submit-bench.sh
@@ -20,9 +20,8 @@ sbatch_cpu_opts="\
 "

 sbatch_gpu_opts="\
-#SBATCH -CL40S
-#SBATCH --ntasks-per-node=4    # Number of cores per node required
-#SBATCH -G2\
+#SBATCH --gres=gpu:H200:2
+#SBATCH --ntasks-per-node=8    # Number of cores per node required\
 "

 if [ "$2" = "cpu" ]; then
diff --git a/.github/workflows/phoenix/submit.sh b/.github/workflows/phoenix/submit.sh
index 06a03e465a..5747c839f0 100755
--- a/.github/workflows/phoenix/submit.sh
+++ b/.github/workflows/phoenix/submit.sh
@@ -23,9 +23,8 @@ sbatch_cpu_opts="\
 "

 sbatch_gpu_opts="\
-#SBATCH -p gpu-v100,gpu-a100,gpu-h100,gpu-l40s
-#SBATCH --ntasks-per-node=4    # Number of cores per node required
-#SBATCH -G2\
+#SBATCH --gres=gpu:H200:2
+#SBATCH --ntasks-per-node=8    # Number of cores per node required\
 "

 if [ "$2" = "cpu" ]; then
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 0be51076ec..21e52d5a5e 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -134,8 +134,27 @@ jobs:
           TEST_ALL: ${{ matrix.mpi == 'mpi' && '--test-all' || '' }}

       - name: Test
-        run: |
-          /bin/bash mfc.sh test -v --max-attempts 3 -j $(nproc) $TEST_ALL $TEST_PCT
+        run: |
+          rm -f tests/failed_uuids.txt
+          TEST_EXIT=0
+          /bin/bash mfc.sh test -v --max-attempts 3 -j $(nproc) $TEST_ALL $TEST_PCT || TEST_EXIT=$?
+
+          # Retry only if a small number of tests failed (sporadic failures)
+          if [ -f tests/failed_uuids.txt ]; then
+            NUM_FAILED=$(wc -l < tests/failed_uuids.txt)
+            if [ "$NUM_FAILED" -le 5 ]; then
+              FAILED=$(cat tests/failed_uuids.txt | tr '\n' ' ')
+              echo ""
+              echo "=== Retrying $NUM_FAILED failed test(s): $FAILED ==="
+              echo ""
+              /bin/bash mfc.sh test -v --max-attempts 3 -j $(nproc) --only $FAILED $TEST_ALL
+            else
+              echo "Too many failures ($NUM_FAILED) to retry — likely a real issue."
+              exit 1
+            fi
+          elif [ "$TEST_EXIT" -ne 0 ]; then
+            exit $TEST_EXIT
+          fi
         env:
           TEST_ALL: ${{ matrix.mpi == 'mpi' && '--test-all' || '' }}
           TEST_PCT: ${{ matrix.debug == 'debug' && '-% 20' || '' }}
diff --git a/toolchain/mfc/common.py b/toolchain/mfc/common.py
index ce02e8251c..e56c6a9eb4 100644
--- a/toolchain/mfc/common.py
+++ b/toolchain/mfc/common.py
@@ -1,4 +1,4 @@
-import os, yaml, typing, shutil, subprocess, logging
+import os, yaml, typing, shutil, subprocess, logging, time

 from os.path import join, abspath, normpath, dirname, realpath

@@ -122,8 +122,16 @@ def create_directory(dirpath: str) -> None:


 def delete_directory(dirpath: str) -> None:
-    if os.path.isdir(dirpath):
-        shutil.rmtree(dirpath)
+    for attempt in range(5):
+        if not os.path.isdir(dirpath):
+            return
+        try:
+            shutil.rmtree(dirpath)
+            return
+        except OSError:
+            if attempt == 4:
+                raise
+            time.sleep(1)


 def get_program_output(arguments: typing.List[str] = None, cwd=None):
diff --git a/toolchain/mfc/test/test.py b/toolchain/mfc/test/test.py
index 31a3771cb9..d6dce92436 100644
--- a/toolchain/mfc/test/test.py
+++ b/toolchain/mfc/test/test.py
@@ -206,6 +206,15 @@ def test():
     # Build the summary report
     _print_test_summary(nPASS, nFAIL, nSKIP, minutes, seconds, failed_tests, skipped_cases)

+    # Write failed UUIDs to file for CI retry logic
+    failed_uuids_path = os.path.join(common.MFC_TEST_DIR, "failed_uuids.txt")
+    if failed_tests:
+        with open(failed_uuids_path, "w") as f:
+            for test_info in failed_tests:
+                f.write(test_info['uuid'] + "\n")
+    elif os.path.exists(failed_uuids_path):
+        os.remove(failed_uuids_path)
+
     exit(nFAIL)
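Usage note (outside the patch): the failed-UUID handoff added in test.py and test.yml can also be exercised by hand. The sketch below uses only flags that appear in this patch (--only, --max-attempts, -j) and the tests/failed_uuids.txt path written by test.py; treat it as an illustration, not a documented workflow.

    #!/usr/bin/env bash
    set -euo pipefail

    # First pass: run the suite; when tests fail, test.py now writes
    # tests/failed_uuids.txt with one UUID per line.
    ./mfc.sh test -j "$(nproc)" || true

    # Second pass: re-run only the failed cases, mirroring the CI retry step.
    # Word splitting of the UUID list is intentional, as in the workflow.
    if [ -f tests/failed_uuids.txt ]; then
        ./mfc.sh test --max-attempts 3 -j "$(nproc)" --only $(tr '\n' ' ' < tests/failed_uuids.txt)
    fi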