From ea707d931636faf60865a803f80ff1d85002b446 Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson
Date: Thu, 12 Feb 2026 19:43:27 -0500
Subject: [PATCH 1/4] Add --deps-only flag to separate dependency fetching
 from source builds

This allows CI to fetch and build dependencies (FFTW, HDF5, etc.) on
login nodes with internet access, then build MFC source code on compute
nodes that may have no network connectivity.

Key changes:
- New `--deps-only` CLI flag for `./mfc.sh build`
- Already-configured dependencies are skipped entirely during regular
  builds, so the source build step needs no network access
- All clusters (Phoenix, Frontier, Frontier AMD) now follow the same
  pattern: deps on login node, source build + test on compute node
  (sketched below)
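
Roughly, the intended two-phase flow (illustrated with the GPU/OpenACC
flags the CI scripts pass; other build_opts combinations behave the same):

    # Login node (has internet): fetch and build third-party deps only
    ./mfc.sh build --deps-only -j 8 --gpu acc

    # Compute node (possibly offline): build MFC itself; already-configured
    # deps are skipped entirely, so no network access is needed
    ./mfc.sh build -j 8 --gpu acc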
Co-Authored-By: Claude Opus 4.6
---
 .github/workflows/bench.yml             | 17 +++++-----
 .github/workflows/frontier/bench.sh     | 38 ++++++++++++++++++++---
 .github/workflows/frontier/build.sh     | 41 +++----------------------
 .github/workflows/frontier/test.sh      | 22 +++++++++++++
 .github/workflows/frontier_amd/bench.sh | 38 ++++++++++++++++++++---
 .github/workflows/frontier_amd/build.sh | 41 +++----------------------
 .github/workflows/frontier_amd/test.sh  | 22 +++++++++++++
 .github/workflows/phoenix/build.sh      | 20 ++++++++++++
 .github/workflows/test.yml              | 11 +++----
 toolchain/mfc/build.py                  | 26 ++++++++++++++++
 toolchain/mfc/cli/commands.py           |  7 +++++
 11 files changed, 186 insertions(+), 97 deletions(-)
 create mode 100644 .github/workflows/phoenix/build.sh

diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml
index 6279f5f578..ac2f026dab 100644
--- a/.github/workflows/bench.yml
+++ b/.github/workflows/bench.yml
@@ -91,7 +91,7 @@ jobs:
           flag: p
           device: cpu
           interface: none
-          build_script: ""
+          build_script: "bash .github/workflows/phoenix/build.sh cpu none"
         - cluster: phoenix
           name: Georgia Tech | Phoenix (NVHPC)
           group: phoenix
@@ -99,7 +99,7 @@
           flag: p
           device: gpu
           interface: acc
-          build_script: ""
+          build_script: "bash .github/workflows/phoenix/build.sh gpu acc"
         - cluster: phoenix
           name: Georgia Tech | Phoenix (NVHPC)
           group: phoenix
@@ -107,7 +107,7 @@
           flag: p
           device: gpu
           interface: omp
-          build_script: ""
+          build_script: "bash .github/workflows/phoenix/build.sh gpu omp"
         - cluster: frontier
           name: Oak Ridge | Frontier (CCE)
           group: phoenix
@@ -115,7 +115,7 @@
           flag: f
           device: gpu
           interface: acc
-          build_script: "bash .github/workflows/frontier/build.sh gpu acc bench"
+          build_script: "bash .github/workflows/frontier/build.sh gpu acc"
         - cluster: frontier
           name: Oak Ridge | Frontier (CCE)
           group: phoenix
@@ -123,7 +123,7 @@
           flag: f
           device: gpu
           interface: omp
-          build_script: "bash .github/workflows/frontier/build.sh gpu omp bench"
+          build_script: "bash .github/workflows/frontier/build.sh gpu omp"
         - cluster: frontier_amd
           name: Oak Ridge | Frontier (AMD)
           group: phoenix
@@ -131,7 +131,7 @@
           flag: famd
           device: gpu
           interface: omp
-          build_script: "bash .github/workflows/frontier_amd/build.sh gpu omp bench"
+          build_script: "bash .github/workflows/frontier_amd/build.sh gpu omp"
     runs-on:
       group: ${{ matrix.group }}
       labels: ${{ matrix.labels }}
@@ -153,9 +153,8 @@
           ref: master
           path: master
 
-      - name: Setup & Build
-        if: matrix.build_script != ''
-        run: |
+      - name: Fetch Dependencies
+        run: |
           (cd pr && ${{ matrix.build_script }}) &
           (cd master && ${{ matrix.build_script }}) &
           wait %1 && wait %2
diff --git a/.github/workflows/frontier/bench.sh b/.github/workflows/frontier/bench.sh
index 35b4c5950e..13bbbddc2c 100644
--- a/.github/workflows/frontier/bench.sh
+++ b/.github/workflows/frontier/bench.sh
@@ -1,20 +1,50 @@
 #!/bin/bash
 
 n_ranks=12
+build_opts=""
 device_opts=""
 if [ "$job_device" = "gpu" ]; then
     gpus=$(rocm-smi --showid | awk '{print $1}' | grep -Eo '[0-9]+' | uniq | tr '\n' ' ')
     n_ranks=$(echo "$gpus" | wc -w)                                    # number of GPUs on node
     gpu_ids=$(echo "$gpus" | tr ' ' '\n' | tr '\n' ' ' | sed 's/ $//') # GPU IDs from rocm-smi
-    device_opts+="--gpu"
+    build_opts+="--gpu"
     if [ "$job_interface" = "acc" ]; then
-        device_opts+=" acc"
+        build_opts+=" acc"
     elif [ "$job_interface" = "omp" ]; then
-        device_opts+=" mp"
+        build_opts+=" mp"
     fi
-    device_opts+=" -g $gpu_ids"
+    device_opts="$build_opts -g $gpu_ids"
 fi
 
+# Build case-optimized binaries on compute node (deps already fetched on login node)
+max_attempts=3
+attempt=1
+while [ $attempt -le $max_attempts ]; do
+    echo "Build attempt $attempt of $max_attempts..."
+    build_cmd_ok=true
+    for dir in benchmarks/*/; do
+        if ! ./mfc.sh run -v "$dir/case.py" --case-optimization -j 8 --dry-run $build_opts; then
+            build_cmd_ok=false
+            break
+        fi
+    done
+
+    if [ "$build_cmd_ok" = true ]; then
+        echo "Build succeeded on attempt $attempt."
+        break
+    fi
+
+    if [ $attempt -lt $max_attempts ]; then
+        echo "Build failed on attempt $attempt. Cleaning and retrying in 30s..."
+        ./mfc.sh clean
+        sleep 30
+    else
+        echo "Build failed after $max_attempts attempts."
+        exit 1
+    fi
+    attempt=$((attempt + 1))
+done
+
 if [ "$job_device" = "gpu" ]; then
     ./mfc.sh bench --mem 12 -j $n_ranks -o "$job_slug.yaml" -- -c frontier $device_opts -n $n_ranks
 else
diff --git a/.github/workflows/frontier/build.sh b/.github/workflows/frontier/build.sh
index 18cddc96ca..84b67020ca 100644
--- a/.github/workflows/frontier/build.sh
+++ b/.github/workflows/frontier/build.sh
@@ -1,11 +1,13 @@
 #!/bin/bash
 
+# Fetch dependencies on login node (internet access).
+# Source code is built on compute nodes via test.sh / bench.sh.
+
 # Ignore SIGHUP to survive login node session drops
 trap '' HUP
 
 job_device=$1
 job_interface=$2
-run_bench=$3
 build_opts=""
 if [ "$job_device" = "gpu" ]; then
     build_opts+="--gpu"
@@ -18,39 +20,4 @@ fi
 
 . ./mfc.sh load -c f -m g
 
-max_attempts=3
-attempt=1
-while [ $attempt -le $max_attempts ]; do
-    echo "Build attempt $attempt of $max_attempts..."
-    if [ "$run_bench" == "bench" ]; then
-        build_cmd_ok=true
-        for dir in benchmarks/*/; do
-            dirname=$(basename "$dir")
-            if ! ./mfc.sh run -v "$dir/case.py" --case-optimization -j 8 --dry-run $build_opts; then
-                build_cmd_ok=false
-                break
-            fi
-        done
-    else
-        if ./mfc.sh test -v -a --dry-run --rdma-mpi -j 8 $build_opts; then
-            build_cmd_ok=true
-        else
-            build_cmd_ok=false
-        fi
-    fi
-
-    if [ "$build_cmd_ok" = true ]; then
-        echo "Build succeeded on attempt $attempt."
-        exit 0
-    fi
-
-    if [ $attempt -lt $max_attempts ]; then
-        echo "Build failed on attempt $attempt. Cleaning and retrying in 30s..."
-        ./mfc.sh clean
-        sleep 30
-    fi
-    attempt=$((attempt + 1))
-done
-
-echo "Build failed after $max_attempts attempts."
-exit 1
+./mfc.sh build --deps-only -j 8 $build_opts
diff --git a/.github/workflows/frontier/test.sh b/.github/workflows/frontier/test.sh
index 17fbbaf8e5..ec790944c0 100644
--- a/.github/workflows/frontier/test.sh
+++ b/.github/workflows/frontier/test.sh
@@ -13,6 +13,28 @@ if [ "$job_device" = "gpu" ]; then
     fi
 fi
 
+# Build source code on compute node (deps already fetched on login node)
+max_attempts=3
+attempt=1
+while [ $attempt -le $max_attempts ]; do
+    echo "Build attempt $attempt of $max_attempts..."
+    if ./mfc.sh test -v -a --dry-run --rdma-mpi -j 8 $device_opts; then
+        echo "Build succeeded on attempt $attempt."
+        break
+    fi
+
+    if [ $attempt -lt $max_attempts ]; then
+        echo "Build failed on attempt $attempt. Cleaning and retrying in 30s..."
+        ./mfc.sh clean
+        sleep 30
+    else
+        echo "Build failed after $max_attempts attempts."
+        exit 1
+    fi
+    attempt=$((attempt + 1))
+done
+
+# Run tests
 if [ "$job_device" = "gpu" ]; then
     ./mfc.sh test -v -a --rdma-mpi --max-attempts 3 -j $ngpus $device_opts -- -c frontier
 else
diff --git a/.github/workflows/frontier_amd/bench.sh b/.github/workflows/frontier_amd/bench.sh
index 6e01687e79..fd263dd800 100644
--- a/.github/workflows/frontier_amd/bench.sh
+++ b/.github/workflows/frontier_amd/bench.sh
@@ -1,20 +1,50 @@
 #!/bin/bash
 
 n_ranks=12
+build_opts=""
 device_opts=""
 if [ "$job_device" = "gpu" ]; then
     gpus=$(rocm-smi --showid | awk '{print $1}' | grep -Eo '[0-9]+' | uniq | tr '\n' ' ')
     n_ranks=$(echo "$gpus" | wc -w)                                    # number of GPUs on node
     gpu_ids=$(echo "$gpus" | tr ' ' '\n' | tr '\n' ' ' | sed 's/ $//') # GPU IDs from rocm-smi
-    device_opts+="--gpu"
+    build_opts+="--gpu"
     if [ "$job_interface" = "acc" ]; then
-        device_opts+=" acc"
+        build_opts+=" acc"
     elif [ "$job_interface" = "omp" ]; then
-        device_opts+=" mp"
+        build_opts+=" mp"
     fi
-    device_opts+=" -g $gpu_ids"
+    device_opts="$build_opts -g $gpu_ids"
 fi
 
+# Build case-optimized binaries on compute node (deps already fetched on login node)
+max_attempts=3
+attempt=1
+while [ $attempt -le $max_attempts ]; do
+    echo "Build attempt $attempt of $max_attempts..."
+    build_cmd_ok=true
+    for dir in benchmarks/*/; do
+        if ! ./mfc.sh run -v "$dir/case.py" --case-optimization -j 8 --dry-run $build_opts; then
+            build_cmd_ok=false
+            break
+        fi
+    done
+
+    if [ "$build_cmd_ok" = true ]; then
+        echo "Build succeeded on attempt $attempt."
+        break
+    fi
+
+    if [ $attempt -lt $max_attempts ]; then
+        echo "Build failed on attempt $attempt. Cleaning and retrying in 30s..."
+        ./mfc.sh clean
+        sleep 30
+    else
+        echo "Build failed after $max_attempts attempts."
+        exit 1
+    fi
+    attempt=$((attempt + 1))
+done
+
 if [ "$job_device" = "gpu" ]; then
     ./mfc.sh bench --mem 12 -j $n_ranks -o "$job_slug.yaml" -- -c frontier_amd $device_opts -n $n_ranks
 else
diff --git a/.github/workflows/frontier_amd/build.sh b/.github/workflows/frontier_amd/build.sh
index 56c47d8ff4..1b120ae6f7 100644
--- a/.github/workflows/frontier_amd/build.sh
+++ b/.github/workflows/frontier_amd/build.sh
@@ -1,11 +1,13 @@
 #!/bin/bash
 
+# Fetch dependencies on login node (internet access).
+# Source code is built on compute nodes via test.sh / bench.sh.
+
 # Ignore SIGHUP to survive login node session drops
 trap '' HUP
 
 job_device=$1
 job_interface=$2
-run_bench=$3
 build_opts=""
 if [ "$job_device" = "gpu" ]; then
     build_opts+="--gpu"
@@ -18,39 +20,4 @@ fi
 
 . ./mfc.sh load -c famd -m g
 
-max_attempts=3
-attempt=1
-while [ $attempt -le $max_attempts ]; do
-    echo "Build attempt $attempt of $max_attempts..."
-    if [ "$run_bench" == "bench" ]; then
-        build_cmd_ok=true
-        for dir in benchmarks/*/; do
-            dirname=$(basename "$dir")
-            if ! ./mfc.sh run -v "$dir/case.py" --case-optimization -j 8 --dry-run $build_opts; then
-                build_cmd_ok=false
-                break
-            fi
-        done
-    else
-        if ./mfc.sh test -v -a --dry-run -j 8 $build_opts; then
-            build_cmd_ok=true
-        else
-            build_cmd_ok=false
-        fi
-    fi
-
-    if [ "$build_cmd_ok" = true ]; then
-        echo "Build succeeded on attempt $attempt."
-        exit 0
-    fi
-
-    if [ $attempt -lt $max_attempts ]; then
-        echo "Build failed on attempt $attempt. Cleaning and retrying in 30s..."
-        ./mfc.sh clean
-        sleep 30
-    fi
-    attempt=$((attempt + 1))
-done
-
-echo "Build failed after $max_attempts attempts."
-exit 1
+./mfc.sh build --deps-only -j 8 $build_opts
diff --git a/.github/workflows/frontier_amd/test.sh b/.github/workflows/frontier_amd/test.sh
index ff65aa2b0e..654072a754 100644
--- a/.github/workflows/frontier_amd/test.sh
+++ b/.github/workflows/frontier_amd/test.sh
@@ -13,6 +13,28 @@ if [ "$job_device" = "gpu" ]; then
     fi
 fi
 
+# Build source code on compute node (deps already fetched on login node)
+max_attempts=3
+attempt=1
+while [ $attempt -le $max_attempts ]; do
+    echo "Build attempt $attempt of $max_attempts..."
+    if ./mfc.sh test -v -a --dry-run -j 8 $device_opts; then
+        echo "Build succeeded on attempt $attempt."
+        break
+    fi
+
+    if [ $attempt -lt $max_attempts ]; then
+        echo "Build failed on attempt $attempt. Cleaning and retrying in 30s..."
+        ./mfc.sh clean
+        sleep 30
+    else
+        echo "Build failed after $max_attempts attempts."
+        exit 1
+    fi
+    attempt=$((attempt + 1))
+done
+
+# Run tests
 if [ "$job_device" = "gpu" ]; then
     ./mfc.sh test -v -a --max-attempts 3 -j $ngpus $device_opts -- -c frontier_amd
 else
diff --git a/.github/workflows/phoenix/build.sh b/.github/workflows/phoenix/build.sh
new file mode 100644
index 0000000000..75bf6fc8a8
--- /dev/null
+++ b/.github/workflows/phoenix/build.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+
+# Fetch dependencies on login node (internet access).
+# Source code is built on compute nodes via test.sh / bench.sh.
+
+job_device=$1
+job_interface=$2
+build_opts=""
+if [ "$job_device" = "gpu" ]; then
+    build_opts+="--gpu"
+    if [ "$job_interface" = "acc" ]; then
+        build_opts+=" acc"
+    elif [ "$job_interface" = "omp" ]; then
+        build_opts+=" mp"
+    fi
+fi
+
+. ./mfc.sh load -c p -m $job_device
+
+./mfc.sh build --deps-only -j $(nproc) $build_opts
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 0be51076ec..d79e61fecd 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -149,7 +149,7 @@ jobs:
     strategy:
       matrix:
         include:
-          # Phoenix (GT) — build+test combined in SLURM job
+          # Phoenix (GT) — deps on login node, build+test in SLURM job
          - runner: 'gt'
            cluster: 'phoenix'
            cluster_name: 'Georgia Tech | Phoenix'
@@ -165,7 +165,7 @@
            cluster_name: 'Georgia Tech | Phoenix'
            device: 'cpu'
            interface: 'none'
-          # Frontier (ORNL) — build on login node, test via SLURM
+          # Frontier (ORNL) — deps on login node, build+test via SLURM
          - runner: 'frontier'
            cluster: 'frontier'
            cluster_name: 'Oak Ridge | Frontier'
@@ -181,7 +181,7 @@
            cluster_name: 'Oak Ridge | Frontier'
            device: 'cpu'
            interface: 'none'
-          # Frontier AMD — build on login node, test via SLURM
+          # Frontier AMD — deps on login node, build+test via SLURM
          - runner: 'frontier'
            cluster: 'frontier_amd'
            cluster_name: 'Oak Ridge | Frontier (AMD)'
@@ -203,11 +203,10 @@
      - name: Clone
        uses: actions/checkout@v4
 
-      - name: Build
-        if: matrix.cluster != 'phoenix'
+      - name: Fetch Dependencies
        run: bash .github/workflows/${{ matrix.cluster }}/build.sh ${{ matrix.device }} ${{ matrix.interface }}
 
-      - name: Test
+      - name: Build & Test
        run: bash .github/workflows/${{ matrix.cluster }}/submit.sh .github/workflows/${{ matrix.cluster }}/test.sh ${{ matrix.device }} ${{ matrix.interface }}
 
      - name: Print Logs
diff --git a/toolchain/mfc/build.py b/toolchain/mfc/build.py
index 6430f7ad35..e55ba8c8f1 100644
--- a/toolchain/mfc/build.py
+++ b/toolchain/mfc/build.py
@@ -566,6 +566,12 @@ def __build_target(target: typing.Union[MFCTarget, str], case: input.MFCInputFil
     history.add(target.name)
 
+    # Dependencies are pinned to fixed versions. If already configured
+    # (built & installed by a prior --deps-only step), skip entirely
+    # to avoid re-entering the superbuild (which may access the network).
+    if target.isDependency and target.is_configured(case):
+        return
+
     for dep in target.requires.compute():
         # If we have already built and installed this target,
         # do not do so again. This can be inferred by whether
@@ -611,6 +617,26 @@ def build(targets = None, case: input.MFCInputFile = None, history: typing.Set[s
     case = case or input.load(ARG("input"), ARG("--"), {})
     case.validate_params()
 
+    if ARG("deps_only", False):
+        all_deps = set()
+        for target in targets:
+            target = get_target(target)
+            for dep in target.requires.compute():
+                all_deps.add(dep)
+
+        if len(history) == 0:
+            cons.print(f"[bold]Fetch Dependencies | {format_list_to_string([d.name for d in all_deps], 'magenta', 'None')}[/bold]")
+            cons.print(no_indent=True)
+
+        if not all_deps:
+            cons.print("[yellow]No dependencies to build for the requested targets.[/yellow]")
+            return
+
+        for dep in all_deps:
+            __build_target(dep, case, history)
+
+        return
+
     if len(history) == 0:
         cons.print(__generate_header(case, targets))
         cons.print(no_indent=True)
diff --git a/toolchain/mfc/cli/commands.py b/toolchain/mfc/cli/commands.py
index 8ad8c4bd07..bb9fddb76d 100644
--- a/toolchain/mfc/cli/commands.py
+++ b/toolchain/mfc/cli/commands.py
@@ -154,6 +154,13 @@
             default=False,
             dest="case_optimization",
         ),
+        Argument(
+            name="deps-only",
+            help="Only fetch and build dependencies, do not build MFC targets.",
+            action=ArgAction.STORE_TRUE,
+            default=False,
+            dest="deps_only",
+        ),
     ],
     examples=[
         Example("./mfc.sh build", "Build all default targets (CPU)"),

From 2474b4640d138b81c8dfd8b6dc5bc6b11f4ec1d7 Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson
Date: Thu, 12 Feb 2026 19:56:56 -0500
Subject: [PATCH 2/4] Address review feedback: sort deps, fix --deps-only
 -t <target>, add --no-gpu consistency

- Sort dependency list for deterministic build order and log output
- Include dependency targets themselves in --deps-only (fixes
  `--deps-only -t fftw` doing nothing; see the example below)
- Remove redundant get_target() call (targets already resolved)
- Add trap '' HUP to phoenix/build.sh for SSH resilience
- Add --no-gpu to CPU dry-run builds in frontier test scripts for
  consistency
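
A quick way to exercise the -t fix (before this change the command below
did nothing, since only each requested target's own dependencies were
collected, and a dependency target like fftw contributes none):

    ./mfc.sh build --deps-only -t fftw -j 8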
Co-Authored-By: Claude Opus 4.6
---
 .github/workflows/frontier/test.sh     |  2 ++
 .github/workflows/frontier_amd/test.sh |  2 ++
 .github/workflows/phoenix/build.sh     |  3 +++
 toolchain/mfc/build.py                 | 11 +++++++----
 4 files changed, 14 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/frontier/test.sh b/.github/workflows/frontier/test.sh
index ec790944c0..5ba2d18c1a 100644
--- a/.github/workflows/frontier/test.sh
+++ b/.github/workflows/frontier/test.sh
@@ -11,6 +11,8 @@ if [ "$job_device" = "gpu" ]; then
     elif [ "$job_interface" = "omp" ]; then
         device_opts+=" mp"
     fi
+else
+    device_opts+=" --no-gpu"
 fi
 
 # Build source code on compute node (deps already fetched on login node)
diff --git a/.github/workflows/frontier_amd/test.sh b/.github/workflows/frontier_amd/test.sh
index 654072a754..a3f9bd21fa 100644
--- a/.github/workflows/frontier_amd/test.sh
+++ b/.github/workflows/frontier_amd/test.sh
@@ -11,6 +11,8 @@ if [ "$job_device" = "gpu" ]; then
     elif [ "$job_interface" = "omp" ]; then
         device_opts+=" mp"
     fi
+else
+    device_opts+=" --no-gpu"
 fi
 
 # Build source code on compute node (deps already fetched on login node)
diff --git a/.github/workflows/phoenix/build.sh b/.github/workflows/phoenix/build.sh
index 75bf6fc8a8..8a850fca6f 100644
--- a/.github/workflows/phoenix/build.sh
+++ b/.github/workflows/phoenix/build.sh
@@ -3,6 +3,9 @@
 # Fetch dependencies on login node (internet access).
 # Source code is built on compute nodes via test.sh / bench.sh.
 
+# Ignore SIGHUP to survive login node session drops
+trap '' HUP
+
 job_device=$1
 job_interface=$2
 build_opts=""
diff --git a/toolchain/mfc/build.py b/toolchain/mfc/build.py
index e55ba8c8f1..8ed5e693b0 100644
--- a/toolchain/mfc/build.py
+++ b/toolchain/mfc/build.py
@@ -620,19 +620,22 @@ def build(targets = None, case: input.MFCInputFile = None, history: typing.Set[s
     if ARG("deps_only", False):
         all_deps = set()
         for target in targets:
-            target = get_target(target)
+            if target.isDependency:
+                all_deps.add(target)
             for dep in target.requires.compute():
                 all_deps.add(dep)
 
+        sorted_deps = sorted(all_deps, key=lambda t: t.name)
+
         if len(history) == 0:
-            cons.print(f"[bold]Fetch Dependencies | {format_list_to_string([d.name for d in all_deps], 'magenta', 'None')}[/bold]")
+            cons.print(f"[bold]Fetch Dependencies | {format_list_to_string([d.name for d in sorted_deps], 'magenta', 'None')}[/bold]")
             cons.print(no_indent=True)
 
-        if not all_deps:
+        if not sorted_deps:
             cons.print("[yellow]No dependencies to build for the requested targets.[/yellow]")
             return
 
-        for dep in all_deps:
+        for dep in sorted_deps:
             __build_target(dep, case, history)
 
         return

From 7179e09f58b7ea2f9e01b3b54a33e8869cf7128e Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson
Date: Thu, 12 Feb 2026 20:34:08 -0500
Subject: [PATCH 3/4] Fix retry clean wiping pre-fetched deps; harden dep
 skip check

- Clean only source targets (not deps) in compute-node retry loops, so
  pre-fetched dependencies survive build failures on offline nodes (see
  the example below)
- Also check install dir exists (not just CMakeCache.txt) before
  skipping a dependency, guarding against configure-ok-but-build-failed
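
The compute-node retry now runs a targeted clean, e.g.:

    # third-party deps (FFTW, HDF5, ...) fetched by --deps-only stay installed
    ./mfc.sh clean -t pre_process simulation post_process syscheck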
Co-Authored-By: Claude Opus 4.6
---
 .github/workflows/frontier/bench.sh     | 4 ++--
 .github/workflows/frontier/test.sh      | 4 ++--
 .github/workflows/frontier_amd/bench.sh | 4 ++--
 .github/workflows/frontier_amd/test.sh  | 4 ++--
 toolchain/mfc/build.py                  | 2 +-
 5 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/.github/workflows/frontier/bench.sh b/.github/workflows/frontier/bench.sh
index 13bbbddc2c..1aafa19e76 100644
--- a/.github/workflows/frontier/bench.sh
+++ b/.github/workflows/frontier/bench.sh
@@ -35,8 +35,8 @@ while [ $attempt -le $max_attempts ]; do
     fi
 
     if [ $attempt -lt $max_attempts ]; then
-        echo "Build failed on attempt $attempt. Cleaning and retrying in 30s..."
-        ./mfc.sh clean
+        echo "Build failed on attempt $attempt. Cleaning source targets and retrying in 30s..."
+        ./mfc.sh clean -t pre_process simulation post_process syscheck
         sleep 30
     else
         echo "Build failed after $max_attempts attempts."
diff --git a/.github/workflows/frontier/test.sh b/.github/workflows/frontier/test.sh
index 5ba2d18c1a..3240ced6b8 100644
--- a/.github/workflows/frontier/test.sh
+++ b/.github/workflows/frontier/test.sh
@@ -26,8 +26,8 @@ while [ $attempt -le $max_attempts ]; do
     fi
 
     if [ $attempt -lt $max_attempts ]; then
-        echo "Build failed on attempt $attempt. Cleaning and retrying in 30s..."
-        ./mfc.sh clean
+        echo "Build failed on attempt $attempt. Cleaning source targets and retrying in 30s..."
+        ./mfc.sh clean -t pre_process simulation post_process syscheck
         sleep 30
     else
         echo "Build failed after $max_attempts attempts."
diff --git a/.github/workflows/frontier_amd/bench.sh b/.github/workflows/frontier_amd/bench.sh
index fd263dd800..173251c679 100644
--- a/.github/workflows/frontier_amd/bench.sh
+++ b/.github/workflows/frontier_amd/bench.sh
@@ -35,8 +35,8 @@ while [ $attempt -le $max_attempts ]; do
     fi
 
     if [ $attempt -lt $max_attempts ]; then
-        echo "Build failed on attempt $attempt. Cleaning and retrying in 30s..."
-        ./mfc.sh clean
+        echo "Build failed on attempt $attempt. Cleaning source targets and retrying in 30s..."
+        ./mfc.sh clean -t pre_process simulation post_process syscheck
         sleep 30
     else
         echo "Build failed after $max_attempts attempts."
diff --git a/.github/workflows/frontier_amd/test.sh b/.github/workflows/frontier_amd/test.sh
index a3f9bd21fa..afc944b8d3 100644
--- a/.github/workflows/frontier_amd/test.sh
+++ b/.github/workflows/frontier_amd/test.sh
@@ -26,8 +26,8 @@ while [ $attempt -le $max_attempts ]; do
     fi
 
     if [ $attempt -lt $max_attempts ]; then
-        echo "Build failed on attempt $attempt. Cleaning and retrying in 30s..."
-        ./mfc.sh clean
+        echo "Build failed on attempt $attempt. Cleaning source targets and retrying in 30s..."
+        ./mfc.sh clean -t pre_process simulation post_process syscheck
         sleep 30
     else
         echo "Build failed after $max_attempts attempts."
diff --git a/toolchain/mfc/build.py b/toolchain/mfc/build.py
index 8ed5e693b0..eca3a4f2c0 100644
--- a/toolchain/mfc/build.py
+++ b/toolchain/mfc/build.py
@@ -569,7 +569,7 @@ def __build_target(target: typing.Union[MFCTarget, str], case: input.MFCInputFil
     # Dependencies are pinned to fixed versions. If already configured
     # (built & installed by a prior --deps-only step), skip entirely
     # to avoid re-entering the superbuild (which may access the network).
-    if target.isDependency and target.is_configured(case):
+    if target.isDependency and target.is_configured(case) and os.path.isdir(target.get_install_dirpath(case)):
         return
 
     for dep in target.requires.compute():

From e71b8cc3a25c65933a9cb2e51669202257a8aff3 Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson
Date: Fri, 13 Feb 2026 09:20:23 -0500
Subject: [PATCH 4/4] Fix Phoenix test/bench scripts: preserve deps on retry,
 add --no-gpu for CPU

Same fixes previously applied to Frontier scripts but missed for
Phoenix:
- Use targeted clean (-t) to avoid wiping pre-fetched deps on retry
- Add --no-gpu for CPU test jobs (final invocation shown below)
Co-Authored-By: Claude Opus 4.6
---
 .github/workflows/phoenix/bench.sh | 4 ++--
 .github/workflows/phoenix/test.sh  | 7 +++++--
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/phoenix/bench.sh b/.github/workflows/phoenix/bench.sh
index 447bd710d9..b3e26178b9 100644
--- a/.github/workflows/phoenix/bench.sh
+++ b/.github/workflows/phoenix/bench.sh
@@ -48,8 +48,8 @@ while [ $attempt -le $max_attempts ]; do
     fi
 
     if [ $attempt -lt $max_attempts ]; then
-        echo "Build failed on attempt $attempt. Cleaning and retrying in 30s..."
-        ./mfc.sh clean
+        echo "Build failed on attempt $attempt. Cleaning source targets and retrying in 30s..."
+        ./mfc.sh clean -t pre_process simulation post_process syscheck
         sleep 30
     else
         echo "Build failed after $max_attempts attempts."
diff --git a/.github/workflows/phoenix/test.sh b/.github/workflows/phoenix/test.sh
index 3920d96ed7..1706713df3 100644
--- a/.github/workflows/phoenix/test.sh
+++ b/.github/workflows/phoenix/test.sh
@@ -10,6 +10,7 @@ if [ "$job_device" = "gpu" ]; then
     fi
 fi
 
+# Build source code on compute node (deps already fetched on login node)
 max_attempts=3
 attempt=1
 while [ $attempt -le $max_attempts ]; do
@@ -20,8 +21,8 @@ while [ $attempt -le $max_attempts ]; do
     fi
 
     if [ $attempt -lt $max_attempts ]; then
-        echo "Build failed on attempt $attempt. Cleaning and retrying in 30s..."
-        ./mfc.sh clean
+        echo "Build failed on attempt $attempt. Cleaning source targets and retrying in 30s..."
+        ./mfc.sh clean -t pre_process simulation post_process syscheck
         sleep 30
     else
         echo "Build failed after $max_attempts attempts."
@@ -37,6 +38,8 @@ if [ "$job_device" = "gpu" ]; then
     gpu_ids=$(seq -s ' ' 0 $(($gpu_count-1)))  # 0,1,2,...,gpu_count-1
     device_opts="-g $gpu_ids"
     n_test_threads=`expr $gpu_count \* 2`
+else
+    device_opts="--no-gpu"
 fi
 
 ./mfc.sh test -v --max-attempts 3 -a -j $n_test_threads $device_opts -- -c phoenix