diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml
index 6279f5f578..ac2f026dab 100644
--- a/.github/workflows/bench.yml
+++ b/.github/workflows/bench.yml
@@ -91,7 +91,7 @@ jobs:
             flag: p
             device: cpu
             interface: none
-            build_script: ""
+            build_script: "bash .github/workflows/phoenix/build.sh cpu none"
           - cluster: phoenix
             name: Georgia Tech | Phoenix (NVHPC)
             group: phoenix
@@ -99,7 +99,7 @@
             flag: p
             device: gpu
             interface: acc
-            build_script: ""
+            build_script: "bash .github/workflows/phoenix/build.sh gpu acc"
           - cluster: phoenix
             name: Georgia Tech | Phoenix (NVHPC)
             group: phoenix
@@ -107,7 +107,7 @@
             flag: p
             device: gpu
             interface: omp
-            build_script: ""
+            build_script: "bash .github/workflows/phoenix/build.sh gpu omp"
           - cluster: frontier
             name: Oak Ridge | Frontier (CCE)
             group: phoenix
@@ -115,7 +115,7 @@
             flag: f
             device: gpu
             interface: acc
-            build_script: "bash .github/workflows/frontier/build.sh gpu acc bench"
+            build_script: "bash .github/workflows/frontier/build.sh gpu acc"
           - cluster: frontier
             name: Oak Ridge | Frontier (CCE)
             group: phoenix
@@ -123,7 +123,7 @@
             flag: f
             device: gpu
             interface: omp
-            build_script: "bash .github/workflows/frontier/build.sh gpu omp bench"
+            build_script: "bash .github/workflows/frontier/build.sh gpu omp"
           - cluster: frontier_amd
             name: Oak Ridge | Frontier (AMD)
             group: phoenix
@@ -131,7 +131,7 @@
             flag: famd
             device: gpu
             interface: omp
-            build_script: "bash .github/workflows/frontier_amd/build.sh gpu omp bench"
+            build_script: "bash .github/workflows/frontier_amd/build.sh gpu omp"
     runs-on:
       group: ${{ matrix.group }}
       labels: ${{ matrix.labels }}
@@ -153,9 +153,8 @@
           ref: master
           path: master

-      - name: Setup & Build
-        if: matrix.build_script != ''
-        run: |
+      - name: Fetch Dependencies
+        run: |
           (cd pr && ${{ matrix.build_script }}) &
           (cd master && ${{ matrix.build_script }}) &
           wait %1 && wait %2
diff --git a/.github/workflows/frontier/bench.sh b/.github/workflows/frontier/bench.sh
index 35b4c5950e..1aafa19e76 100644
--- a/.github/workflows/frontier/bench.sh
+++ b/.github/workflows/frontier/bench.sh
@@ -1,20 +1,50 @@
 #!/bin/bash

 n_ranks=12
+build_opts=""
 device_opts=""
 if [ "$job_device" = "gpu" ]; then
     gpus=$(rocm-smi --showid | awk '{print $1}' | grep -Eo '[0-9]+' | uniq | tr '\n' ' ')
     n_ranks=$(echo "$gpus" | wc -w)                                    # number of GPUs on node
     gpu_ids=$(echo "$gpus" | tr ' ' '\n' | tr '\n' ' ' | sed 's/ $//') # GPU IDs from rocm-smi
-    device_opts+="--gpu"
+    build_opts+="--gpu"
     if [ "$job_interface" = "acc" ]; then
-        device_opts+=" acc"
+        build_opts+=" acc"
     elif [ "$job_interface" = "omp" ]; then
-        device_opts+=" mp"
+        build_opts+=" mp"
     fi
-    device_opts+=" -g $gpu_ids"
+    device_opts="$build_opts -g $gpu_ids"
 fi

+# Build case-optimized binaries on compute node (deps already fetched on login node)
+max_attempts=3
+attempt=1
+while [ $attempt -le $max_attempts ]; do
+    echo "Build attempt $attempt of $max_attempts..."
+    build_cmd_ok=true
+    for dir in benchmarks/*/; do
+        if ! ./mfc.sh run -v "$dir/case.py" --case-optimization -j 8 --dry-run $build_opts; then
+            build_cmd_ok=false
+            break
+        fi
+    done
+
+    if [ "$build_cmd_ok" = true ]; then
+        echo "Build succeeded on attempt $attempt."
+        break
+    fi
+
+    if [ $attempt -lt $max_attempts ]; then
+        echo "Build failed on attempt $attempt. Cleaning source targets and retrying in 30s..."
+        ./mfc.sh clean -t pre_process simulation post_process syscheck
+        sleep 30
+    else
+        echo "Build failed after $max_attempts attempts."
+        exit 1
+    fi
+    attempt=$((attempt + 1))
+done
+
 if [ "$job_device" = "gpu" ]; then
     ./mfc.sh bench --mem 12 -j $n_ranks -o "$job_slug.yaml" -- -c frontier $device_opts -n $n_ranks
 else
diff --git a/.github/workflows/frontier/build.sh b/.github/workflows/frontier/build.sh
index 18cddc96ca..84b67020ca 100644
--- a/.github/workflows/frontier/build.sh
+++ b/.github/workflows/frontier/build.sh
@@ -1,11 +1,13 @@
 #!/bin/bash

+# Fetch dependencies on login node (internet access).
+# Source code is built on compute nodes via test.sh / bench.sh.
+
 # Ignore SIGHUP to survive login node session drops
 trap '' HUP

 job_device=$1
 job_interface=$2
-run_bench=$3
 build_opts=""
 if [ "$job_device" = "gpu" ]; then
     build_opts+="--gpu"
@@ -18,39 +20,4 @@ fi

 . ./mfc.sh load -c f -m g

-max_attempts=3
-attempt=1
-while [ $attempt -le $max_attempts ]; do
-    echo "Build attempt $attempt of $max_attempts..."
-    if [ "$run_bench" == "bench" ]; then
-        build_cmd_ok=true
-        for dir in benchmarks/*/; do
-            dirname=$(basename "$dir")
-            if ! ./mfc.sh run -v "$dir/case.py" --case-optimization -j 8 --dry-run $build_opts; then
-                build_cmd_ok=false
-                break
-            fi
-        done
-    else
-        if ./mfc.sh test -v -a --dry-run --rdma-mpi -j 8 $build_opts; then
-            build_cmd_ok=true
-        else
-            build_cmd_ok=false
-        fi
-    fi
-
-    if [ "$build_cmd_ok" = true ]; then
-        echo "Build succeeded on attempt $attempt."
-        exit 0
-    fi
-
-    if [ $attempt -lt $max_attempts ]; then
-        echo "Build failed on attempt $attempt. Cleaning and retrying in 30s..."
-        ./mfc.sh clean
-        sleep 30
-    fi
-    attempt=$((attempt + 1))
-done
-
-echo "Build failed after $max_attempts attempts."
-exit 1
+./mfc.sh build --deps-only -j 8 $build_opts
diff --git a/.github/workflows/frontier/test.sh b/.github/workflows/frontier/test.sh
index 17fbbaf8e5..3240ced6b8 100644
--- a/.github/workflows/frontier/test.sh
+++ b/.github/workflows/frontier/test.sh
@@ -11,8 +11,32 @@ if [ "$job_device" = "gpu" ]; then
     elif [ "$job_interface" = "omp" ]; then
         device_opts+=" mp"
     fi
+else
+    device_opts+=" --no-gpu"
 fi

+# Build source code on compute node (deps already fetched on login node)
+max_attempts=3
+attempt=1
+while [ $attempt -le $max_attempts ]; do
+    echo "Build attempt $attempt of $max_attempts..."
+    if ./mfc.sh test -v -a --dry-run --rdma-mpi -j 8 $device_opts; then
+        echo "Build succeeded on attempt $attempt."
+        break
+    fi
+
+    if [ $attempt -lt $max_attempts ]; then
+        echo "Build failed on attempt $attempt. Cleaning source targets and retrying in 30s..."
+        ./mfc.sh clean -t pre_process simulation post_process syscheck
+        sleep 30
+    else
+        echo "Build failed after $max_attempts attempts."
+        exit 1
+    fi
+    attempt=$((attempt + 1))
+done
+
+# Run tests
 if [ "$job_device" = "gpu" ]; then
     ./mfc.sh test -v -a --rdma-mpi --max-attempts 3 -j $ngpus $device_opts -- -c frontier
 else
diff --git a/.github/workflows/frontier_amd/bench.sh b/.github/workflows/frontier_amd/bench.sh
index 6e01687e79..173251c679 100644
--- a/.github/workflows/frontier_amd/bench.sh
+++ b/.github/workflows/frontier_amd/bench.sh
@@ -1,20 +1,50 @@
 #!/bin/bash

 n_ranks=12
+build_opts=""
 device_opts=""
 if [ "$job_device" = "gpu" ]; then
     gpus=$(rocm-smi --showid | awk '{print $1}' | grep -Eo '[0-9]+' | uniq | tr '\n' ' ')
     n_ranks=$(echo "$gpus" | wc -w)                                    # number of GPUs on node
     gpu_ids=$(echo "$gpus" | tr ' ' '\n' | tr '\n' ' ' | sed 's/ $//') # GPU IDs from rocm-smi
-    device_opts+="--gpu"
+    build_opts+="--gpu"
     if [ "$job_interface" = "acc" ]; then
-        device_opts+=" acc"
+        build_opts+=" acc"
     elif [ "$job_interface" = "omp" ]; then
-        device_opts+=" mp"
+        build_opts+=" mp"
     fi
-    device_opts+=" -g $gpu_ids"
+    device_opts="$build_opts -g $gpu_ids"
 fi

+# Build case-optimized binaries on compute node (deps already fetched on login node)
+max_attempts=3
+attempt=1
+while [ $attempt -le $max_attempts ]; do
+    echo "Build attempt $attempt of $max_attempts..."
+    build_cmd_ok=true
+    for dir in benchmarks/*/; do
+        if ! ./mfc.sh run -v "$dir/case.py" --case-optimization -j 8 --dry-run $build_opts; then
+            build_cmd_ok=false
+            break
+        fi
+    done
+
+    if [ "$build_cmd_ok" = true ]; then
+        echo "Build succeeded on attempt $attempt."
+        break
+    fi
+
+    if [ $attempt -lt $max_attempts ]; then
+        echo "Build failed on attempt $attempt. Cleaning source targets and retrying in 30s..."
+        ./mfc.sh clean -t pre_process simulation post_process syscheck
+        sleep 30
+    else
+        echo "Build failed after $max_attempts attempts."
+        exit 1
+    fi
+    attempt=$((attempt + 1))
+done
+
 if [ "$job_device" = "gpu" ]; then
     ./mfc.sh bench --mem 12 -j $n_ranks -o "$job_slug.yaml" -- -c frontier_amd $device_opts -n $n_ranks
 else
diff --git a/.github/workflows/frontier_amd/build.sh b/.github/workflows/frontier_amd/build.sh
index 56c47d8ff4..1b120ae6f7 100644
--- a/.github/workflows/frontier_amd/build.sh
+++ b/.github/workflows/frontier_amd/build.sh
@@ -1,11 +1,13 @@
 #!/bin/bash

+# Fetch dependencies on login node (internet access).
+# Source code is built on compute nodes via test.sh / bench.sh.
+
 # Ignore SIGHUP to survive login node session drops
 trap '' HUP

 job_device=$1
 job_interface=$2
-run_bench=$3
 build_opts=""
 if [ "$job_device" = "gpu" ]; then
     build_opts+="--gpu"
@@ -18,39 +20,4 @@ fi

 . ./mfc.sh load -c famd -m g

-max_attempts=3
-attempt=1
-while [ $attempt -le $max_attempts ]; do
-    echo "Build attempt $attempt of $max_attempts..."
-    if [ "$run_bench" == "bench" ]; then
-        build_cmd_ok=true
-        for dir in benchmarks/*/; do
-            dirname=$(basename "$dir")
-            if ! ./mfc.sh run -v "$dir/case.py" --case-optimization -j 8 --dry-run $build_opts; then
-                build_cmd_ok=false
-                break
-            fi
-        done
-    else
-        if ./mfc.sh test -v -a --dry-run -j 8 $build_opts; then
-            build_cmd_ok=true
-        else
-            build_cmd_ok=false
-        fi
-    fi
-
-    if [ "$build_cmd_ok" = true ]; then
-        echo "Build succeeded on attempt $attempt."
-        exit 0
-    fi
-
-    if [ $attempt -lt $max_attempts ]; then
-        echo "Build failed on attempt $attempt. Cleaning and retrying in 30s..."
-        ./mfc.sh clean
-        sleep 30
-    fi
-    attempt=$((attempt + 1))
-done
-
-echo "Build failed after $max_attempts attempts."
-exit 1
+./mfc.sh build --deps-only -j 8 $build_opts
diff --git a/.github/workflows/frontier_amd/test.sh b/.github/workflows/frontier_amd/test.sh
index ff65aa2b0e..afc944b8d3 100644
--- a/.github/workflows/frontier_amd/test.sh
+++ b/.github/workflows/frontier_amd/test.sh
@@ -11,8 +11,32 @@ if [ "$job_device" = "gpu" ]; then
     elif [ "$job_interface" = "omp" ]; then
         device_opts+=" mp"
     fi
+else
+    device_opts+=" --no-gpu"
 fi

+# Build source code on compute node (deps already fetched on login node)
+max_attempts=3
+attempt=1
+while [ $attempt -le $max_attempts ]; do
+    echo "Build attempt $attempt of $max_attempts..."
+    if ./mfc.sh test -v -a --dry-run -j 8 $device_opts; then
+        echo "Build succeeded on attempt $attempt."
+        break
+    fi
+
+    if [ $attempt -lt $max_attempts ]; then
+        echo "Build failed on attempt $attempt. Cleaning source targets and retrying in 30s..."
+        ./mfc.sh clean -t pre_process simulation post_process syscheck
+        sleep 30
+    else
+        echo "Build failed after $max_attempts attempts."
+        exit 1
+    fi
+    attempt=$((attempt + 1))
+done
+
+# Run tests
 if [ "$job_device" = "gpu" ]; then
     ./mfc.sh test -v -a --max-attempts 3 -j $ngpus $device_opts -- -c frontier_amd
 else
diff --git a/.github/workflows/phoenix/bench.sh b/.github/workflows/phoenix/bench.sh
index 447bd710d9..b3e26178b9 100644
--- a/.github/workflows/phoenix/bench.sh
+++ b/.github/workflows/phoenix/bench.sh
@@ -48,8 +48,8 @@ while [ $attempt -le $max_attempts ]; do
     fi

     if [ $attempt -lt $max_attempts ]; then
-        echo "Build failed on attempt $attempt. Cleaning and retrying in 30s..."
-        ./mfc.sh clean
+        echo "Build failed on attempt $attempt. Cleaning source targets and retrying in 30s..."
+        ./mfc.sh clean -t pre_process simulation post_process syscheck
         sleep 30
     else
         echo "Build failed after $max_attempts attempts."
diff --git a/.github/workflows/phoenix/build.sh b/.github/workflows/phoenix/build.sh
new file mode 100644
index 0000000000..8a850fca6f
--- /dev/null
+++ b/.github/workflows/phoenix/build.sh
@@ -0,0 +1,23 @@
+#!/bin/bash
+
+# Fetch dependencies on login node (internet access).
+# Source code is built on compute nodes via test.sh / bench.sh.
+
+# Ignore SIGHUP to survive login node session drops
+trap '' HUP
+
+job_device=$1
+job_interface=$2
+build_opts=""
+if [ "$job_device" = "gpu" ]; then
+    build_opts+="--gpu"
+    if [ "$job_interface" = "acc" ]; then
+        build_opts+=" acc"
+    elif [ "$job_interface" = "omp" ]; then
+        build_opts+=" mp"
+    fi
+fi
+
+. ./mfc.sh load -c p -m $job_device
+
+./mfc.sh build --deps-only -j $(nproc) $build_opts
diff --git a/.github/workflows/phoenix/test.sh b/.github/workflows/phoenix/test.sh
index 3920d96ed7..1706713df3 100644
--- a/.github/workflows/phoenix/test.sh
+++ b/.github/workflows/phoenix/test.sh
@@ -10,6 +10,7 @@ if [ "$job_device" = "gpu" ]; then
     fi
 fi

+# Build source code on compute node (deps already fetched on login node)
 max_attempts=3
 attempt=1
 while [ $attempt -le $max_attempts ]; do
@@ -20,8 +21,8 @@ while [ $attempt -le $max_attempts ]; do
     fi

     if [ $attempt -lt $max_attempts ]; then
-        echo "Build failed on attempt $attempt. Cleaning and retrying in 30s..."
-        ./mfc.sh clean
+        echo "Build failed on attempt $attempt. Cleaning source targets and retrying in 30s..."
+        ./mfc.sh clean -t pre_process simulation post_process syscheck
         sleep 30
     else
         echo "Build failed after $max_attempts attempts."
@@ -37,6 +38,8 @@ if [ "$job_device" = "gpu" ]; then
     gpu_ids=$(seq -s ' ' 0 $(($gpu_count-1))) # 0,1,2,...,gpu_count-1
     device_opts="-g $gpu_ids"
     n_test_threads=`expr $gpu_count \* 2`
+else
+    device_opts="--no-gpu"
 fi

 ./mfc.sh test -v --max-attempts 3 -a -j $n_test_threads $device_opts -- -c phoenix
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 0be51076ec..d79e61fecd 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -149,7 +149,7 @@ jobs:
     strategy:
       matrix:
        include:
-          # Phoenix (GT) — build+test combined in SLURM job
+          # Phoenix (GT) — deps on login node, build+test in SLURM job
          - runner: 'gt'
            cluster: 'phoenix'
            cluster_name: 'Georgia Tech | Phoenix'
@@ -165,7 +165,7 @@
            cluster_name: 'Georgia Tech | Phoenix'
            device: 'cpu'
            interface: 'none'
-          # Frontier (ORNL) — build on login node, test via SLURM
+          # Frontier (ORNL) — deps on login node, build+test via SLURM
          - runner: 'frontier'
            cluster: 'frontier'
            cluster_name: 'Oak Ridge | Frontier'
@@ -181,7 +181,7 @@
            cluster_name: 'Oak Ridge | Frontier'
            device: 'cpu'
            interface: 'none'
-          # Frontier AMD — build on login node, test via SLURM
+          # Frontier AMD — deps on login node, build+test via SLURM
          - runner: 'frontier'
            cluster: 'frontier_amd'
            cluster_name: 'Oak Ridge | Frontier (AMD)'
@@ -203,11 +203,10 @@ jobs:
      - name: Clone
        uses: actions/checkout@v4

-      - name: Build
-        if: matrix.cluster != 'phoenix'
+      - name: Fetch Dependencies
        run: bash .github/workflows/${{ matrix.cluster }}/build.sh ${{ matrix.device }} ${{ matrix.interface }}

-      - name: Test
+      - name: Build & Test
        run: bash .github/workflows/${{ matrix.cluster }}/submit.sh .github/workflows/${{ matrix.cluster }}/test.sh ${{ matrix.device }} ${{ matrix.interface }}

      - name: Print Logs
diff --git a/toolchain/mfc/build.py b/toolchain/mfc/build.py
index 6430f7ad35..eca3a4f2c0 100644
--- a/toolchain/mfc/build.py
+++ b/toolchain/mfc/build.py
@@ -566,6 +566,12 @@ def __build_target(target: typing.Union[MFCTarget, str], case: input.MFCInputFil

     history.add(target.name)

+    # Dependencies are pinned to fixed versions. If already configured
+    # (built & installed by a prior --deps-only step), skip entirely
+    # to avoid re-entering the superbuild (which may access the network).
+    if target.isDependency and target.is_configured(case) and os.path.isdir(target.get_install_dirpath(case)):
+        return
+
     for dep in target.requires.compute():
         # If we have already built and installed this target,
         # do not do so again. This can be inferred by whether
@@ -611,6 +617,29 @@ def build(targets = None, case: input.MFCInputFile = None, history: typing.Set[s
     case = case or input.load(ARG("input"), ARG("--"), {})
     case.validate_params()

+    if ARG("deps_only", False):
+        all_deps = set()
+        for target in targets:
+            if target.isDependency:
+                all_deps.add(target)
+            for dep in target.requires.compute():
+                all_deps.add(dep)
+
+        sorted_deps = sorted(all_deps, key=lambda t: t.name)
+
+        if len(history) == 0:
+            cons.print(f"[bold]Fetch Dependencies | {format_list_to_string([d.name for d in sorted_deps], 'magenta', 'None')}[/bold]")
+            cons.print(no_indent=True)
+
+        if not sorted_deps:
+            cons.print("[yellow]No dependencies to build for the requested targets.[/yellow]")
+            return
+
+        for dep in sorted_deps:
+            __build_target(dep, case, history)
+
+        return
+
     if len(history) == 0:
         cons.print(__generate_header(case, targets))
         cons.print(no_indent=True)
diff --git a/toolchain/mfc/cli/commands.py b/toolchain/mfc/cli/commands.py
index 8ad8c4bd07..bb9fddb76d 100644
--- a/toolchain/mfc/cli/commands.py
+++ b/toolchain/mfc/cli/commands.py
@@ -154,6 +154,13 @@
             default=False,
             dest="case_optimization",
         ),
+        Argument(
+            name="deps-only",
+            help="Only fetch and build dependencies; do not build MFC targets.",
+            action=ArgAction.STORE_TRUE,
+            default=False,
+            dest="deps_only",
+        ),
     ],
     examples=[
         Example("./mfc.sh build", "Build all default targets (CPU)"),
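
Taken together, these changes split each cluster's CI into two phases: dependencies are fetched and built on the login node, which has internet access, and MFC itself is built and tested inside the SLURM allocation, which typically does not. Below is a minimal, illustrative sketch of the resulting flow for the Frontier GPU/OpenACC job, using the two commands wired up in test.yml above; the internals of submit.sh are per-cluster and not shown here.

    #!/bin/bash
    # Sketch only: mirrors the two-phase flow from test.yml.

    # Phase 1 (login node): fetch and build the pinned third-party
    # dependencies via `./mfc.sh build --deps-only`; no MFC source
    # targets are compiled here.
    bash .github/workflows/frontier/build.sh gpu acc

    # Phase 2 (compute node, via SLURM): test.sh builds the source
    # targets with up to 3 attempts, cleaning only pre_process,
    # simulation, post_process, and syscheck between attempts, then
    # runs the test suite.
    bash .github/workflows/frontier/submit.sh \
        .github/workflows/frontier/test.sh gpu acc

Because the --deps-only step is the only one that needs network access, a transient compute-node build failure can be retried without re-fetching anything.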