diff --git a/cuda_bindings/cuda/bindings/_test_helpers/arch_check.py b/cuda_bindings/cuda/bindings/_test_helpers/arch_check.py index 9b1e5e23a7..ea2c03a223 100644 --- a/cuda_bindings/cuda/bindings/_test_helpers/arch_check.py +++ b/cuda_bindings/cuda/bindings/_test_helpers/arch_check.py @@ -2,12 +2,18 @@ # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE +import os from contextlib import contextmanager from functools import cache import pytest from cuda.bindings import nvml +from cuda.pathfinder import DynamicLibNotFoundError + + +def _running_in_ci() -> bool: + return os.environ.get("CI") is not None @cache @@ -16,16 +22,27 @@ def hardware_supports_nvml(): Tries to call the simplest NVML API possible to see if just the basics works. If not we are probably on one of the platforms where NVML is not supported at all (e.g. Jetson Orin). + + Runtime-load/init failures are treated as "unsupported" on local/dev + machines so NVML test modules skip cleanly. In CI we re-raise those + failures to avoid masking real infrastructure regressions. """ - nvml.init_v2() + initialized = False try: + nvml.init_v2() + initialized = True nvml.system_get_driver_branch() except (nvml.NotSupportedError, nvml.UnknownError): return False + except (DynamicLibNotFoundError, nvml.NvmlError): + if _running_in_ci(): + raise + return False else: return True finally: - nvml.shutdown() + if initialized: + nvml.shutdown() @contextmanager diff --git a/cuda_bindings/tests/conftest.py b/cuda_bindings/tests/conftest.py index f30500c134..6a4c4fa534 100644 --- a/cuda_bindings/tests/conftest.py +++ b/cuda_bindings/tests/conftest.py @@ -8,6 +8,7 @@ import pytest import cuda.bindings.driver as cuda +from cuda.bindings._test_helpers.arch_check import hardware_supports_nvml # Import shared test helpers for tests across subprojects. # PLEASE KEEP IN SYNC with copies in other conftest.py in this repo. 
@@ -46,3 +47,9 @@ def ctx(device): yield ctx (err,) = cuda.cuCtxDestroy(ctx) assert err == cuda.CUresult.CUDA_SUCCESS + + +@pytest.fixture(scope="session") +def require_nvml_runtime_or_skip_local(): + if not hardware_supports_nvml(): + pytest.skip("NVML runtime is unavailable on this system") diff --git a/cuda_bindings/tests/nvml/conftest.py b/cuda_bindings/tests/nvml/conftest.py index f350610c7c..acc791df84 100644 --- a/cuda_bindings/tests/nvml/conftest.py +++ b/cuda_bindings/tests/nvml/conftest.py @@ -8,6 +8,8 @@ from cuda.bindings import nvml from cuda.bindings._test_helpers.arch_check import unsupported_before # noqa: F401 +# NOTE: pytest ignores `pytestmark` in conftest.py; NVML gating for this directory is enforced by the autouse device_info fixture below. + class NVMLInitializer: def __init__(self): @@ -27,7 +29,8 @@ def nvml_init(): @pytest.fixture(scope="session", autouse=True) -def device_info(): +def device_info(request): + request.getfixturevalue("require_nvml_runtime_or_skip_local") dev_count = None bus_id_to_board_details = {} diff --git a/cuda_bindings/tests/nvml/test_device.py b/cuda_bindings/tests/nvml/test_device.py index 7344a93efe..85f240906c 100644 --- a/cuda_bindings/tests/nvml/test_device.py +++ b/cuda_bindings/tests/nvml/test_device.py @@ -25,6 +25,13 @@ def cuda_version_less_than(target): return get_cuda_version() < target +@pytest.fixture(scope="module") +def require_cuda_13_1_or_skip(request): + request.getfixturevalue("require_nvml_runtime_or_skip_local") + if cuda_version_less_than(13010): + pytest.skip("Introduced in 13.1") + + def test_device_capabilities(all_devices): for device in all_devices: capabilities = nvml.device_get_capabilities(device) @@ -94,7 +101,7 @@ def test_device_get_performance_modes(all_devices): assert isinstance(modes, str) -@pytest.mark.skipif(cuda_version_less_than(13010), reason="Introduced in 13.1") +@pytest.mark.usefixtures("require_cuda_13_1_or_skip") def test_device_get_unrepairable_memory_flag(all_devices): for device in all_devices: with unsupported_before(device,
None): @@ -109,7 +116,7 @@ def test_device_vgpu_get_heterogeneous_mode(all_devices): assert isinstance(mode, int) -@pytest.mark.skipif(cuda_version_less_than(13010), reason="Introduced in 13.1") +@pytest.mark.usefixtures("require_cuda_13_1_or_skip") def test_read_prm_counters(all_devices): for device in all_devices: counters = nvml.PRMCounter_v1(5) diff --git a/cuda_bindings/tests/test_arch_check.py b/cuda_bindings/tests/test_arch_check.py new file mode 100644 index 0000000000..5421967757 --- /dev/null +++ b/cuda_bindings/tests/test_arch_check.py @@ -0,0 +1,91 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE + + +import pytest + +from cuda.bindings import nvml +from cuda.bindings._test_helpers import arch_check +from cuda.pathfinder import DynamicLibNotFoundError + + +def _raise(exc): + def _inner(): + raise exc + + return _inner + + +def _make_not_supported_error(): + err = nvml.NotSupportedError.__new__(nvml.NotSupportedError) + Exception.__init__(err, "Not supported") + return err + + +def _make_lib_rm_version_mismatch_error(): + err = nvml.LibRmVersionMismatchError.__new__(nvml.LibRmVersionMismatchError) + Exception.__init__(err, "Driver/library version mismatch") + return err + + +def _make_dynamic_lib_not_found_error(): + return DynamicLibNotFoundError("Failure finding libnvml.so") + + +@pytest.fixture(autouse=True) +def clear_hardware_supports_nvml_cache(): + arch_check.hardware_supports_nvml.cache_clear() + yield + arch_check.hardware_supports_nvml.cache_clear() + + +def test_hardware_supports_nvml_returns_true_when_probe_succeeds(monkeypatch): + calls = [] + + monkeypatch.setattr(arch_check.nvml, "init_v2", lambda: calls.append("init")) + monkeypatch.setattr(arch_check.nvml, "system_get_driver_branch", lambda: "560") + monkeypatch.setattr(arch_check.nvml, "shutdown", lambda: calls.append("shutdown")) + + assert 
arch_check.hardware_supports_nvml() is True + assert calls == ["init", "shutdown"] + + +def test_hardware_supports_nvml_returns_false_for_not_supported(monkeypatch): + calls = [] + + monkeypatch.setattr(arch_check.nvml, "init_v2", lambda: calls.append("init")) + monkeypatch.setattr(arch_check.nvml, "system_get_driver_branch", _raise(_make_not_supported_error())) + monkeypatch.setattr(arch_check.nvml, "shutdown", lambda: calls.append("shutdown")) + + assert arch_check.hardware_supports_nvml() is False + assert calls == ["init", "shutdown"] + + +@pytest.mark.parametrize( + "error_factory", + [ + _make_lib_rm_version_mismatch_error, + _make_dynamic_lib_not_found_error, + ], +) +def test_hardware_supports_nvml_runtime_errors_skip_locally(monkeypatch, error_factory): + monkeypatch.delenv("CI", raising=False) + monkeypatch.setattr(arch_check.nvml, "init_v2", _raise(error_factory())) + + assert arch_check.hardware_supports_nvml() is False + + +@pytest.mark.parametrize( + "error_factory", + [ + _make_lib_rm_version_mismatch_error, + _make_dynamic_lib_not_found_error, + ], +) +def test_hardware_supports_nvml_runtime_errors_fail_in_ci(monkeypatch, error_factory): + err = error_factory() + monkeypatch.setenv("CI", "1") + monkeypatch.setattr(arch_check.nvml, "init_v2", _raise(err)) + + with pytest.raises(type(err)): + arch_check.hardware_supports_nvml() diff --git a/cuda_bindings/tests/test_cuda.py b/cuda_bindings/tests/test_cuda.py index e056e999f3..1fcebc4e89 100644 --- a/cuda_bindings/tests/test_cuda.py +++ b/cuda_bindings/tests/test_cuda.py @@ -552,6 +552,7 @@ def test_get_error_name_and_string(): assert s == b"CUDA_ERROR_INVALID_DEVICE" +@pytest.mark.usefixtures("require_nvml_runtime_or_skip_local") @pytest.mark.skipif(not callableBinary("nvidia-smi"), reason="Binary existence needed") def test_device_get_name(device): # TODO: Refactor this test once we have nvml bindings to avoid the use of subprocess diff --git a/cuda_core/tests/conftest.py b/cuda_core/tests/conftest.py 
index b771d04276..729f2b0167 100644 --- a/cuda_core/tests/conftest.py +++ b/cuda_core/tests/conftest.py @@ -25,6 +25,7 @@ PinnedMemoryResourceOptions, _device, ) +from cuda.core import system as core_system from cuda.core._utils.cuda_utils import handle_return # Import shared test helpers for tests across subprojects. @@ -59,6 +60,17 @@ def skip_if_managed_memory_unsupported(device): pytest.skip("ManagedMemoryResource requires CUDA 13.0 or later") +@pytest.fixture(scope="session") +def require_nvml_runtime_or_skip_local(): + if not core_system.CUDA_BINDINGS_NVML_IS_COMPATIBLE: + pytest.skip("NVML support requires cuda.bindings version 12.9.6+ or 13.1.2+") + + from cuda.bindings._test_helpers.arch_check import hardware_supports_nvml + + if not hardware_supports_nvml(): + pytest.skip("NVML runtime is unavailable on this system") + + def create_managed_memory_resource_or_skip(*args, **kwargs): try: return ManagedMemoryResource(*args, **kwargs) @@ -209,13 +221,15 @@ def _mempool_device_impl(num): @pytest.fixture -def mempool_device_x2(): +def mempool_device_x2(request): """Fixture that provides two devices if available, otherwise skips test.""" + request.getfixturevalue("require_nvml_runtime_or_skip_local") return _mempool_device_impl(2) @pytest.fixture -def mempool_device_x3(): +def mempool_device_x3(request): """Fixture that provides three devices if available, otherwise skips test.""" + request.getfixturevalue("require_nvml_runtime_or_skip_local") return _mempool_device_impl(3) diff --git a/cuda_core/tests/system/conftest.py b/cuda_core/tests/system/conftest.py index 6cae2991d2..d28e2c98f8 100644 --- a/cuda_core/tests/system/conftest.py +++ b/cuda_core/tests/system/conftest.py @@ -5,21 +5,7 @@ import pytest -from cuda.core import system - -SHOULD_SKIP_NVML_TESTS = not system.CUDA_BINDINGS_NVML_IS_COMPATIBLE - - -if system.CUDA_BINDINGS_NVML_IS_COMPATIBLE: - from cuda.bindings._test_helpers.arch_check import hardware_supports_nvml - - SHOULD_SKIP_NVML_TESTS |= not
hardware_supports_nvml() - - -skip_if_nvml_unsupported = pytest.mark.skipif( - SHOULD_SKIP_NVML_TESTS, - reason="NVML support requires cuda.bindings version 12.9.6+ or 13.1.2+, and hardware that supports NVML", -) +skip_if_nvml_unsupported = pytest.mark.usefixtures("require_nvml_runtime_or_skip_local") def unsupported_before(device, expected_device_arch): diff --git a/cuda_core/tests/system/test_system_device.py b/cuda_core/tests/system/test_system_device.py index 83a71a13ec..36ee307e9d 100644 --- a/cuda_core/tests/system/test_system_device.py +++ b/cuda_core/tests/system/test_system_device.py @@ -25,7 +25,7 @@ @pytest.fixture(autouse=True, scope="module") def check_gpu_available(): - if not system.CUDA_BINDINGS_NVML_IS_COMPATIBLE or system.get_num_devices() == 0: + if system.get_num_devices() == 0: pytest.skip("No GPUs available to run device tests", allow_module_level=True) diff --git a/cuda_core/tests/system/test_system_system.py b/cuda_core/tests/system/test_system_system.py index cb260a0e0a..38a01b4f39 100644 --- a/cuda_core/tests/system/test_system_system.py +++ b/cuda_core/tests/system/test_system_system.py @@ -18,6 +18,8 @@ from .conftest import skip_if_nvml_unsupported +pytestmark = skip_if_nvml_unsupported + def test_driver_version(): driver_version = system.get_driver_version() diff --git a/cuda_core/tests/test_device.py b/cuda_core/tests/test_device.py index 95e47ce8d9..55608ff27a 100644 --- a/cuda_core/tests/test_device.py +++ b/cuda_core/tests/test_device.py @@ -26,21 +26,9 @@ def cuda_version(): return _py_major_ver, _driver_ver +@pytest.mark.usefixtures("require_nvml_runtime_or_skip_local") def test_to_system_device(deinit_cuda): - from cuda.core.system import _system - device = Device() - - if not _system.CUDA_BINDINGS_NVML_IS_COMPATIBLE: - with pytest.raises(RuntimeError): - device.to_system_device() - pytest.skip("NVML support requires cuda.bindings version 12.9.6+ or 13.1.2+") - - from cuda.bindings._test_helpers.arch_check import 
hardware_supports_nvml - - if not hardware_supports_nvml(): - pytest.skip("NVML not supported on this platform") - from cuda.core.system import Device as SystemDevice system_device = device.to_system_device() @@ -87,6 +75,7 @@ def test_device_alloc_zero_bytes(deinit_cuda): assert buffer.device_id == int(device) +@pytest.mark.usefixtures("require_nvml_runtime_or_skip_local") def test_device_id(deinit_cuda): for device in Device.get_all_devices(): device.set_current() diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py index ea2e989e1a..cd82ad8e63 100644 --- a/cuda_core/tests/test_memory.py +++ b/cuda_core/tests/test_memory.py @@ -365,6 +365,7 @@ def test_buffer_external_host(): @pytest.mark.parametrize("change_device", [True, False]) +@pytest.mark.usefixtures("require_nvml_runtime_or_skip_local") def test_buffer_external_device(change_device): n = ccx_system.get_num_devices() if n < 1: @@ -389,6 +390,7 @@ def test_buffer_external_device(change_device): @pytest.mark.parametrize("change_device", [True, False]) +@pytest.mark.usefixtures("require_nvml_runtime_or_skip_local") def test_buffer_external_pinned_alloc(change_device): n = ccx_system.get_num_devices() if n < 1: @@ -414,6 +416,7 @@ def test_buffer_external_pinned_alloc(change_device): @pytest.mark.parametrize("change_device", [True, False]) +@pytest.mark.usefixtures("require_nvml_runtime_or_skip_local") def test_buffer_external_pinned_registered(change_device): n = ccx_system.get_num_devices() if n < 1: @@ -447,6 +450,7 @@ def test_buffer_external_pinned_registered(change_device): @pytest.mark.parametrize("change_device", [True, False]) +@pytest.mark.usefixtures("require_nvml_runtime_or_skip_local") def test_buffer_external_managed(change_device): n = ccx_system.get_num_devices() if n < 1: diff --git a/cuda_core/tests/test_module.py b/cuda_core/tests/test_module.py index e74b1fc672..785f40c603 100644 --- a/cuda_core/tests/test_module.py +++ b/cuda_core/tests/test_module.py @@ -131,6 
+131,7 @@ def test_get_kernel(init_cuda): ("cluster_scheduling_policy_preference", int), ], ) +@pytest.mark.usefixtures("require_nvml_runtime_or_skip_local") def test_read_only_kernel_attributes(get_saxpy_kernel_cubin, attr, expected_type): kernel, _ = get_saxpy_kernel_cubin method = getattr(kernel.attributes, attr) diff --git a/cuda_core/tests/test_object_protocols.py b/cuda_core/tests/test_object_protocols.py index fa35a3887e..cbe2dc04e8 100644 --- a/cuda_core/tests/test_object_protocols.py +++ b/cuda_core/tests/test_object_protocols.py @@ -146,7 +146,8 @@ def sample_program_nvvm(init_cuda): @pytest.fixture -def sample_device_alt(init_cuda): +def sample_device_alt(init_cuda, request): """An alternate Device object (requires multi-GPU).""" + request.getfixturevalue("require_nvml_runtime_or_skip_local") if system.get_num_devices() < 2: pytest.skip("requires multi-GPU")