From 069bdaabb11173dddbd564e7b7998c409255a3ff Mon Sep 17 00:00:00 2001
From: Andy Jost <ajost@nvidia.com>
Date: Thu, 26 Feb 2026 16:32:56 -0800
Subject: [PATCH 1/5] Refactor _MemPool hierarchy: separate shared pool
 machinery from device-specific concerns

Move _dev_id, device_id, and peer_accessible_by from _MemPool into
DeviceMemoryResource. Eliminate _MemPoolOptions and refactor pool
initialization into freestanding cdef functions (MP_init_create_pool,
MP_init_current_pool, MP_raise_release_threshold) for cross-module
visibility. Extract __init__ bodies into inline cdef helpers (_DMR_init,
_PMR_init, _MMR_init) for consistency and shorter class definitions.

Implement device_id as -1 for PinnedMemoryResource and
ManagedMemoryResource since they are not device-bound.

Made-with: Cursor
---
 .../core/_memory/_device_memory_resource.pxd  |   6 +-
 .../core/_memory/_device_memory_resource.pyx  | 158 +++++++++--
 .../core/_memory/_managed_memory_resource.pxd |   2 +-
 .../core/_memory/_managed_memory_resource.pyx |  86 +++---
 cuda_core/cuda/core/_memory/_memory_pool.pxd  |  31 +-
 cuda_core/cuda/core/_memory/_memory_pool.pyx  | 268 +++++-------------
 .../core/_memory/_pinned_memory_resource.pyx  | 195 +++++++------
 cuda_core/tests/test_memory.py                |   2 +-
 8 files changed, 392 insertions(+), 356 deletions(-)

diff --git a/cuda_core/cuda/core/_memory/_device_memory_resource.pxd b/cuda_core/cuda/core/_memory/_device_memory_resource.pxd
index c293d72750..a7f3bfd958 100644
--- a/cuda_core/cuda/core/_memory/_device_memory_resource.pxd
+++ b/cuda_core/cuda/core/_memory/_device_memory_resource.pxd
@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # SPDX-License-Identifier: Apache-2.0
 
@@ -7,7 +7,9 @@ from cuda.core._memory._ipc cimport IPCDataForMR
 
 
 cdef class DeviceMemoryResource(_MemPool):
-    pass
+    cdef:
+        int _dev_id
+        object _peer_accessible_by
 
 
 cpdef DMR_mempool_get_access(DeviceMemoryResource, int)
diff --git a/cuda_core/cuda/core/_memory/_device_memory_resource.pyx b/cuda_core/cuda/core/_memory/_device_memory_resource.pyx
index 78a49d3e44..09aa482234 100644
--- a/cuda_core/cuda/core/_memory/_device_memory_resource.pyx
+++ b/cuda_core/cuda/core/_memory/_device_memory_resource.pyx
@@ -1,17 +1,24 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # SPDX-License-Identifier: Apache-2.0
 
 from __future__ import annotations
 
 from cuda.bindings cimport cydriver
-from cuda.core._memory._memory_pool cimport _MemPool, _MemPoolOptions
+from cuda.core._memory._memory_pool cimport (
+    _MemPool, MP_init_create_pool, MP_raise_release_threshold,
+)
 from cuda.core._memory cimport _ipc
 from cuda.core._memory._ipc cimport IPCAllocationHandle
+from cuda.core._resource_handles cimport (
+    as_cu,
+    get_device_mempool,
+)
 from cuda.core._utils.cuda_utils cimport (
     check_or_create_options,
     HANDLE_RETURN,
 )
+from cpython.mem cimport PyMem_Malloc, PyMem_Free
 
 from dataclasses import dataclass
 import multiprocessing
@@ -19,7 +26,6 @@ import platform  # no-cython-lint
 import uuid
 
 from cuda.core._utils.cuda_utils import check_multiprocessing_start_method
-from cuda.core._resource_handles cimport as_cu
 
 __all__ = ['DeviceMemoryResource', 'DeviceMemoryResourceOptions']
 
@@ -122,27 +128,26 @@ cdef class DeviceMemoryResource(_MemPool):
     associated MMR.
     """
 
-    def __init__(self, device_id: Device | int, options=None):
-        from .._device import Device
-        cdef int dev_id = Device(device_id).device_id
-        cdef DeviceMemoryResourceOptions opts = check_or_create_options(
-            DeviceMemoryResourceOptions, options, "DeviceMemoryResource options",
-            keep_none=True
-        )
-        cdef _MemPoolOptions opts_base = _MemPoolOptions()
-
-        cdef bint ipc_enabled = False
-        if opts:
-            ipc_enabled = opts.ipc_enabled
-            if ipc_enabled and not _ipc.is_supported():
-                raise RuntimeError("IPC is not available on {platform.system()}")
-            opts_base._max_size = opts.max_size
-            opts_base._use_current = False
-        opts_base._ipc_enabled = ipc_enabled
-        opts_base._location = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE
-        opts_base._type = cydriver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_PINNED
+    def __cinit__(self, *args, **kwargs):
+        self._dev_id = cydriver.CU_DEVICE_INVALID
+        self._peer_accessible_by = ()
 
-        super().__init__(dev_id, opts_base)
+    def __init__(self, device_id: Device | int, options=None):
+        _DMR_init(self, device_id, options)
+
+    def __dealloc__(self):
+        try:
+            self.close()  # NOTE(review): Cython advises against calling methods from __dealloc__ - confirm this is safe
+        except Exception:
+            pass  # any exception raised by close() is swallowed here
+
+    def close(self):
+        """Close the memory resource, revoking peer access before destruction."""
+        # nvbug 5698116: clear peer access before pool destruction; also
+        # needed for non-owned (default) pools to undo modifications.
+        if self._peer_accessible_by:
+            _DMR_set_peer_accessible_by(self, [])
+        super().close()
 
     def __reduce__(self):
         return DeviceMemoryResource.from_registry, (self.uuid,)
@@ -215,6 +220,37 @@ cdef class DeviceMemoryResource(_MemPool):
             raise RuntimeError("Memory resource is not IPC-enabled")
         return self._ipc_data._alloc_handle
 
+    @property
+    def device_id(self) -> int:
+        """The associated device ordinal."""
+        return self._dev_id
+
+    @property
+    def peer_accessible_by(self):
+        """
+        Get or set the devices that can access allocations from this memory
+        pool. Access can be modified at any time and affects all allocations
+        from this memory pool.
+
+        Returns a tuple of sorted device IDs that currently have peer access to
+        allocations from this memory pool.
+
+        When setting, accepts a sequence of Device objects or device IDs.
+        Setting to an empty sequence revokes all peer access.
+
+        Examples
+        --------
+        >>> dmr = DeviceMemoryResource(0)
+        >>> dmr.peer_accessible_by = [1]  # Grant access to device 1
+        >>> assert dmr.peer_accessible_by == (1,)
+        >>> dmr.peer_accessible_by = []  # Revoke access
+        """
+        return self._peer_accessible_by
+
+    @peer_accessible_by.setter
+    def peer_accessible_by(self, devices):
+        _DMR_set_peer_accessible_by(self, devices)
+
     @property
     def is_device_accessible(self) -> bool:
         """Return True. This memory resource provides device-accessible buffers."""
@@ -226,6 +262,82 @@ cdef class DeviceMemoryResource(_MemPool):
         return False
 
 
+cdef inline _DMR_set_peer_accessible_by(DeviceMemoryResource self, devices):
+    """Make ``devices`` (Device objects or ordinals) the pool's exact peer-access set."""
+    from .._device import Device
+    cdef set[int] target_ids = {Device(dev).device_id for dev in devices}
+    target_ids.discard(self._dev_id)  # the owning device is never listed as its own peer
+    this_dev = Device(self._dev_id)
+    cdef list bad = [dev for dev in target_ids if not this_dev.can_access_peer(dev)]
+    if bad:
+        raise ValueError(f"Device {self._dev_id} cannot access peer(s): {', '.join(map(str, bad))}")
+    cdef set[int] cur_ids = set(self._peer_accessible_by)
+    cdef set[int] to_add = target_ids - cur_ids
+    cdef set[int] to_rm = cur_ids - target_ids
+    cdef size_t count = len(to_add) + len(to_rm)  # descriptors sent in one driver call
+    cdef cydriver.CUmemAccessDesc* access_desc = NULL
+    cdef size_t i = 0
+
+    if count > 0:
+        access_desc = <cydriver.CUmemAccessDesc*>PyMem_Malloc(count * sizeof(cydriver.CUmemAccessDesc))
+        if access_desc == NULL:
+            raise MemoryError("Failed to allocate memory for access descriptors")
+
+        try:
+            for dev_id in to_add:
+                access_desc[i].flags = cydriver.CUmemAccess_flags.CU_MEM_ACCESS_FLAGS_PROT_READWRITE
+                access_desc[i].location.type = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE
+                access_desc[i].location.id = dev_id
+                i += 1
+
+            for dev_id in to_rm:
+                access_desc[i].flags = cydriver.CUmemAccess_flags.CU_MEM_ACCESS_FLAGS_PROT_NONE
+                access_desc[i].location.type = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE
+                access_desc[i].location.id = dev_id
+                i += 1
+
+            with nogil:
+                HANDLE_RETURN(cydriver.cuMemPoolSetAccess(as_cu(self._h_pool), access_desc, count))
+        finally:
+            if access_desc != NULL:
+                PyMem_Free(access_desc)
+        # Set iteration order is arbitrary; sort so the getter returns IDs in the documented order.
+        self._peer_accessible_by = tuple(sorted(target_ids))
+
+
+cdef inline _DMR_init(DeviceMemoryResource self, device_id, options):
+    """Attach ``self`` to the device's existing pool, or create a new pool when options are given."""
+    from .._device import Device
+    cdef int ordinal = Device(device_id).device_id
+    cdef DeviceMemoryResourceOptions opts = check_or_create_options(
+        DeviceMemoryResourceOptions, options, "DeviceMemoryResource options",
+        keep_none=True
+    )
+    cdef bint want_ipc = False
+    cdef size_t pool_limit = 0
+
+    self._dev_id = ordinal
+
+    if opts is None:
+        # No options: use the pool returned by get_device_mempool; it is not owned by us.
+        self._h_pool = get_device_mempool(ordinal)
+        self._mempool_owned = False
+        MP_raise_release_threshold(self)
+        return
+
+    # Options supplied: validate them, then create a pool owned by this object.
+    want_ipc = opts.ipc_enabled
+    if want_ipc and not _ipc.is_supported():
+        raise RuntimeError(f"IPC is not available on {platform.system()}")
+    pool_limit = opts.max_size
+    MP_init_create_pool(
+        self,
+        cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE, ordinal,
+        cydriver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_PINNED,
+        want_ipc, pool_limit,
+    )
+
+
 # Note: this is referenced in instructions to debug nvbug 5698116.
 cpdef DMR_mempool_get_access(DeviceMemoryResource dmr, int device_id):
     """
diff --git a/cuda_core/cuda/core/_memory/_managed_memory_resource.pxd b/cuda_core/cuda/core/_memory/_managed_memory_resource.pxd
index 46e00cd4cb..5a73a57ee9 100644
--- a/cuda_core/cuda/core/_memory/_managed_memory_resource.pxd
+++ b/cuda_core/cuda/core/_memory/_managed_memory_resource.pxd
@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # SPDX-License-Identifier: Apache-2.0
 
diff --git a/cuda_core/cuda/core/_memory/_managed_memory_resource.pyx b/cuda_core/cuda/core/_memory/_managed_memory_resource.pyx
index a268520e55..64f523087c 100644
--- a/cuda_core/cuda/core/_memory/_managed_memory_resource.pyx
+++ b/cuda_core/cuda/core/_memory/_managed_memory_resource.pyx
@@ -6,7 +6,7 @@ from __future__ import annotations
 
 from cuda.bindings cimport cydriver
 
-from cuda.core._memory._memory_pool cimport _MemPool, _MemPoolOptions
+from cuda.core._memory._memory_pool cimport _MemPool, MP_init_create_pool, MP_init_current_pool
 from cuda.core._utils.cuda_utils cimport (
     HANDLE_RETURN,
     check_or_create_options,
@@ -64,40 +64,12 @@ cdef class ManagedMemoryResource(_MemPool):
     """
 
     def __init__(self, options=None):
-        cdef ManagedMemoryResourceOptions opts = check_or_create_options(
-            ManagedMemoryResourceOptions, options, "ManagedMemoryResource options",
-            keep_none=True
-        )
-        cdef _MemPoolOptions opts_base = _MemPoolOptions()
-
-        cdef int device_id = -1
-        cdef object preferred_location = None
-        if opts:
-            preferred_location = opts.preferred_location
-            if preferred_location is not None:
-                device_id = preferred_location
-            opts_base._use_current = False
-
-        opts_base._ipc_enabled = False  # IPC not supported for managed memory pools
-
-        IF CUDA_CORE_BUILD_MAJOR >= 13:
-            # Set location based on preferred_location
-            if preferred_location is None:
-                # Let the driver decide
-                opts_base._location = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_NONE
-            elif device_id == -1:
-                # CPU/host preference
-                opts_base._location = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_HOST
-            else:
-                # Device preference
-                opts_base._location = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE
-
-            opts_base._type = cydriver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_MANAGED
-
-            super().__init__(device_id, opts_base)
-            _check_concurrent_managed_access()
-        ELSE:
-            raise RuntimeError("ManagedMemoryResource requires CUDA 13.0 or later")
+        _MMR_init(self, options)
+
+    @property
+    def device_id(self) -> int:
+        """Return -1. Managed memory migrates automatically and is not tied to a specific device."""
+        return -1
 
     @property
     def is_device_accessible(self) -> bool:
@@ -110,6 +82,50 @@ cdef class ManagedMemoryResource(_MemPool):
         return True
 
 
+cdef inline _MMR_init(ManagedMemoryResource self, options):
+    cdef ManagedMemoryResourceOptions opts = check_or_create_options(
+        ManagedMemoryResourceOptions, options, "ManagedMemoryResource options",
+        keep_none=True
+    )
+    cdef int location_id = -1  # stays -1 unless options carry a preferred_location
+    cdef object preferred_location = None
+    cdef cydriver.CUmemLocationType loc_type
+    # Read the preferred location (if any) from the supplied options.
+    if opts is not None:
+        preferred_location = opts.preferred_location
+        if preferred_location is not None:
+            location_id = preferred_location
+    # Pool setup is compiled in only when CUDA_CORE_BUILD_MAJOR >= 13; otherwise this raises.
+    IF CUDA_CORE_BUILD_MAJOR >= 13:
+        if preferred_location is None:
+            loc_type = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_NONE
+        elif location_id == -1:
+            loc_type = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_HOST
+        else:
+            loc_type = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE
+        # No options -> MP_init_current_pool; options given -> MP_init_create_pool.
+        if opts is None:
+            MP_init_current_pool(
+                self,
+                loc_type,
+                location_id,
+                cydriver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_MANAGED,
+            )
+        else:
+            MP_init_create_pool(
+                self,
+                loc_type,
+                location_id,
+                cydriver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_MANAGED,
+                False,  # ipc_enabled
+                0,  # max_size
+            )
+
+        _check_concurrent_managed_access()
+    ELSE:
+        raise RuntimeError("ManagedMemoryResource requires CUDA 13.0 or later")
+
+
 cdef bint _concurrent_access_warned = False
 cdef object _concurrent_access_lock = threading.Lock()
 
diff --git a/cuda_core/cuda/core/_memory/_memory_pool.pxd b/cuda_core/cuda/core/_memory/_memory_pool.pxd
index a8838bf9dc..45062826e4 100644
--- a/cuda_core/cuda/core/_memory/_memory_pool.pxd
+++ b/cuda_core/cuda/core/_memory/_memory_pool.pxd
@@ -10,15 +10,32 @@ from cuda.core._resource_handles cimport MemoryPoolHandle
 
 cdef class _MemPool(MemoryResource):
     cdef:
-        int                   _dev_id
         MemoryPoolHandle      _h_pool
         bint                  _mempool_owned
         IPCDataForMR          _ipc_data
         object                _attributes
-        object                _peer_accessible_by
         object                __weakref__
 
 
+cdef int MP_init_create_pool(
+    _MemPool self,
+    cydriver.CUmemLocationType loc_type,
+    int loc_id,
+    cydriver.CUmemAllocationType alloc_type,
+    bint ipc_enabled,
+    size_t max_size,
+) except? -1
+
+cdef int MP_init_current_pool(
+    _MemPool self,
+    cydriver.CUmemLocationType loc_type,
+    int loc_id,
+    cydriver.CUmemAllocationType alloc_type,
+) except? -1
+
+cdef int MP_raise_release_threshold(_MemPool self) except? -1
+
+
 cdef class _MemPoolAttributes:
     cdef:
         MemoryPoolHandle _h_pool
@@ -27,13 +44,3 @@ cdef class _MemPoolAttributes:
     cdef _MemPoolAttributes _init(MemoryPoolHandle h_pool)
 
     cdef int _getattribute(self, cydriver.CUmemPool_attribute attr_enum, void* value) except? -1
-
-
-cdef class _MemPoolOptions:
-
-    cdef:
-        bint _ipc_enabled
-        size_t _max_size
-        cydriver.CUmemLocationType _location
-        cydriver.CUmemAllocationType _type
-        bint _use_current
diff --git a/cuda_core/cuda/core/_memory/_memory_pool.pyx b/cuda_core/cuda/core/_memory/_memory_pool.pyx
index 1e9f5116c1..a37ea17ab3 100644
--- a/cuda_core/cuda/core/_memory/_memory_pool.pyx
+++ b/cuda_core/cuda/core/_memory/_memory_pool.pyx
@@ -7,7 +7,6 @@ from __future__ import annotations
 from libc.limits cimport ULLONG_MAX
 from libc.stdint cimport uintptr_t
 from libc.string cimport memset
-from cpython.mem cimport PyMem_Malloc, PyMem_Free
 
 from cuda.bindings cimport cydriver
 from cuda.core._memory._buffer cimport Buffer, Buffer_from_deviceptr_handle, MemoryResource
@@ -18,7 +17,6 @@ from cuda.core._resource_handles cimport (
     DevicePtrHandle,
     create_mempool_handle,
     create_mempool_handle_ref,
-    get_device_mempool,
     deviceptr_alloc_from_pool,
     as_cu,
     as_py,
@@ -28,20 +26,6 @@ from cuda.core._utils.cuda_utils cimport (
     HANDLE_RETURN,
 )
 
-import platform  # no-cython-lint
-
-from cuda.core._utils.cuda_utils import driver
-
-
-cdef class _MemPoolOptions:
-
-    def __cinit__(self):
-        self._ipc_enabled = False
-        self._max_size = 0
-        self._location = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_INVALID
-        self._type = cydriver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_INVALID
-        self._use_current = True
-
 
 cdef class _MemPoolAttributes:
     """Provides access to memory pool attributes."""
@@ -126,24 +110,14 @@ cdef class _MemPoolAttributes:
 cdef class _MemPool(MemoryResource):
 
     def __cinit__(self):
-        self._dev_id = cydriver.CU_DEVICE_INVALID
+        # Note: subclasses use MP_init_create_pool or MP_init_current_pool to initialize.
         self._mempool_owned = False
         self._ipc_data = None
         self._attributes = None
-        self._peer_accessible_by = ()
-
-    def __init__(self, int device_id, _MemPoolOptions opts):
-        if opts._use_current:
-            _MP_init_current(self, device_id, opts)
-        else:
-            _MP_init_create(self, device_id, opts)
-
-    def __dealloc__(self):
-        _MP_close(self)
 
     def close(self):
         """
-        Close the device memory resource and destroy the associated memory pool
+        Close the memory resource and destroy the associated memory pool
         if owned.
         """
         _MP_close(self)
@@ -194,11 +168,6 @@ cdef class _MemPool(MemoryResource):
             self._attributes = _MemPoolAttributes._init(self._h_pool)
         return self._attributes
 
-    @property
-    def device_id(self) -> int:
-        """The associated device ordinal."""
-        return self._dev_id
-
     @property
     def handle(self) -> object:
         """Handle to the underlying memory pool."""
@@ -209,73 +178,6 @@ cdef class _MemPool(MemoryResource):
         """Whether the memory resource handle is owned. If False, ``close`` has no effect."""
         return self._mempool_owned
 
-    @property
-    def peer_accessible_by(self):
-        """
-        Get or set the devices that can access allocations from this memory
-        pool. Access can be modified at any time and affects all allocations
-        from this memory pool.
-
-        Returns a tuple of sorted device IDs that currently have peer access to
-        allocations from this memory pool.
-
-        When setting, accepts a sequence of Device objects or device IDs.
-        Setting to an empty sequence revokes all peer access.
-
-        Examples
-        --------
-        >>> dmr = DeviceMemoryResource(0)
-        >>> dmr.peer_accessible_by = [1]  # Grant access to device 1
-        >>> assert dmr.peer_accessible_by == (1,)
-        >>> dmr.peer_accessible_by = []  # Revoke access
-        """
-        return self._peer_accessible_by
-
-    @peer_accessible_by.setter
-    def peer_accessible_by(self, devices):
-        """Set which devices can access this memory pool."""
-        from .._device import Device
-
-        # Convert all devices to device IDs
-        cdef set[int] target_ids = {Device(dev).device_id for dev in devices}
-        target_ids.discard(self._dev_id)  # exclude this device from peer access list
-        this_dev = Device(self._dev_id)
-        cdef list bad = [dev for dev in target_ids if not this_dev.can_access_peer(dev)]
-        if bad:
-            raise ValueError(f"Device {self._dev_id} cannot access peer(s): {', '.join(map(str, bad))}")
-        cdef set[int] cur_ids = set(self._peer_accessible_by)
-        cdef set[int] to_add = target_ids - cur_ids
-        cdef set[int] to_rm = cur_ids - target_ids
-        cdef size_t count = len(to_add) + len(to_rm) # transaction size
-        cdef cydriver.CUmemAccessDesc* access_desc = NULL
-        cdef size_t i = 0
-
-        if count > 0:
-            access_desc = <cydriver.CUmemAccessDesc*>PyMem_Malloc(count * sizeof(cydriver.CUmemAccessDesc))
-            if access_desc == NULL:
-                raise MemoryError("Failed to allocate memory for access descriptors")
-
-            try:
-                for dev_id in to_add:
-                    access_desc[i].flags = cydriver.CUmemAccess_flags.CU_MEM_ACCESS_FLAGS_PROT_READWRITE
-                    access_desc[i].location.type = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE
-                    access_desc[i].location.id = dev_id
-                    i += 1
-
-                for dev_id in to_rm:
-                    access_desc[i].flags = cydriver.CUmemAccess_flags.CU_MEM_ACCESS_FLAGS_PROT_NONE
-                    access_desc[i].location.type = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE
-                    access_desc[i].location.id = dev_id
-                    i += 1
-
-                with nogil:
-                    HANDLE_RETURN(cydriver.cuMemPoolSetAccess(as_cu(self._h_pool), access_desc, count))
-            finally:
-                if access_desc != NULL:
-                    PyMem_Free(access_desc)
-
-            self._peer_accessible_by = tuple(target_ids)
-
     @property
     def is_ipc_enabled(self) -> bool:
         """Whether this memory resource has IPC enabled."""
@@ -298,106 +200,90 @@ cdef class _MemPool(MemoryResource):
         return getattr(self._ipc_data, 'uuid', None)
 
 
-# _MemPool Implementation
-# -----------------------
+cdef int MP_init_create_pool(
+    _MemPool self,
+    cydriver.CUmemLocationType loc_type,
+    int loc_id,
+    cydriver.CUmemAllocationType alloc_type,
+    bint ipc_enabled,
+    size_t max_size,
+) except? -1:
+    """Initialize a _MemPool by creating a new memory pool with the given
+    parameters.
 
-cdef int _MP_init_current(_MemPool self, int dev_id, _MemPoolOptions opts) except?-1:
-    # Get the current memory pool.
-    cdef cydriver.cuuint64_t current_threshold
-    cdef cydriver.cuuint64_t max_threshold = ULLONG_MAX
-    cdef cydriver.CUmemLocation loc
-    cdef cydriver.CUmemoryPool pool
+    Sets ``_h_pool`` (owning), ``_mempool_owned``, and ``_ipc_data``.
+    """
+    cdef cydriver.CUmemPoolProps properties
+    memset(&properties, 0, sizeof(cydriver.CUmemPoolProps))
 
-    self._dev_id = dev_id
-    self._mempool_owned = False
+    properties.allocType = alloc_type
+    properties.handleTypes = (
+        _ipc.IPC_HANDLE_TYPE if ipc_enabled
+        else cydriver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_NONE
+    )
+    properties.location.id = loc_id
+    properties.location.type = loc_type
+    properties.maxSize = max_size
 
-    if opts._type == cydriver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_PINNED \
-            and opts._location == cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE:
-        assert dev_id >= 0
-        self._h_pool = get_device_mempool(dev_id)
+    self._mempool_owned = True
+    self._h_pool = create_mempool_handle(properties)
 
-        # Set a higher release threshold to improve performance when there are
-        # no active allocations.  By default, the release threshold is 0, which
-        # means memory is immediately released back to the OS when there are no
-        # active suballocations, causing performance issues.
-        with nogil:
-            HANDLE_RETURN(
-                cydriver.cuMemPoolGetAttribute(
-                    as_cu(self._h_pool),
-                    cydriver.CUmemPool_attribute.CU_MEMPOOL_ATTR_RELEASE_THRESHOLD,
-                    &current_threshold
-                )
-            )
-            if current_threshold == 0:
-                HANDLE_RETURN(cydriver.cuMemPoolSetAttribute(
-                    as_cu(self._h_pool),
-                    cydriver.CUmemPool_attribute.CU_MEMPOOL_ATTR_RELEASE_THRESHOLD,
-                    &max_threshold
-                ))
-    elif opts._type == cydriver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_PINNED \
-            and opts._location == cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_HOST:
-        IF CUDA_CORE_BUILD_MAJOR >= 13:
-            assert dev_id == -1
-            loc.id = dev_id
-            loc.type = opts._location
-            with nogil:
-                HANDLE_RETURN(cydriver.cuMemGetMemPool(&pool, &loc, opts._type))
-            self._h_pool = create_mempool_handle_ref(pool)
-        ELSE:
-            raise RuntimeError("not supported")
-    elif opts._type == cydriver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_PINNED \
-            and opts._location == cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_HOST_NUMA:
-        IF CUDA_CORE_BUILD_MAJOR >= 13:
-            assert dev_id == 0
-            loc.id = 0
-            loc.type = opts._location
-            with nogil:
-                HANDLE_RETURN(cydriver.cuMemGetMemPool(&pool, &loc, opts._type))
-            self._h_pool = create_mempool_handle_ref(pool)
-        ELSE:
-            raise RuntimeError("not supported")
-    else:
-        IF CUDA_CORE_BUILD_MAJOR >= 13:
-            if opts._type == cydriver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_MANAGED:
-                # Managed memory pools
-                loc.id = dev_id
-                loc.type = opts._location
-                with nogil:
-                    HANDLE_RETURN(cydriver.cuMemGetMemPool(&pool, &loc, opts._type))
-                self._h_pool = create_mempool_handle_ref(pool)
-            else:
-                assert False
-        ELSE:
-            assert False
+    if ipc_enabled:
+        alloc_handle = _ipc.MP_export_mempool(self)
+        self._ipc_data = _ipc.IPCDataForMR(alloc_handle, False)
 
     return 0
 
 
-cdef int _MP_init_create(_MemPool self, int dev_id, _MemPoolOptions opts) except?-1:
-    cdef cydriver.CUmemPoolProps properties
-    memset(&properties, 0, sizeof(cydriver.CUmemPoolProps))
+cdef int MP_init_current_pool(
+    _MemPool self,
+    cydriver.CUmemLocationType loc_type,
+    int loc_id,
+    cydriver.CUmemAllocationType alloc_type,
+) except? -1:
+    """Initialize a _MemPool by getting the driver's current pool for a
+    location and allocation type.
 
-    cdef bint ipc_enabled = opts._ipc_enabled
-    properties.allocType = opts._type
-    properties.handleTypes = _ipc.IPC_HANDLE_TYPE if ipc_enabled else cydriver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_NONE
-    properties.location.id = dev_id
-    properties.location.type = opts._location
-    # managed memory does not support maxSize as of CUDA 13.0
+    Sets ``_h_pool`` (non-owning) via ``cuMemGetMemPool``.
+    Requires CUDA 13+.
+    """
     IF CUDA_CORE_BUILD_MAJOR >= 13:
-        if properties.allocType != cydriver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_MANAGED:
-            properties.maxSize = opts._max_size
+        cdef cydriver.CUmemLocation loc
+        cdef cydriver.CUmemoryPool pool
+        loc.id = loc_id
+        loc.type = loc_type
+        with nogil:
+            HANDLE_RETURN(cydriver.cuMemGetMemPool(&pool, &loc, alloc_type))
+        self._h_pool = create_mempool_handle_ref(pool)
+        self._mempool_owned = False
     ELSE:
-        properties.maxSize = opts._max_size
-
-    self._dev_id = dev_id
-    self._mempool_owned = True
+        raise RuntimeError("not supported")
+    return 0
 
-    self._h_pool = create_mempool_handle(properties)
 
-    if ipc_enabled:
-        alloc_handle = _ipc.MP_export_mempool(self)
-        self._ipc_data = _ipc.IPCDataForMR(alloc_handle, False)
+cdef int MP_raise_release_threshold(_MemPool self) except? -1:
+    """Raise the pool's release threshold to ULLONG_MAX if currently zero.
 
+    By default the release threshold is 0, meaning memory is returned to
+    the OS as soon as there are no active suballocations.  Setting it to
+    ULLONG_MAX avoids repeated OS round-trips.
+    """
+    cdef cydriver.cuuint64_t current_threshold
+    cdef cydriver.cuuint64_t max_threshold = ULLONG_MAX
+    with nogil:
+        HANDLE_RETURN(
+            cydriver.cuMemPoolGetAttribute(
+                as_cu(self._h_pool),
+                cydriver.CUmemPool_attribute.CU_MEMPOOL_ATTR_RELEASE_THRESHOLD,
+                &current_threshold
+            )
+        )
+        if current_threshold == 0:
+            HANDLE_RETURN(cydriver.cuMemPoolSetAttribute(
+                as_cu(self._h_pool),
+                cydriver.CUmemPool_attribute.CU_MEMPOOL_ATTR_RELEASE_THRESHOLD,
+                &max_threshold
+            ))
     return 0
 
 
@@ -438,17 +324,9 @@ cdef inline _MP_close(_MemPool self):
     if not self._h_pool:
         return
 
-    # This works around nvbug 5698116. When a memory pool handle is recycled
-    # the new handle inherits the peer access state of the previous handle.
-    if self._peer_accessible_by:
-        self.peer_accessible_by = []
-
     # Reset members in declaration order.
-    # The RAII deleter handles nvbug 5698116 workaround (clears peer access)
-    # and calls cuMemPoolDestroy if this is an owning handle.
+    # The RAII deleter calls cuMemPoolDestroy if this is an owning handle.
     self._h_pool.reset()
-    self._dev_id = cydriver.CU_DEVICE_INVALID
     self._mempool_owned = False
     self._ipc_data = None
     self._attributes = None
-    self._peer_accessible_by = ()
diff --git a/cuda_core/cuda/core/_memory/_pinned_memory_resource.pyx b/cuda_core/cuda/core/_memory/_pinned_memory_resource.pyx
index b2a9db4594..b35bc1ebdf 100644
--- a/cuda_core/cuda/core/_memory/_pinned_memory_resource.pyx
+++ b/cuda_core/cuda/core/_memory/_pinned_memory_resource.pyx
@@ -1,11 +1,11 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # SPDX-License-Identifier: Apache-2.0
 
 from __future__ import annotations
 
 from cuda.bindings cimport cydriver
-from cuda.core._memory._memory_pool cimport _MemPool, _MemPoolOptions
+from cuda.core._memory._memory_pool cimport _MemPool, MP_init_create_pool, MP_init_current_pool
 from cuda.core._memory cimport _ipc
 from cuda.core._memory._ipc cimport IPCAllocationHandle
 from cuda.core._utils.cuda_utils cimport (
@@ -24,67 +24,6 @@ import warnings
 
 from cuda.core._utils.cuda_utils import check_multiprocessing_start_method
 
-
-# Cache to ensure NUMA warning is only raised once per process
-cdef bint _numa_warning_shown = False
-cdef object _lock = threading.Lock()
-
-
-def _check_numa_nodes():
-    """Check if system has multiple NUMA nodes and warn if so."""
-    global _numa_warning_shown
-    if _numa_warning_shown:
-        return
-
-    with _lock:
-        if _numa_warning_shown:
-            return
-
-        if platform.system() != "Linux":
-            _numa_warning_shown = True
-            return
-
-        numa_count = None
-
-        # Try /sys filesystem first (most reliable and doesn't require external tools)
-        try:
-            node_path = "/sys/devices/system/node"
-            if os.path.exists(node_path):
-                # Count directories named "node[0-9]+"
-                nodes = [d for d in os.listdir(node_path) if d.startswith("node") and d[4:].isdigit()]
-                numa_count = len(nodes)
-        except (OSError, PermissionError):
-            pass
-
-        # Fallback to lscpu if /sys check didn't work
-        if numa_count is None:
-            try:
-                result = subprocess.run(
-                    ["lscpu"],
-                    capture_output=True,
-                    text=True,
-                    timeout=1
-                )
-                for line in result.stdout.splitlines():
-                    if line.startswith("NUMA node(s):"):
-                        numa_count = int(line.split(":")[1].strip())
-                        break
-            except (subprocess.SubprocessError, ValueError, FileNotFoundError):
-                pass
-
-        # Warn if multiple NUMA nodes detected
-        if numa_count is not None and numa_count > 1:
-            warnings.warn(
-                f"System has {numa_count} NUMA nodes. IPC-enabled pinned memory "
-                f"uses location ID 0, which may not work correctly with multiple "
-                f"NUMA nodes.",
-                UserWarning,
-                stacklevel=3
-            )
-
-        _numa_warning_shown = True
-
-
 __all__ = ['PinnedMemoryResource', 'PinnedMemoryResourceOptions']
 
 
@@ -143,30 +82,7 @@ cdef class PinnedMemoryResource(_MemPool):
     """
 
     def __init__(self, options=None):
-        cdef PinnedMemoryResourceOptions opts = check_or_create_options(
-            PinnedMemoryResourceOptions, options, "PinnedMemoryResource options",
-            keep_none=True
-        )
-        cdef _MemPoolOptions opts_base = _MemPoolOptions()
-
-        cdef bint ipc_enabled = False
-        if opts:
-            ipc_enabled = opts.ipc_enabled
-            if ipc_enabled and not _ipc.is_supported():
-                raise RuntimeError(f"IPC is not available on {platform.system()}")
-            if ipc_enabled:
-                # Check for multiple NUMA nodes on Linux
-                _check_numa_nodes()
-            opts_base._max_size = opts.max_size
-            opts_base._use_current = False
-        opts_base._ipc_enabled = ipc_enabled
-        if ipc_enabled:
-            opts_base._location = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_HOST_NUMA
-        else:
-            opts_base._location = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_HOST
-        opts_base._type = cydriver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_PINNED
-
-        super().__init__(0 if ipc_enabled else -1, opts_base)
+        _PMR_init(self, options)
 
     def __reduce__(self):
         return PinnedMemoryResource.from_registry, (self.uuid,)
@@ -239,6 +155,11 @@ cdef class PinnedMemoryResource(_MemPool):
             raise RuntimeError("Memory resource is not IPC-enabled")
         return self._ipc_data._alloc_handle
 
+    @property
+    def device_id(self) -> int:
+        """Return -1. Pinned memory is host memory and is not associated with a specific device."""
+        return -1
+
     @property
     def is_device_accessible(self) -> bool:
         """Return True. This memory resource provides device-accessible buffers."""
@@ -250,6 +171,49 @@ cdef class PinnedMemoryResource(_MemPool):
         return True
 
 
+cdef inline _PMR_init(PinnedMemoryResource self, options):
+    cdef PinnedMemoryResourceOptions opts = check_or_create_options(
+        PinnedMemoryResourceOptions, options, "PinnedMemoryResource options",
+        keep_none=True
+    )
+    cdef bint ipc_enabled = False
+    cdef size_t max_size = 0
+    cdef cydriver.CUmemLocationType loc_type
+    cdef int location_id
+
+    if opts is not None:
+        ipc_enabled = opts.ipc_enabled
+        if ipc_enabled and not _ipc.is_supported():
+            raise RuntimeError(f"IPC is not available on {platform.system()}")
+        if ipc_enabled:
+            _check_numa_nodes()
+        max_size = opts.max_size
+
+    if ipc_enabled:
+        loc_type = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_HOST_NUMA
+        location_id = 0
+    else:
+        loc_type = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_HOST
+        location_id = -1
+
+    if opts is None:
+        MP_init_current_pool(
+            self,
+            loc_type,
+            location_id,
+            cydriver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_PINNED,
+        )
+    else:
+        MP_init_create_pool(
+            self,
+            loc_type,
+            location_id,
+            cydriver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_PINNED,
+            ipc_enabled,
+            max_size,
+        )
+
+
 def _deep_reduce_pinned_memory_resource(mr):
     check_multiprocessing_start_method()
     alloc_handle = mr.get_allocation_handle()
@@ -257,3 +221,60 @@ def _deep_reduce_pinned_memory_resource(mr):
 
 
 multiprocessing.reduction.register(PinnedMemoryResource, _deep_reduce_pinned_memory_resource)
+
+
+cdef bint _numa_warning_shown = False
+cdef object _numa_lock = threading.Lock()
+
+
+cdef inline _check_numa_nodes():
+    """Check if system has multiple NUMA nodes and warn if so."""
+    global _numa_warning_shown
+    if _numa_warning_shown:
+        return
+
+    with _numa_lock:
+        if _numa_warning_shown:
+            return
+
+        if platform.system() != "Linux":
+            _numa_warning_shown = True
+            return
+
+        numa_count = None
+
+        # Try /sys filesystem first (most reliable and doesn't require external tools)
+        try:
+            node_path = "/sys/devices/system/node"
+            if os.path.exists(node_path):
+                nodes = [d for d in os.listdir(node_path) if d.startswith("node") and d[4:].isdigit()]
+                numa_count = len(nodes)
+        except (OSError, PermissionError):
+            pass
+
+        # Fallback to lscpu if /sys check didn't work
+        if numa_count is None:
+            try:
+                result = subprocess.run(
+                    ["lscpu"],
+                    capture_output=True,
+                    text=True,
+                    timeout=1
+                )
+                for line in result.stdout.splitlines():
+                    if line.startswith("NUMA node(s):"):
+                        numa_count = int(line.split(":")[1].strip())
+                        break
+            except (subprocess.SubprocessError, ValueError, FileNotFoundError):
+                pass
+
+        if numa_count is not None and numa_count > 1:
+            warnings.warn(
+                f"System has {numa_count} NUMA nodes. IPC-enabled pinned memory "
+                f"uses location ID 0, which may not work correctly with multiple "
+                f"NUMA nodes.",
+                UserWarning,
+                stacklevel=3
+            )
+
+        _numa_warning_shown = True
diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py
index 65230944ad..49c4935f59 100644
--- a/cuda_core/tests/test_memory.py
+++ b/cuda_core/tests/test_memory.py
@@ -1038,7 +1038,7 @@ def test_pinned_mempool_ipc_basic():
     assert mr.is_ipc_enabled
     assert mr.is_device_accessible
     assert mr.is_host_accessible
-    assert mr.device_id == 0  # IPC-enabled uses location id 0
+    assert mr.device_id == -1  # pinned memory is not device-specific
 
     # Test allocation handle export
     alloc_handle = mr.get_allocation_handle()

From e55a26b0bdb340361acf3a5bb8b896d1bf9e2b27 Mon Sep 17 00:00:00 2001
From: Andy Jost <ajost@nvidia.com>
Date: Thu, 26 Feb 2026 17:10:00 -0800
Subject: [PATCH 2/5] Fix PinnedMemoryResource IPC to derive NUMA ID from
 active device (#1603)

PinnedMemoryResource(ipc_enabled=True) hardcoded host NUMA ID 0, causing
failures on multi-NUMA systems where the active device is attached to a
different NUMA node. Now derives the NUMA ID from the current device's
host_numa_id attribute, and adds an explicit numa_id option for manual
override. Removes the _check_numa_nodes warning machinery in favor of
proper NUMA node selection.

Made-with: Cursor
---
 .../core/_memory/_pinned_memory_resource.pxd  |   2 +-
 .../core/_memory/_pinned_memory_resource.pyx  | 119 +++++++-----------
 cuda_core/tests/test_memory.py                |  72 ++++++++++-
 3 files changed, 116 insertions(+), 77 deletions(-)

diff --git a/cuda_core/cuda/core/_memory/_pinned_memory_resource.pxd b/cuda_core/cuda/core/_memory/_pinned_memory_resource.pxd
index a8262d9bd8..fcfcfeb346 100644
--- a/cuda_core/cuda/core/_memory/_pinned_memory_resource.pxd
+++ b/cuda_core/cuda/core/_memory/_pinned_memory_resource.pxd
@@ -7,4 +7,4 @@ from cuda.core._memory._ipc cimport IPCDataForMR
 
 
 cdef class PinnedMemoryResource(_MemPool):
-    pass
+    cdef int _numa_id
diff --git a/cuda_core/cuda/core/_memory/_pinned_memory_resource.pyx b/cuda_core/cuda/core/_memory/_pinned_memory_resource.pyx
index b35bc1ebdf..64ebcc7bc5 100644
--- a/cuda_core/cuda/core/_memory/_pinned_memory_resource.pyx
+++ b/cuda_core/cuda/core/_memory/_pinned_memory_resource.pyx
@@ -15,12 +15,8 @@ from cuda.core._utils.cuda_utils cimport (
 
 from dataclasses import dataclass
 import multiprocessing
-import os
 import platform  # no-cython-lint
-import subprocess
-import threading
 import uuid
-import warnings
 
 from cuda.core._utils.cuda_utils import check_multiprocessing_start_method
 
@@ -41,9 +37,22 @@ cdef class PinnedMemoryResourceOptions:
     max_size : int, optional
         Maximum pool size. When set to 0, defaults to a system-dependent value.
         (Default to 0)
+
+    numa_id : int or None, optional
+        Host NUMA node ID for pool placement. When set to None (the default),
+        the behavior depends on ``ipc_enabled``:
+
+        - ``ipc_enabled=False``: OS-managed placement (location type HOST).
+        - ``ipc_enabled=True``: automatically derived from the current CUDA
+          device's ``host_numa_id`` attribute, requiring an active CUDA
+          context.
+
+        When set to a non-negative integer, that NUMA node is used explicitly regardless
+        of ``ipc_enabled`` (location type HOST_NUMA). A negative value raises ``ValueError``.
     """
     ipc_enabled : bool = False
     max_size : int = 0
+    numa_id : int | None = None
 
 
 cdef class PinnedMemoryResource(_MemPool):
@@ -71,12 +80,10 @@ cdef class PinnedMemoryResource(_MemPool):
     -----
     To create an IPC-Enabled memory resource (MR) that is capable of sharing
     allocations between processes, specify ``ipc_enabled=True`` in the initializer
-    option. When IPC is enabled, the location type is automatically set to
-    CU_MEM_LOCATION_TYPE_HOST_NUMA instead of CU_MEM_LOCATION_TYPE_HOST,
-    with location ID 0.
-
-    Note: IPC support for pinned memory requires a single NUMA node. A warning
-    is issued if multiple NUMA nodes are detected.
+    option. When IPC is enabled and ``numa_id`` is not specified, the NUMA node
+    is automatically derived from the current CUDA device's ``host_numa_id``
+    attribute, which requires an active CUDA context. If ``numa_id`` is
+    explicitly set, that value is used regardless of ``ipc_enabled``.
 
     See :class:`DeviceMemoryResource` for more details on IPC usage patterns.
     """
@@ -160,6 +167,11 @@ cdef class PinnedMemoryResource(_MemPool):
         """Return -1. Pinned memory is host memory and is not associated with a specific device."""
         return -1
 
+    @property
+    def numa_id(self) -> int:
+        """The host NUMA node ID used for pool placement, or -1 for OS-managed placement."""
+        return self._numa_id
+
     @property
     def is_device_accessible(self) -> bool:
         """Return True. This memory resource provides device-accessible buffers."""
@@ -172,6 +184,8 @@ cdef class PinnedMemoryResource(_MemPool):
 
 
 cdef inline _PMR_init(PinnedMemoryResource self, options):
+    from .._device import Device
+
     cdef PinnedMemoryResourceOptions opts = check_or_create_options(
         PinnedMemoryResourceOptions, options, "PinnedMemoryResource options",
         keep_none=True
@@ -179,35 +193,47 @@ cdef inline _PMR_init(PinnedMemoryResource self, options):
     cdef bint ipc_enabled = False
     cdef size_t max_size = 0
     cdef cydriver.CUmemLocationType loc_type
-    cdef int location_id
+    cdef int numa_id = -1
 
     if opts is not None:
         ipc_enabled = opts.ipc_enabled
         if ipc_enabled and not _ipc.is_supported():
             raise RuntimeError(f"IPC is not available on {platform.system()}")
-        if ipc_enabled:
-            _check_numa_nodes()
         max_size = opts.max_size
 
-    if ipc_enabled:
+        if opts.numa_id is not None:
+            numa_id = opts.numa_id
+            if numa_id < 0:
+                raise ValueError(f"numa_id must be >= 0, got {numa_id}")
+        elif ipc_enabled:
+            dev = Device()
+            numa_id = dev.properties.host_numa_id
+            if numa_id < 0:
+                raise RuntimeError(
+                    "Cannot determine host NUMA ID for IPC-enabled pinned "
+                    "memory pool. The system may not support NUMA, or no "
+                    "CUDA context is active. Set numa_id explicitly or "
+                    "call Device.set_current() first.")
+
+    if numa_id >= 0:
         loc_type = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_HOST_NUMA
-        location_id = 0
     else:
         loc_type = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_HOST
-        location_id = -1
+
+    self._numa_id = numa_id
 
     if opts is None:
         MP_init_current_pool(
             self,
             loc_type,
-            location_id,
+            numa_id,
             cydriver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_PINNED,
         )
     else:
         MP_init_create_pool(
             self,
             loc_type,
-            location_id,
+            numa_id,
             cydriver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_PINNED,
             ipc_enabled,
             max_size,
@@ -221,60 +247,3 @@ def _deep_reduce_pinned_memory_resource(mr):
 
 
 multiprocessing.reduction.register(PinnedMemoryResource, _deep_reduce_pinned_memory_resource)
-
-
-cdef bint _numa_warning_shown = False
-cdef object _numa_lock = threading.Lock()
-
-
-cdef inline _check_numa_nodes():
-    """Check if system has multiple NUMA nodes and warn if so."""
-    global _numa_warning_shown
-    if _numa_warning_shown:
-        return
-
-    with _numa_lock:
-        if _numa_warning_shown:
-            return
-
-        if platform.system() != "Linux":
-            _numa_warning_shown = True
-            return
-
-        numa_count = None
-
-        # Try /sys filesystem first (most reliable and doesn't require external tools)
-        try:
-            node_path = "/sys/devices/system/node"
-            if os.path.exists(node_path):
-                nodes = [d for d in os.listdir(node_path) if d.startswith("node") and d[4:].isdigit()]
-                numa_count = len(nodes)
-        except (OSError, PermissionError):
-            pass
-
-        # Fallback to lscpu if /sys check didn't work
-        if numa_count is None:
-            try:
-                result = subprocess.run(
-                    ["lscpu"],
-                    capture_output=True,
-                    text=True,
-                    timeout=1
-                )
-                for line in result.stdout.splitlines():
-                    if line.startswith("NUMA node(s):"):
-                        numa_count = int(line.split(":")[1].strip())
-                        break
-            except (subprocess.SubprocessError, ValueError, FileNotFoundError):
-                pass
-
-        if numa_count is not None and numa_count > 1:
-            warnings.warn(
-                f"System has {numa_count} NUMA nodes. IPC-enabled pinned memory "
-                f"uses location ID 0, which may not work correctly with multiple "
-                f"NUMA nodes.",
-                UserWarning,
-                stacklevel=3
-            )
-
-        _numa_warning_shown = True
diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py
index 49c4935f59..8933dcba09 100644
--- a/cuda_core/tests/test_memory.py
+++ b/cuda_core/tests/test_memory.py
@@ -1039,6 +1039,7 @@ def test_pinned_mempool_ipc_basic():
     assert mr.is_device_accessible
     assert mr.is_host_accessible
     assert mr.device_id == -1  # pinned memory is not device-specific
+    assert mr.numa_id >= 0  # IPC requires a concrete NUMA node
 
     # Test allocation handle export
     alloc_handle = mr.get_allocation_handle()
@@ -1070,7 +1071,8 @@ def test_pinned_mempool_ipc_errors():
     options = PinnedMemoryResourceOptions(max_size=POOL_SIZE, ipc_enabled=False)
     mr = PinnedMemoryResource(options)
     assert not mr.is_ipc_enabled
-    assert mr.device_id == -1  # Non-IPC uses location id -1
+    assert mr.device_id == -1
+    assert mr.numa_id == -1  # Non-IPC uses OS-managed placement
 
     buffer = mr.allocate(64)
     ipc_error_msg = "Memory resource is not IPC-enabled"
@@ -1089,6 +1091,74 @@ def test_pinned_mempool_ipc_errors():
     mr.close()
 
 
+def test_pinned_mr_numa_id_default_no_ipc(init_cuda):
+    """numa_id defaults to -1 (OS-managed) when IPC is disabled."""
+    device = Device()
+    skip_if_pinned_memory_unsupported(device)
+
+    mr = PinnedMemoryResource(PinnedMemoryResourceOptions())
+    assert mr.numa_id == -1
+    mr.close()
+
+    mr = PinnedMemoryResource(PinnedMemoryResourceOptions(ipc_enabled=False))
+    assert mr.numa_id == -1
+    mr.close()
+
+
+def test_pinned_mr_numa_id_default_with_ipc(init_cuda):
+    """numa_id is derived from the current device when IPC is enabled."""
+    device = Device()
+    skip_if_pinned_memory_unsupported(device)
+
+    if platform.system() == "Windows":
+        pytest.skip("IPC not implemented for Windows")
+    if not supports_ipc_mempool(device):
+        pytest.skip("Driver rejects IPC-enabled mempool creation on this platform")
+
+    expected_numa_id = device.properties.host_numa_id
+    if expected_numa_id < 0:
+        pytest.skip("System does not support NUMA")
+
+    mr = PinnedMemoryResource(PinnedMemoryResourceOptions(ipc_enabled=True, max_size=POOL_SIZE))
+    assert mr.numa_id == expected_numa_id
+    mr.close()
+
+
+def test_pinned_mr_numa_id_explicit(init_cuda):
+    """Explicit numa_id is used regardless of ipc_enabled."""
+    device = Device()
+    skip_if_pinned_memory_unsupported(device)
+
+    host_numa_id = device.properties.host_numa_id
+    if host_numa_id < 0:
+        pytest.skip("System does not support NUMA")
+
+    mr = PinnedMemoryResource(PinnedMemoryResourceOptions(numa_id=host_numa_id))
+    assert mr.numa_id == host_numa_id
+    mr.close()
+
+    if platform.system() == "Windows":
+        pytest.skip("IPC not implemented for Windows")
+    if not supports_ipc_mempool(device):
+        pytest.skip("Driver rejects IPC-enabled mempool creation on this platform")
+
+    mr = PinnedMemoryResource(PinnedMemoryResourceOptions(ipc_enabled=True, numa_id=host_numa_id, max_size=POOL_SIZE))
+    assert mr.numa_id == host_numa_id
+    mr.close()
+
+
+def test_pinned_mr_numa_id_negative_error(init_cuda):
+    """Negative numa_id raises ValueError."""
+    device = Device()
+    skip_if_pinned_memory_unsupported(device)
+
+    with pytest.raises(ValueError, match="numa_id must be >= 0"):
+        PinnedMemoryResource(PinnedMemoryResourceOptions(numa_id=-1))
+
+    with pytest.raises(ValueError, match="numa_id must be >= 0"):
+        PinnedMemoryResource(PinnedMemoryResourceOptions(numa_id=-42))
+
+
 @pytest.mark.parametrize("ipc_enabled", [True, False])
 @pytest.mark.parametrize(
     "property_name,expected_type",

From 29025785a584bf0cdd7c72875d3200796ffcf7f8 Mon Sep 17 00:00:00 2001
From: Andy Jost <ajost@nvidia.com>
Date: Mon, 2 Mar 2026 13:37:59 -0800
Subject: [PATCH 3/5] Add preferred_location_type option and query property to
 ManagedMemoryResource

Extends ManagedMemoryResourceOptions with a preferred_location_type field
("device", "host", "host_numa", or None) enabling NUMA-aware managed memory
pool placement. Adds ManagedMemoryResource.preferred_location property to
query the resolved setting. Fully backwards-compatible: existing code using
preferred_location alone continues to work unchanged.

Made-with: Cursor
---
 .pre-commit-config.yaml                       |    1 +
 .../cuda/bindings/_bindings/cydriver.pxd.in   |    4 +-
 .../cuda/bindings/_bindings/cydriver.pyx.in   |    4 +-
 .../cuda/bindings/_bindings/cynvrtc.pxd.in    |    4 +-
 .../cuda/bindings/_bindings/cynvrtc.pyx.in    |    4 +-
 .../cuda/bindings/_bindings/cyruntime.pxd.in  |    4 +-
 .../cuda/bindings/_bindings/cyruntime.pyx.in  |    4 +-
 .../bindings/_bindings/cyruntime_ptds.pxd.in  |    4 +-
 .../bindings/_bindings/cyruntime_ptds.pyx.in  |    4 +-
 .../cuda/bindings/_internal/_fast_enum.py     |    2 +-
 .../cuda/bindings/_internal/cufile.pxd        |    2 +-
 .../cuda/bindings/_internal/cufile_linux.pyx  |    2 +-
 .../cuda/bindings/_internal/nvjitlink.pxd     |    2 +-
 .../bindings/_internal/nvjitlink_linux.pyx    |    2 +-
 .../bindings/_internal/nvjitlink_windows.pyx  |    2 +-
 .../cuda/bindings/_internal/nvml.pxd          |    2 +-
 .../cuda/bindings/_internal/nvml_linux.pyx    |    2 +-
 .../cuda/bindings/_internal/nvml_windows.pyx  |    2 +-
 .../cuda/bindings/_internal/nvvm.pxd          |    2 +-
 .../cuda/bindings/_internal/nvvm_linux.pyx    |    2 +-
 .../cuda/bindings/_internal/nvvm_windows.pyx  |    2 +-
 cuda_bindings/cuda/bindings/cufile.pxd        |    2 +-
 cuda_bindings/cuda/bindings/cufile.pyx        |  152 ++-
 cuda_bindings/cuda/bindings/cycufile.pxd      |    2 +-
 cuda_bindings/cuda/bindings/cycufile.pyx      |    2 +-
 cuda_bindings/cuda/bindings/cydriver.pxd.in   |    4 +-
 cuda_bindings/cuda/bindings/cydriver.pyx.in   |    4 +-
 cuda_bindings/cuda/bindings/cynvjitlink.pxd   |    2 +-
 cuda_bindings/cuda/bindings/cynvjitlink.pyx   |    2 +-
 cuda_bindings/cuda/bindings/cynvml.pxd        |    2 +-
 cuda_bindings/cuda/bindings/cynvml.pyx        |    2 +-
 cuda_bindings/cuda/bindings/cynvrtc.pxd.in    |    4 +-
 cuda_bindings/cuda/bindings/cynvrtc.pyx.in    |    4 +-
 cuda_bindings/cuda/bindings/cynvvm.pxd        |    2 +-
 cuda_bindings/cuda/bindings/cynvvm.pyx        |    2 +-
 cuda_bindings/cuda/bindings/cyruntime.pxd.in  |    4 +-
 cuda_bindings/cuda/bindings/cyruntime.pyx.in  |    4 +-
 .../cuda/bindings/cyruntime_functions.pxi.in  |    4 +-
 .../cuda/bindings/cyruntime_types.pxi.in      |    4 +-
 cuda_bindings/cuda/bindings/driver.pxd.in     |    2 +-
 cuda_bindings/cuda/bindings/driver.pyx.in     |    2 +-
 cuda_bindings/cuda/bindings/nvjitlink.pxd     |    2 +-
 cuda_bindings/cuda/bindings/nvjitlink.pyx     |    4 +-
 cuda_bindings/cuda/bindings/nvml.pxd          |    2 +-
 cuda_bindings/cuda/bindings/nvml.pyx          | 1197 ++++++++++++++++-
 cuda_bindings/cuda/bindings/nvrtc.pxd.in      |    4 +-
 cuda_bindings/cuda/bindings/nvrtc.pyx.in      |    2 +-
 cuda_bindings/cuda/bindings/nvvm.pxd          |    2 +-
 cuda_bindings/cuda/bindings/nvvm.pyx          |    4 +-
 cuda_bindings/cuda/bindings/runtime.pxd.in    |    2 +-
 cuda_bindings/cuda/bindings/runtime.pyx.in    |    6 +-
 .../core/_memory/_managed_memory_resource.pxd |    4 +-
 .../core/_memory/_managed_memory_resource.pyx |  173 ++-
 cuda_core/cuda/core/_program.pxd              |    2 +
 cuda_core/cuda/core/_program.pyx              |  123 +-
 cuda_core/cuda/core/_stream.pyx               |   24 +-
 cuda_core/docs/source/release/0.6.0-notes.rst |    5 +
 cuda_core/docs/source/release/0.7.x-notes.rst |   57 +
 cuda_core/pyproject.toml                      |    8 +-
 cuda_core/tests/conftest.py                   |    6 +
 cuda_core/tests/test_memory.py                |  149 ++
 cuda_core/tests/test_program.py               |   35 +
 pytest.ini                                    |    1 +
 63 files changed, 1952 insertions(+), 123 deletions(-)
 create mode 100644 cuda_core/docs/source/release/0.7.x-notes.rst

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 44ba5d5bf2..2fbb9d897e 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -19,6 +19,7 @@ repos:
     hooks:
       - id: ruff-check
         args: [--fix, --show-fixes]
+        exclude: ^cuda_bindings/cuda/bindings/_internal/_fast_enum\.py$
       - id: ruff-format
 
   - repo: local
diff --git a/cuda_bindings/cuda/bindings/_bindings/cydriver.pxd.in b/cuda_bindings/cuda/bindings/_bindings/cydriver.pxd.in
index 6f5a2a4014..2127076caa 100644
--- a/cuda_bindings/cuda/bindings/_bindings/cydriver.pxd.in
+++ b/cuda_bindings/cuda/bindings/_bindings/cydriver.pxd.in
@@ -1,7 +1,7 @@
-# SPDX-FileCopyrightText: Copyright (c) 2021-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 2021-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
-# This code was automatically generated with version 13.1.0. Do not modify it directly.
+# This code was automatically generated with version 13.1.0, generator version 49a8141. Do not modify it directly.
 from cuda.bindings.cydriver cimport *
 
 {{if 'cuGetErrorString' in found_functions}}
diff --git a/cuda_bindings/cuda/bindings/_bindings/cydriver.pyx.in b/cuda_bindings/cuda/bindings/_bindings/cydriver.pyx.in
index 4dba6dfbc8..e7b4f463b6 100644
--- a/cuda_bindings/cuda/bindings/_bindings/cydriver.pyx.in
+++ b/cuda_bindings/cuda/bindings/_bindings/cydriver.pyx.in
@@ -1,7 +1,7 @@
-# SPDX-FileCopyrightText: Copyright (c) 2021-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 2021-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
-# This code was automatically generated with version 13.1.0. Do not modify it directly.
+# This code was automatically generated with version 13.1.0, generator version 49a8141. Do not modify it directly.
 {{if 'Windows' == platform.system()}}
 import os
 cimport cuda.bindings._lib.windll as windll
diff --git a/cuda_bindings/cuda/bindings/_bindings/cynvrtc.pxd.in b/cuda_bindings/cuda/bindings/_bindings/cynvrtc.pxd.in
index f1bbb53998..7d8fc40a20 100644
--- a/cuda_bindings/cuda/bindings/_bindings/cynvrtc.pxd.in
+++ b/cuda_bindings/cuda/bindings/_bindings/cynvrtc.pxd.in
@@ -1,7 +1,7 @@
-# SPDX-FileCopyrightText: Copyright (c) 2021-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 2021-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
-# This code was automatically generated with version 13.1.0. Do not modify it directly.
+# This code was automatically generated with version 13.1.0, generator version 49a8141. Do not modify it directly.
 from cuda.bindings.cynvrtc cimport *
 
 {{if 'nvrtcGetErrorString' in found_functions}}
diff --git a/cuda_bindings/cuda/bindings/_bindings/cynvrtc.pyx.in b/cuda_bindings/cuda/bindings/_bindings/cynvrtc.pyx.in
index 608aebd1af..2b88fde640 100644
--- a/cuda_bindings/cuda/bindings/_bindings/cynvrtc.pyx.in
+++ b/cuda_bindings/cuda/bindings/_bindings/cynvrtc.pyx.in
@@ -1,7 +1,7 @@
-# SPDX-FileCopyrightText: Copyright (c) 2021-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 2021-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
-# This code was automatically generated with version 13.1.0. Do not modify it directly.
+# This code was automatically generated with version 13.1.0, generator version 49a8141. Do not modify it directly.
 {{if 'Windows' == platform.system()}}
 import os
 cimport cuda.bindings._lib.windll as windll
diff --git a/cuda_bindings/cuda/bindings/_bindings/cyruntime.pxd.in b/cuda_bindings/cuda/bindings/_bindings/cyruntime.pxd.in
index 05451b0b42..8f0339be21 100644
--- a/cuda_bindings/cuda/bindings/_bindings/cyruntime.pxd.in
+++ b/cuda_bindings/cuda/bindings/_bindings/cyruntime.pxd.in
@@ -1,7 +1,7 @@
-# SPDX-FileCopyrightText: Copyright (c) 2021-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 2021-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
-# This code was automatically generated with version 13.1.0. Do not modify it directly.
+# This code was automatically generated with version 13.1.0, generator version 49a8141. Do not modify it directly.
 include "../cyruntime_types.pxi"
 
 include "../_lib/cyruntime/cyruntime.pxd"
diff --git a/cuda_bindings/cuda/bindings/_bindings/cyruntime.pyx.in b/cuda_bindings/cuda/bindings/_bindings/cyruntime.pyx.in
index 6a8d1ab783..cccd4fc661 100644
--- a/cuda_bindings/cuda/bindings/_bindings/cyruntime.pyx.in
+++ b/cuda_bindings/cuda/bindings/_bindings/cyruntime.pyx.in
@@ -1,7 +1,7 @@
-# SPDX-FileCopyrightText: Copyright (c) 2021-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 2021-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
-# This code was automatically generated with version 13.1.0. Do not modify it directly.
+# This code was automatically generated with version 13.1.0, generator version 49a8141. Do not modify it directly.
 include "../cyruntime_functions.pxi"
 
 import os
diff --git a/cuda_bindings/cuda/bindings/_bindings/cyruntime_ptds.pxd.in b/cuda_bindings/cuda/bindings/_bindings/cyruntime_ptds.pxd.in
index 804c7078ab..0af3f78b2b 100644
--- a/cuda_bindings/cuda/bindings/_bindings/cyruntime_ptds.pxd.in
+++ b/cuda_bindings/cuda/bindings/_bindings/cyruntime_ptds.pxd.in
@@ -1,7 +1,7 @@
-# SPDX-FileCopyrightText: Copyright (c) 2021-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 2021-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
-# This code was automatically generated with version 13.1.0. Do not modify it directly.
+# This code was automatically generated with version 13.1.0, generator version 49a8141. Do not modify it directly.
 cdef extern from "":
     """
     #define CUDA_API_PER_THREAD_DEFAULT_STREAM
diff --git a/cuda_bindings/cuda/bindings/_bindings/cyruntime_ptds.pyx.in b/cuda_bindings/cuda/bindings/_bindings/cyruntime_ptds.pyx.in
index f57fbbb126..bd0b42c0b3 100644
--- a/cuda_bindings/cuda/bindings/_bindings/cyruntime_ptds.pyx.in
+++ b/cuda_bindings/cuda/bindings/_bindings/cyruntime_ptds.pyx.in
@@ -1,7 +1,7 @@
-# SPDX-FileCopyrightText: Copyright (c) 2021-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 2021-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
-# This code was automatically generated with version 13.1.0. Do not modify it directly.
+# This code was automatically generated with version 13.1.0, generator version 49a8141. Do not modify it directly.
 cdef extern from "":
     """
     #define CUDA_API_PER_THREAD_DEFAULT_STREAM
diff --git a/cuda_bindings/cuda/bindings/_internal/_fast_enum.py b/cuda_bindings/cuda/bindings/_internal/_fast_enum.py
index 33e3b1e12f..0958b55b8f 100644
--- a/cuda_bindings/cuda/bindings/_internal/_fast_enum.py
+++ b/cuda_bindings/cuda/bindings/_internal/_fast_enum.py
@@ -2,7 +2,7 @@
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
 
-# This code was automatically generated across versions from 12.9.1 to 13.1.1. Do not modify it directly.
+# This code was automatically generated across versions from 12.9.1 to 13.1.1, generator version 0.3.1.dev1322+g646ce84ec. Do not modify it directly.
 
 
 """
diff --git a/cuda_bindings/cuda/bindings/_internal/cufile.pxd b/cuda_bindings/cuda/bindings/_internal/cufile.pxd
index a2e7d560ce..4b1a09a182 100644
--- a/cuda_bindings/cuda/bindings/_internal/cufile.pxd
+++ b/cuda_bindings/cuda/bindings/_internal/cufile.pxd
@@ -2,7 +2,7 @@
 #
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 #
-# This code was automatically generated across versions from 12.9.1 to 13.1.1. Do not modify it directly.
+# This code was automatically generated across versions from 12.9.1 to 13.1.1, generator version 0.3.1.dev1322+g646ce84ec. Do not modify it directly.
 
 from ..cycufile cimport *
 
diff --git a/cuda_bindings/cuda/bindings/_internal/cufile_linux.pyx b/cuda_bindings/cuda/bindings/_internal/cufile_linux.pyx
index eb38750f5e..cbb2c422ac 100644
--- a/cuda_bindings/cuda/bindings/_internal/cufile_linux.pyx
+++ b/cuda_bindings/cuda/bindings/_internal/cufile_linux.pyx
@@ -2,7 +2,7 @@
 #
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 #
-# This code was automatically generated across versions from 12.9.1 to 13.1.1. Do not modify it directly.
+# This code was automatically generated across versions from 12.9.1 to 13.1.1, generator version 0.3.1.dev1322+g646ce84ec. Do not modify it directly.
 
 from libc.stdint cimport intptr_t, uintptr_t
 import threading
diff --git a/cuda_bindings/cuda/bindings/_internal/nvjitlink.pxd b/cuda_bindings/cuda/bindings/_internal/nvjitlink.pxd
index 84abf408d4..6c9670edee 100644
--- a/cuda_bindings/cuda/bindings/_internal/nvjitlink.pxd
+++ b/cuda_bindings/cuda/bindings/_internal/nvjitlink.pxd
@@ -2,7 +2,7 @@
 #
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 #
-# This code was automatically generated across versions from 12.0.1 to 13.1.1. Do not modify it directly.
+# This code was automatically generated across versions from 12.0.1 to 13.1.1, generator version 0.3.1.dev1322+g646ce84ec. Do not modify it directly.
 
 from ..cynvjitlink cimport *
 
diff --git a/cuda_bindings/cuda/bindings/_internal/nvjitlink_linux.pyx b/cuda_bindings/cuda/bindings/_internal/nvjitlink_linux.pyx
index 057b52f5eb..378efda1c6 100644
--- a/cuda_bindings/cuda/bindings/_internal/nvjitlink_linux.pyx
+++ b/cuda_bindings/cuda/bindings/_internal/nvjitlink_linux.pyx
@@ -2,7 +2,7 @@
 #
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 #
-# This code was automatically generated across versions from 12.0.1 to 13.1.1. Do not modify it directly.
+# This code was automatically generated across versions from 12.0.1 to 13.1.1, generator version 0.3.1.dev1322+g646ce84ec. Do not modify it directly.
 
 from libc.stdint cimport intptr_t, uintptr_t
 
diff --git a/cuda_bindings/cuda/bindings/_internal/nvjitlink_windows.pyx b/cuda_bindings/cuda/bindings/_internal/nvjitlink_windows.pyx
index 8967e7fe1c..976b824852 100644
--- a/cuda_bindings/cuda/bindings/_internal/nvjitlink_windows.pyx
+++ b/cuda_bindings/cuda/bindings/_internal/nvjitlink_windows.pyx
@@ -2,7 +2,7 @@
 #
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 #
-# This code was automatically generated across versions from 12.0.1 to 13.1.1. Do not modify it directly.
+# This code was automatically generated across versions from 12.0.1 to 13.1.1, generator version 0.3.1.dev1322+g646ce84ec. Do not modify it directly.
 
 from libc.stdint cimport intptr_t
 
diff --git a/cuda_bindings/cuda/bindings/_internal/nvml.pxd b/cuda_bindings/cuda/bindings/_internal/nvml.pxd
index e62b29b2c8..d9ddec48fb 100644
--- a/cuda_bindings/cuda/bindings/_internal/nvml.pxd
+++ b/cuda_bindings/cuda/bindings/_internal/nvml.pxd
@@ -2,7 +2,7 @@
 #
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 #
-# This code was automatically generated across versions from 12.9.1 to 13.1.1. Do not modify it directly.
+# This code was automatically generated across versions from 12.9.1 to 13.1.1, generator version 0.3.1.dev1322+g646ce84ec. Do not modify it directly.
 
 from ..cynvml cimport *
 
diff --git a/cuda_bindings/cuda/bindings/_internal/nvml_linux.pyx b/cuda_bindings/cuda/bindings/_internal/nvml_linux.pyx
index 84e17a4313..54e5d51748 100644
--- a/cuda_bindings/cuda/bindings/_internal/nvml_linux.pyx
+++ b/cuda_bindings/cuda/bindings/_internal/nvml_linux.pyx
@@ -2,7 +2,7 @@
 #
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 #
-# This code was automatically generated across versions from 12.9.1 to 13.1.1. Do not modify it directly.
+# This code was automatically generated across versions from 12.9.1 to 13.1.1, generator version 0.3.1.dev1322+g646ce84ec. Do not modify it directly.
 
 from libc.stdint cimport intptr_t, uintptr_t
 
diff --git a/cuda_bindings/cuda/bindings/_internal/nvml_windows.pyx b/cuda_bindings/cuda/bindings/_internal/nvml_windows.pyx
index dd71b4ae0b..309b5a3039 100644
--- a/cuda_bindings/cuda/bindings/_internal/nvml_windows.pyx
+++ b/cuda_bindings/cuda/bindings/_internal/nvml_windows.pyx
@@ -2,7 +2,7 @@
 #
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 #
-# This code was automatically generated across versions from 12.9.1 to 13.1.1. Do not modify it directly.
+# This code was automatically generated across versions from 12.9.1 to 13.1.1, generator version 0.3.1.dev1322+g646ce84ec. Do not modify it directly.
 
 from libc.stdint cimport intptr_t
 
diff --git a/cuda_bindings/cuda/bindings/_internal/nvvm.pxd b/cuda_bindings/cuda/bindings/_internal/nvvm.pxd
index 00576dd8c9..c560367884 100644
--- a/cuda_bindings/cuda/bindings/_internal/nvvm.pxd
+++ b/cuda_bindings/cuda/bindings/_internal/nvvm.pxd
@@ -2,7 +2,7 @@
 #
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 #
-# This code was automatically generated across versions from 12.0.1 to 13.1.1. Do not modify it directly.
+# This code was automatically generated across versions from 12.0.1 to 13.1.1, generator version 0.3.1.dev1322+g646ce84ec. Do not modify it directly.
 
 from ..cynvvm cimport *
 
diff --git a/cuda_bindings/cuda/bindings/_internal/nvvm_linux.pyx b/cuda_bindings/cuda/bindings/_internal/nvvm_linux.pyx
index 2d03097235..f1d9febdb2 100644
--- a/cuda_bindings/cuda/bindings/_internal/nvvm_linux.pyx
+++ b/cuda_bindings/cuda/bindings/_internal/nvvm_linux.pyx
@@ -2,7 +2,7 @@
 #
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 #
-# This code was automatically generated across versions from 12.0.1 to 13.1.1. Do not modify it directly.
+# This code was automatically generated across versions from 12.0.1 to 13.1.1, generator version 0.3.1.dev1322+g646ce84ec. Do not modify it directly.
 
 from libc.stdint cimport intptr_t, uintptr_t
 
diff --git a/cuda_bindings/cuda/bindings/_internal/nvvm_windows.pyx b/cuda_bindings/cuda/bindings/_internal/nvvm_windows.pyx
index 6684402bfc..3dd11074b2 100644
--- a/cuda_bindings/cuda/bindings/_internal/nvvm_windows.pyx
+++ b/cuda_bindings/cuda/bindings/_internal/nvvm_windows.pyx
@@ -2,7 +2,7 @@
 #
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 #
-# This code was automatically generated across versions from 12.0.1 to 13.1.1. Do not modify it directly.
+# This code was automatically generated across versions from 12.0.1 to 13.1.1, generator version 0.3.1.dev1322+g646ce84ec. Do not modify it directly.
 
 from libc.stdint cimport intptr_t
 
diff --git a/cuda_bindings/cuda/bindings/cufile.pxd b/cuda_bindings/cuda/bindings/cufile.pxd
index b330aa42f2..033da9ec84 100644
--- a/cuda_bindings/cuda/bindings/cufile.pxd
+++ b/cuda_bindings/cuda/bindings/cufile.pxd
@@ -2,7 +2,7 @@
 #
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 #
-# This code was automatically generated across versions from 12.9.1 to 13.1.1. Do not modify it directly.
+# This code was automatically generated across versions from 12.9.1 to 13.1.1, generator version 0.3.1.dev1322+g646ce84ec. Do not modify it directly.
 
 from libc.stdint cimport intptr_t
 
diff --git a/cuda_bindings/cuda/bindings/cufile.pyx b/cuda_bindings/cuda/bindings/cufile.pyx
index 4ddc988fca..16c564e2a8 100644
--- a/cuda_bindings/cuda/bindings/cufile.pyx
+++ b/cuda_bindings/cuda/bindings/cufile.pyx
@@ -1,8 +1,8 @@
-# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 #
-# This code was automatically generated across versions from 12.9.1 to 13.1.1. Do not modify it directly.
+# This code was automatically generated across versions from 12.9.1 to 13.1.1, generator version 0.3.1.dev1322+g646ce84ec. Do not modify it directly.
 
 cimport cython  # NOQA
 from libc cimport errno
@@ -36,6 +36,33 @@ cdef __from_data(data, dtype_name, expected_dtype, lowpp_type):
     return lowpp_type.from_ptr(data.ctypes.data, not data.flags.writeable, data)
 
 
+cdef __from_buffer(buffer, size, lowpp_type):  # build a lowpp_type from a buffer-protocol object of exactly `size` bytes
+    cdef Py_buffer view
+    if cpython.PyObject_GetBuffer(buffer, &view, cpython.PyBUF_SIMPLE) != 0:
+        raise TypeError("buffer argument does not support the buffer protocol")
+    try:
+        if view.itemsize != 1:  # only byte-granular exporters are accepted
+            raise ValueError("buffer itemsize must be 1 byte")
+        if view.len != size:  # must match the wrapped struct size exactly
+            raise ValueError(f"buffer length must be {size} bytes")
+        return lowpp_type.from_ptr(<intptr_t><void *>view.buf, view.readonly, buffer)  # 2nd arg is the read-only flag (cf. __from_data passing "not writeable"); forward it un-negated
+    finally:
+        cpython.PyBuffer_Release(&view)  # view is released on every path; `buffer` itself was handed to from_ptr as its third argument
+
+
+cdef __getbuffer(object self, cpython.Py_buffer *buffer, void *ptr, int size, bint readonly):  # fill `buffer` to expose `size` bytes at `ptr` as a flat 1-D byte view exported by `self`
+    buffer.buf = <char *>ptr
+    buffer.format = 'b'  # struct-style format code: signed char
+    buffer.internal = NULL
+    buffer.itemsize = 1
+    buffer.len = size
+    buffer.ndim = 1
+    buffer.obj = self  # the exporting object recorded in the view
+    buffer.readonly = readonly
+    buffer.shape = &buffer.len  # 1-D shape reuses the view's own len field (itemsize is 1, so len is also the element count)
+    buffer.strides = &buffer.itemsize  # single stride of one byte, likewise stored inside the view
+    buffer.suboffsets = NULL  # NOTE(review): the consumer's request flags are not passed in, so they are not checked here - confirm callers do not need that
+
 ###############################################################################
 # POD
 ###############################################################################
@@ -97,6 +124,12 @@ cdef class _py_anon_pod1:
         other_ = other
         return (memcmp(<void *><intptr_t>(self._ptr), <void *><intptr_t>(other_._ptr), sizeof((<CUfileDescr_t*>NULL).handle)) == 0)
 
+    def __getbuffer__(self, Py_buffer *buffer, int flags):
+        __getbuffer(self, buffer, <void *>self._ptr, sizeof((<CUfileDescr_t*>NULL).handle), self._readonly)
+
+    def __releasebuffer__(self, Py_buffer *buffer):
+        pass
+
     def __setitem__(self, key, val):
         if key == 0 and isinstance(val, _numpy.ndarray):
             self._ptr = <_anon_pod1 *>malloc(sizeof((<CUfileDescr_t*>NULL).handle))
@@ -131,6 +164,11 @@ cdef class _py_anon_pod1:
             raise ValueError("This _py_anon_pod1 instance is read-only")
         self._ptr[0].handle = <void *><intptr_t>val
 
+    @staticmethod
+    def from_buffer(buffer):
+        """Create an _py_anon_pod1 instance with the memory from the given buffer."""
+        return __from_buffer(buffer, sizeof((<CUfileDescr_t*>NULL).handle), _py_anon_pod1)
+
     @staticmethod
     def from_data(data):
         """Create an _py_anon_pod1 instance wrapping the given NumPy array.
@@ -231,6 +269,12 @@ cdef class _py_anon_pod3:
         other_ = other
         return (memcmp(<void *><intptr_t>(self._ptr), <void *><intptr_t>(other_._ptr), sizeof((<CUfileIOParams_t*>NULL).u.batch)) == 0)
 
+    def __getbuffer__(self, Py_buffer *buffer, int flags):
+        __getbuffer(self, buffer, <void *>self._ptr, sizeof((<CUfileIOParams_t*>NULL).u.batch), self._readonly)
+
+    def __releasebuffer__(self, Py_buffer *buffer):
+        pass
+
     def __setitem__(self, key, val):
         if key == 0 and isinstance(val, _numpy.ndarray):
             self._ptr = <_anon_pod3 *>malloc(sizeof((<CUfileIOParams_t*>NULL).u.batch))
@@ -287,6 +331,11 @@ cdef class _py_anon_pod3:
             raise ValueError("This _py_anon_pod3 instance is read-only")
         self._ptr[0].size = val
 
+    @staticmethod
+    def from_buffer(buffer):
+        """Create an _py_anon_pod3 instance with the memory from the given buffer."""
+        return __from_buffer(buffer, sizeof((<CUfileIOParams_t*>NULL).u.batch), _py_anon_pod3)
+
     @staticmethod
     def from_data(data):
         """Create an _py_anon_pod3 instance wrapping the given NumPy array.
@@ -390,6 +439,12 @@ cdef class IOEvents:
             return False
         return bool((self_data == other._data).all())
 
+    def __getbuffer__(self, Py_buffer *buffer, int flags):
+        cpython.PyObject_GetBuffer(self._data, buffer, flags)
+
+    def __releasebuffer__(self, Py_buffer *buffer):
+        cpython.PyBuffer_Release(buffer)
+
     @property
     def cookie(self):
         """Union[~_numpy.intp, int]: """
@@ -442,6 +497,11 @@ cdef class IOEvents:
     def __setitem__(self, key, val):
         self._data[key] = val
 
+    @staticmethod
+    def from_buffer(buffer):
+        """Create an IOEvents instance with the memory from the given buffer."""
+        return IOEvents.from_data(_numpy.frombuffer(buffer, dtype=io_events_dtype))
+
     @staticmethod
     def from_data(data):
         """Create an IOEvents instance wrapping the given NumPy array.
@@ -543,6 +603,12 @@ cdef class OpCounter:
         other_ = other
         return (memcmp(<void *><intptr_t>(self._ptr), <void *><intptr_t>(other_._ptr), sizeof(CUfileOpCounter_t)) == 0)
 
+    def __getbuffer__(self, Py_buffer *buffer, int flags):
+        __getbuffer(self, buffer, <void *>self._ptr, sizeof(CUfileOpCounter_t), self._readonly)
+
+    def __releasebuffer__(self, Py_buffer *buffer):
+        pass
+
     def __setitem__(self, key, val):
         if key == 0 and isinstance(val, _numpy.ndarray):
             self._ptr = <CUfileOpCounter_t *>malloc(sizeof(CUfileOpCounter_t))
@@ -577,6 +643,11 @@ cdef class OpCounter:
             raise ValueError("This OpCounter instance is read-only")
         self._ptr[0].err = val
 
+    @staticmethod
+    def from_buffer(buffer):
+        """Create an OpCounter instance with the memory from the given buffer."""
+        return __from_buffer(buffer, sizeof(CUfileOpCounter_t), OpCounter)
+
     @staticmethod
     def from_data(data):
         """Create an OpCounter instance wrapping the given NumPy array.
@@ -707,6 +778,12 @@ cdef class PerGpuStats:
             return False
         return bool((self_data == other._data).all())
 
+    def __getbuffer__(self, Py_buffer *buffer, int flags):
+        cpython.PyObject_GetBuffer(self._data, buffer, flags)
+
+    def __releasebuffer__(self, Py_buffer *buffer):
+        cpython.PyBuffer_Release(buffer)
+
     @property
     def uuid(self):
         """~_numpy.int8: (array of length 16)."""
@@ -1054,6 +1131,11 @@ cdef class PerGpuStats:
     def __setitem__(self, key, val):
         self._data[key] = val
 
+    @staticmethod
+    def from_buffer(buffer):
+        """Create a PerGpuStats instance with the memory from the given buffer."""
+        return PerGpuStats.from_data(_numpy.frombuffer(buffer, dtype=per_gpu_stats_dtype))
+
     @staticmethod
     def from_data(data):
         """Create an PerGpuStats instance wrapping the given NumPy array.
@@ -1160,6 +1242,12 @@ cdef class Descr:
             return False
         return bool((self_data == other._data).all())
 
+    def __getbuffer__(self, Py_buffer *buffer, int flags):
+        cpython.PyObject_GetBuffer(self._data, buffer, flags)
+
+    def __releasebuffer__(self, Py_buffer *buffer):
+        cpython.PyBuffer_Release(buffer)
+
     @property
     def type(self):
         """Union[~_numpy.int32, int]: """
@@ -1210,6 +1298,11 @@ cdef class Descr:
     def __setitem__(self, key, val):
         self._data[key] = val
 
+    @staticmethod
+    def from_buffer(buffer):
+        """Create a Descr instance with the memory from the given buffer."""
+        return Descr.from_data(_numpy.frombuffer(buffer, dtype=descr_dtype))
+
     @staticmethod
     def from_data(data):
         """Create an Descr instance wrapping the given NumPy array.
@@ -1305,6 +1398,12 @@ cdef class _py_anon_pod2:
         other_ = other
         return (memcmp(<void *><intptr_t>(self._ptr), <void *><intptr_t>(other_._ptr), sizeof((<CUfileIOParams_t*>NULL).u)) == 0)
 
+    def __getbuffer__(self, Py_buffer *buffer, int flags):
+        __getbuffer(self, buffer, <void *>self._ptr, sizeof((<CUfileIOParams_t*>NULL).u), self._readonly)
+
+    def __releasebuffer__(self, Py_buffer *buffer):
+        pass
+
     def __setitem__(self, key, val):
         if key == 0 and isinstance(val, _numpy.ndarray):
             self._ptr = <_anon_pod2 *>malloc(sizeof((<CUfileIOParams_t*>NULL).u))
@@ -1329,6 +1428,11 @@ cdef class _py_anon_pod2:
         cdef _py_anon_pod3 val_ = val
         memcpy(<void *>&(self._ptr[0].batch), <void *>(val_._get_ptr()), sizeof(_anon_pod3) * 1)
 
+    @staticmethod
+    def from_buffer(buffer):
+        """Create an _py_anon_pod2 instance with the memory from the given buffer."""
+        return __from_buffer(buffer, sizeof((<CUfileIOParams_t*>NULL).u), _py_anon_pod2)
+
     @staticmethod
     def from_data(data):
         """Create an _py_anon_pod2 instance wrapping the given NumPy array.
@@ -1468,6 +1572,12 @@ cdef class StatsLevel1:
         other_ = other
         return (memcmp(<void *><intptr_t>(self._ptr), <void *><intptr_t>(other_._ptr), sizeof(CUfileStatsLevel1_t)) == 0)
 
+    def __getbuffer__(self, Py_buffer *buffer, int flags):
+        __getbuffer(self, buffer, <void *>self._ptr, sizeof(CUfileStatsLevel1_t), self._readonly)
+
+    def __releasebuffer__(self, Py_buffer *buffer):
+        pass
+
     def __setitem__(self, key, val):
         if key == 0 and isinstance(val, _numpy.ndarray):
             self._ptr = <CUfileStatsLevel1_t *>malloc(sizeof(CUfileStatsLevel1_t))
@@ -1974,6 +2084,11 @@ cdef class StatsLevel1:
             raise ValueError("This StatsLevel1 instance is read-only")
         self._ptr[0].last_batch_write_bytes = val
 
+    @staticmethod
+    def from_buffer(buffer):
+        """Create a StatsLevel1 instance with the memory from the given buffer."""
+        return __from_buffer(buffer, sizeof(CUfileStatsLevel1_t), StatsLevel1)
+
     @staticmethod
     def from_data(data):
         """Create an StatsLevel1 instance wrapping the given NumPy array.
@@ -2079,6 +2194,12 @@ cdef class IOParams:
             return False
         return bool((self_data == other._data).all())
 
+    def __getbuffer__(self, Py_buffer *buffer, int flags):
+        cpython.PyObject_GetBuffer(self._data, buffer, flags)
+
+    def __releasebuffer__(self, Py_buffer *buffer):
+        cpython.PyBuffer_Release(buffer)
+
     @property
     def mode(self):
         """Union[~_numpy.int32, int]: """
@@ -2151,6 +2272,11 @@ cdef class IOParams:
     def __setitem__(self, key, val):
         self._data[key] = val
 
+    @staticmethod
+    def from_buffer(buffer):
+        """Create an IOParams instance with the memory from the given buffer."""
+        return IOParams.from_data(_numpy.frombuffer(buffer, dtype=io_params_dtype))
+
     @staticmethod
     def from_data(data):
         """Create an IOParams instance wrapping the given NumPy array.
@@ -2253,6 +2379,12 @@ cdef class StatsLevel2:
         other_ = other
         return (memcmp(<void *><intptr_t>(self._ptr), <void *><intptr_t>(other_._ptr), sizeof(CUfileStatsLevel2_t)) == 0)
 
+    def __getbuffer__(self, Py_buffer *buffer, int flags):
+        __getbuffer(self, buffer, <void *>self._ptr, sizeof(CUfileStatsLevel2_t), self._readonly)
+
+    def __releasebuffer__(self, Py_buffer *buffer):
+        pass
+
     def __setitem__(self, key, val):
         if key == 0 and isinstance(val, _numpy.ndarray):
             self._ptr = <CUfileStatsLevel2_t *>malloc(sizeof(CUfileStatsLevel2_t))
@@ -2311,6 +2443,11 @@ cdef class StatsLevel2:
         arr[:] = _numpy.asarray(val, dtype=_numpy.uint64)
         memcpy(<void *>(&(self._ptr[0].write_size_kb_hist)), <void *>(arr.data), sizeof(uint64_t) * len(val))
 
+    @staticmethod
+    def from_buffer(buffer):
+        """Create a StatsLevel2 instance with the memory from the given buffer."""
+        return __from_buffer(buffer, sizeof(CUfileStatsLevel2_t), StatsLevel2)
+
     @staticmethod
     def from_data(data):
         """Create an StatsLevel2 instance wrapping the given NumPy array.
@@ -2410,6 +2547,12 @@ cdef class StatsLevel3:
         other_ = other
         return (memcmp(<void *><intptr_t>(self._ptr), <void *><intptr_t>(other_._ptr), sizeof(CUfileStatsLevel3_t)) == 0)
 
+    def __getbuffer__(self, Py_buffer *buffer, int flags):
+        __getbuffer(self, buffer, <void *>self._ptr, sizeof(CUfileStatsLevel3_t), self._readonly)
+
+    def __releasebuffer__(self, Py_buffer *buffer):
+        pass
+
     def __setitem__(self, key, val):
         if key == 0 and isinstance(val, _numpy.ndarray):
             self._ptr = <CUfileStatsLevel3_t *>malloc(sizeof(CUfileStatsLevel3_t))
@@ -2459,6 +2602,11 @@ cdef class StatsLevel3:
             raise ValueError("This StatsLevel3 instance is read-only")
         self._ptr[0].num_gpus = val
 
+    @staticmethod
+    def from_buffer(buffer):
+        """Create a StatsLevel3 instance with the memory from the given buffer."""
+        return __from_buffer(buffer, sizeof(CUfileStatsLevel3_t), StatsLevel3)
+
     @staticmethod
     def from_data(data):
         """Create an StatsLevel3 instance wrapping the given NumPy array.
diff --git a/cuda_bindings/cuda/bindings/cycufile.pxd b/cuda_bindings/cuda/bindings/cycufile.pxd
index 05358ec3ac..ce3f6bc94b 100644
--- a/cuda_bindings/cuda/bindings/cycufile.pxd
+++ b/cuda_bindings/cuda/bindings/cycufile.pxd
@@ -2,7 +2,7 @@
 #
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 #
-# This code was automatically generated across versions from 12.9.1 to 13.1.1. Do not modify it directly.
+# This code was automatically generated across versions from 12.9.1 to 13.1.1, generator version 0.3.1.dev1322+g646ce84ec. Do not modify it directly.
 
 from libc.stdint cimport uint32_t, uint64_t
 from libc.time cimport time_t
diff --git a/cuda_bindings/cuda/bindings/cycufile.pyx b/cuda_bindings/cuda/bindings/cycufile.pyx
index 6bb831666f..32d7ae07b8 100644
--- a/cuda_bindings/cuda/bindings/cycufile.pyx
+++ b/cuda_bindings/cuda/bindings/cycufile.pyx
@@ -2,7 +2,7 @@
 #
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 #
-# This code was automatically generated across versions from 12.9.1 to 13.1.1. Do not modify it directly.
+# This code was automatically generated across versions from 12.9.1 to 13.1.1, generator version 0.3.1.dev1322+g646ce84ec. Do not modify it directly.
 
 from ._internal cimport cufile as _cufile
 
diff --git a/cuda_bindings/cuda/bindings/cydriver.pxd.in b/cuda_bindings/cuda/bindings/cydriver.pxd.in
index 23681ca6e7..ccafc102f6 100644
--- a/cuda_bindings/cuda/bindings/cydriver.pxd.in
+++ b/cuda_bindings/cuda/bindings/cydriver.pxd.in
@@ -1,7 +1,7 @@
-# SPDX-FileCopyrightText: Copyright (c) 2021-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 2021-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
-# This code was automatically generated with version 13.1.0. Do not modify it directly.
+# This code was automatically generated with version 13.1.0, generator version 49a8141. Do not modify it directly.
 
 from libc.stdint cimport uint32_t, uint64_t
 
diff --git a/cuda_bindings/cuda/bindings/cydriver.pyx.in b/cuda_bindings/cuda/bindings/cydriver.pyx.in
index 301563b490..d54c5140a6 100644
--- a/cuda_bindings/cuda/bindings/cydriver.pyx.in
+++ b/cuda_bindings/cuda/bindings/cydriver.pyx.in
@@ -1,7 +1,7 @@
-# SPDX-FileCopyrightText: Copyright (c) 2021-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 2021-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
-# This code was automatically generated with version 13.1.0. Do not modify it directly.
+# This code was automatically generated with version 13.1.0, generator version 49a8141. Do not modify it directly.
 cimport cuda.bindings._bindings.cydriver as cydriver
 
 {{if 'cuGetErrorString' in found_functions}}
diff --git a/cuda_bindings/cuda/bindings/cynvjitlink.pxd b/cuda_bindings/cuda/bindings/cynvjitlink.pxd
index eff351fef1..d9ad2ec49a 100644
--- a/cuda_bindings/cuda/bindings/cynvjitlink.pxd
+++ b/cuda_bindings/cuda/bindings/cynvjitlink.pxd
@@ -2,7 +2,7 @@
 #
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 #
-# This code was automatically generated across versions from 12.0.1 to 13.1.1. Do not modify it directly.
+# This code was automatically generated across versions from 12.0.1 to 13.1.1, generator version 0.3.1.dev1322+g646ce84ec. Do not modify it directly.
 
 from libc.stdint cimport intptr_t, uint32_t
 
diff --git a/cuda_bindings/cuda/bindings/cynvjitlink.pyx b/cuda_bindings/cuda/bindings/cynvjitlink.pyx
index e5e6b13210..669c6a3937 100644
--- a/cuda_bindings/cuda/bindings/cynvjitlink.pyx
+++ b/cuda_bindings/cuda/bindings/cynvjitlink.pyx
@@ -2,7 +2,7 @@
 #
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 #
-# This code was automatically generated across versions from 12.0.1 to 13.1.1. Do not modify it directly.
+# This code was automatically generated across versions from 12.0.1 to 13.1.1, generator version 0.3.1.dev1322+g646ce84ec. Do not modify it directly.
 
 from ._internal cimport nvjitlink as _nvjitlink
 
diff --git a/cuda_bindings/cuda/bindings/cynvml.pxd b/cuda_bindings/cuda/bindings/cynvml.pxd
index 4765ef39b1..d95297e6cc 100644
--- a/cuda_bindings/cuda/bindings/cynvml.pxd
+++ b/cuda_bindings/cuda/bindings/cynvml.pxd
@@ -2,7 +2,7 @@
 #
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 #
-# This code was automatically generated across versions from 12.9.1 to 13.1.1. Do not modify it directly.
+# This code was automatically generated across versions from 12.9.1 to 13.1.1, generator version 0.3.1.dev1322+g646ce84ec. Do not modify it directly.
 
 from libc.stdint cimport int64_t
 
diff --git a/cuda_bindings/cuda/bindings/cynvml.pyx b/cuda_bindings/cuda/bindings/cynvml.pyx
index b395a6ac53..200cf74e7d 100644
--- a/cuda_bindings/cuda/bindings/cynvml.pyx
+++ b/cuda_bindings/cuda/bindings/cynvml.pyx
@@ -2,7 +2,7 @@
 #
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 #
-# This code was automatically generated across versions from 12.9.1 to 13.1.1. Do not modify it directly.
+# This code was automatically generated across versions from 12.9.1 to 13.1.1, generator version 0.3.1.dev1322+g646ce84ec. Do not modify it directly.
 
 from ._internal cimport nvml as _nvml
 
diff --git a/cuda_bindings/cuda/bindings/cynvrtc.pxd.in b/cuda_bindings/cuda/bindings/cynvrtc.pxd.in
index af5acab52d..a03d3a80f6 100644
--- a/cuda_bindings/cuda/bindings/cynvrtc.pxd.in
+++ b/cuda_bindings/cuda/bindings/cynvrtc.pxd.in
@@ -1,7 +1,7 @@
-# SPDX-FileCopyrightText: Copyright (c) 2021-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 2021-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
-# This code was automatically generated with version 13.1.0. Do not modify it directly.
+# This code was automatically generated with version 13.1.0, generator version 49a8141. Do not modify it directly.
 
 from libc.stdint cimport uint32_t, uint64_t
 
diff --git a/cuda_bindings/cuda/bindings/cynvrtc.pyx.in b/cuda_bindings/cuda/bindings/cynvrtc.pyx.in
index 423efcf54c..9781cfde24 100644
--- a/cuda_bindings/cuda/bindings/cynvrtc.pyx.in
+++ b/cuda_bindings/cuda/bindings/cynvrtc.pyx.in
@@ -1,7 +1,7 @@
-# SPDX-FileCopyrightText: Copyright (c) 2021-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 2021-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
-# This code was automatically generated with version 13.1.0. Do not modify it directly.
+# This code was automatically generated with version 13.1.0, generator version 49a8141. Do not modify it directly.
 cimport cuda.bindings._bindings.cynvrtc as cynvrtc
 
 {{if 'nvrtcGetErrorString' in found_functions}}
diff --git a/cuda_bindings/cuda/bindings/cynvvm.pxd b/cuda_bindings/cuda/bindings/cynvvm.pxd
index 5960917803..9548196a9e 100644
--- a/cuda_bindings/cuda/bindings/cynvvm.pxd
+++ b/cuda_bindings/cuda/bindings/cynvvm.pxd
@@ -2,7 +2,7 @@
 #
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 #
-# This code was automatically generated across versions from 12.0.1 to 13.1.1. Do not modify it directly.
+# This code was automatically generated across versions from 12.0.1 to 13.1.1, generator version 0.3.1.dev1322+g646ce84ec. Do not modify it directly.
 
 
 ###############################################################################
diff --git a/cuda_bindings/cuda/bindings/cynvvm.pyx b/cuda_bindings/cuda/bindings/cynvvm.pyx
index 7fe09d572a..24e1899004 100644
--- a/cuda_bindings/cuda/bindings/cynvvm.pyx
+++ b/cuda_bindings/cuda/bindings/cynvvm.pyx
@@ -2,7 +2,7 @@
 #
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 #
-# This code was automatically generated across versions from 12.0.1 to 13.1.1. Do not modify it directly.
+# This code was automatically generated across versions from 12.0.1 to 13.1.1, generator version 0.3.1.dev1322+g646ce84ec. Do not modify it directly.
 
 from ._internal cimport nvvm as _nvvm
 
diff --git a/cuda_bindings/cuda/bindings/cyruntime.pxd.in b/cuda_bindings/cuda/bindings/cyruntime.pxd.in
index 0b4344ab02..2b2cc4aae8 100644
--- a/cuda_bindings/cuda/bindings/cyruntime.pxd.in
+++ b/cuda_bindings/cuda/bindings/cyruntime.pxd.in
@@ -1,7 +1,7 @@
-# SPDX-FileCopyrightText: Copyright (c) 2021-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 2021-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
-# This code was automatically generated with version 13.1.0. Do not modify it directly.
+# This code was automatically generated with version 13.1.0, generator version 49a8141. Do not modify it directly.
 
 from libc.stdint cimport uint32_t, uint64_t
 
diff --git a/cuda_bindings/cuda/bindings/cyruntime.pyx.in b/cuda_bindings/cuda/bindings/cyruntime.pyx.in
index 248346d274..44b1cb86a2 100644
--- a/cuda_bindings/cuda/bindings/cyruntime.pyx.in
+++ b/cuda_bindings/cuda/bindings/cyruntime.pyx.in
@@ -1,7 +1,7 @@
-# SPDX-FileCopyrightText: Copyright (c) 2021-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 2021-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
-# This code was automatically generated with version 13.1.0. Do not modify it directly.
+# This code was automatically generated with version 13.1.0, generator version 49a8141. Do not modify it directly.
 cimport cuda.bindings._bindings.cyruntime as cyruntime
 cimport cython
 
diff --git a/cuda_bindings/cuda/bindings/cyruntime_functions.pxi.in b/cuda_bindings/cuda/bindings/cyruntime_functions.pxi.in
index 3ca4474fc5..3be1573eab 100644
--- a/cuda_bindings/cuda/bindings/cyruntime_functions.pxi.in
+++ b/cuda_bindings/cuda/bindings/cyruntime_functions.pxi.in
@@ -1,7 +1,7 @@
-# SPDX-FileCopyrightText: Copyright (c) 2021-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 2021-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
-# This code was automatically generated with version 13.1.0. Do not modify it directly.
+# This code was automatically generated with version 13.1.0, generator version 49a8141. Do not modify it directly.
 cdef extern from "cuda_runtime_api.h":
 
     {{if 'cudaDeviceReset' in found_functions}}
diff --git a/cuda_bindings/cuda/bindings/cyruntime_types.pxi.in b/cuda_bindings/cuda/bindings/cyruntime_types.pxi.in
index 3af28f67e7..c3166d195f 100644
--- a/cuda_bindings/cuda/bindings/cyruntime_types.pxi.in
+++ b/cuda_bindings/cuda/bindings/cyruntime_types.pxi.in
@@ -1,7 +1,7 @@
-# SPDX-FileCopyrightText: Copyright (c) 2021-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 2021-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
-# This code was automatically generated with version 13.1.0. Do not modify it directly.
+# This code was automatically generated with version 13.1.0, generator version 49a8141. Do not modify it directly.
 
 cdef extern from "vector_types.h":
 
diff --git a/cuda_bindings/cuda/bindings/driver.pxd.in b/cuda_bindings/cuda/bindings/driver.pxd.in
index ed992b8bd0..43d70e92f5 100644
--- a/cuda_bindings/cuda/bindings/driver.pxd.in
+++ b/cuda_bindings/cuda/bindings/driver.pxd.in
@@ -1,7 +1,7 @@
 # SPDX-FileCopyrightText: Copyright (c) 2021-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
-# This code was automatically generated with version 13.1.0. Do not modify it directly.
+# This code was automatically generated with version 13.1.0, generator version 49a8141. Do not modify it directly.
 cimport cuda.bindings.cydriver as cydriver
 
 include "_lib/utils.pxd"
diff --git a/cuda_bindings/cuda/bindings/driver.pyx.in b/cuda_bindings/cuda/bindings/driver.pyx.in
index 60f510dde2..fbff464c87 100644
--- a/cuda_bindings/cuda/bindings/driver.pyx.in
+++ b/cuda_bindings/cuda/bindings/driver.pyx.in
@@ -1,7 +1,7 @@
 # SPDX-FileCopyrightText: Copyright (c) 2021-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
-# This code was automatically generated with version 13.1.0, generator version c185cc3. Do not modify it directly.
+# This code was automatically generated with version 13.1.0, generator version 49a8141. Do not modify it directly.
 from typing import Any, Optional
 import cython
 import ctypes
diff --git a/cuda_bindings/cuda/bindings/nvjitlink.pxd b/cuda_bindings/cuda/bindings/nvjitlink.pxd
index 0080a46415..5155c0fbb1 100644
--- a/cuda_bindings/cuda/bindings/nvjitlink.pxd
+++ b/cuda_bindings/cuda/bindings/nvjitlink.pxd
@@ -2,7 +2,7 @@
 #
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 #
-# This code was automatically generated across versions from 12.0.1 to 13.1.1. Do not modify it directly.
+# This code was automatically generated across versions from 12.0.1 to 13.1.1, generator version 0.3.1.dev1322+g646ce84ec. Do not modify it directly.
 
 from libc.stdint cimport intptr_t, uint32_t
 
diff --git a/cuda_bindings/cuda/bindings/nvjitlink.pyx b/cuda_bindings/cuda/bindings/nvjitlink.pyx
index 874ee55ce7..f50c76307b 100644
--- a/cuda_bindings/cuda/bindings/nvjitlink.pyx
+++ b/cuda_bindings/cuda/bindings/nvjitlink.pyx
@@ -1,8 +1,8 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 #
-# This code was automatically generated across versions from 12.0.1 to 13.1.1. Do not modify it directly.
+# This code was automatically generated across versions from 12.0.1 to 13.1.1, generator version 0.3.1.dev1322+g646ce84ec. Do not modify it directly.
 
 cimport cython  # NOQA
 
diff --git a/cuda_bindings/cuda/bindings/nvml.pxd b/cuda_bindings/cuda/bindings/nvml.pxd
index 7b37a14122..a7644091e2 100644
--- a/cuda_bindings/cuda/bindings/nvml.pxd
+++ b/cuda_bindings/cuda/bindings/nvml.pxd
@@ -2,7 +2,7 @@
 #
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 #
-# This code was automatically generated across versions from 12.9.1 to 13.1.1, generator version 0.3.1.dev1283+gc7bc6fa75. Do not modify it directly.
+# This code was automatically generated across versions from 12.9.1 to 13.1.1, generator version 0.3.1.dev1322+g646ce84ec. Do not modify it directly.
 
 from libc.stdint cimport intptr_t
 
diff --git a/cuda_bindings/cuda/bindings/nvml.pyx b/cuda_bindings/cuda/bindings/nvml.pyx
index 990e098cec..f25485ad69 100644
--- a/cuda_bindings/cuda/bindings/nvml.pyx
+++ b/cuda_bindings/cuda/bindings/nvml.pyx
@@ -2,7 +2,7 @@
 #
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 #
-# This code was automatically generated across versions from 12.9.1 to 13.1.1, generator version 0.3.1.dev1283+gc7bc6fa75. Do not modify it directly.
+# This code was automatically generated across versions from 12.9.1 to 13.1.1, generator version 0.3.1.dev1322+g646ce84ec. Do not modify it directly.
 
 cimport cython  # NOQA
 
@@ -36,6 +36,33 @@ cdef __from_data(data, dtype_name, expected_dtype, lowpp_type):
     return lowpp_type.from_ptr(data.ctypes.data, not data.flags.writeable, data)
 
 
+cdef __from_buffer(buffer, size, lowpp_type):  # build a `lowpp_type` over the `size` bytes exposed by `buffer` (any buffer-protocol object)
+    cdef Py_buffer view
+    if cpython.PyObject_GetBuffer(buffer, &view, cpython.PyBUF_SIMPLE) != 0:
+        raise TypeError("buffer argument does not support the buffer protocol")
+    try:
+        if view.itemsize != 1:
+            raise ValueError("buffer itemsize must be 1 byte")
+        if view.len != size:  # the buffer has to cover the target struct exactly
+            raise ValueError(f"buffer length must be {size} bytes")
+        return lowpp_type.from_ptr(<intptr_t><void *>view.buf, view.readonly, buffer)  # forward the read-only flag as-is, matching __from_data's `not writeable`
+    finally:
+        cpython.PyBuffer_Release(&view)  # NOTE(review): the raw pointer outlives this export; confirm from_ptr keeps `buffer` alive for it
+
+
+cdef __getbuffer(object self, cpython.Py_buffer *buffer, void *ptr, int size, bint readonly):  # fill `buffer` so it exports the `size` bytes at `ptr` as a 1-D byte view
+    buffer.buf = <char *>ptr
+    buffer.format = 'b'  # one signed byte per item
+    buffer.internal = NULL
+    buffer.itemsize = 1
+    buffer.len = size
+    buffer.ndim = 1
+    buffer.obj = self  # the view records `self` as its exporter
+    buffer.readonly = readonly  # NOTE(review): request flags never reach this helper, so a PyBUF_WRITABLE request is not refused when `readonly` is set
+    buffer.shape = &buffer.len  # itemsize is 1, so the single shape entry equals the byte length
+    buffer.strides = &buffer.itemsize  # one-byte stride, also aliased to a field of the same struct
+    buffer.suboffsets = NULL
+
 
 cdef inline unsigned int NVML_VERSION_STRUCT(const unsigned int size, const unsigned int ver) nogil:
     return (size | (ver << 24))
@@ -2074,6 +2101,12 @@ cdef class PciInfoExt_v1:
         other_ = other
         return (memcmp(<void *><intptr_t>(self._ptr), <void *><intptr_t>(other_._ptr), sizeof(nvmlPciInfoExt_v1_t)) == 0)
 
+    def __getbuffer__(self, Py_buffer *buffer, int flags):  # export the struct at self._ptr as a flat byte buffer; `flags` is not consulted
+        __getbuffer(self, buffer, <void *>self._ptr, sizeof(nvmlPciInfoExt_v1_t), self._readonly)
+
+    def __releasebuffer__(self, Py_buffer *buffer):
+        pass  # nothing to release: the export only points at self._ptr
+
     def __setitem__(self, key, val):
         if key == 0 and isinstance(val, _numpy.ndarray):
             self._ptr = <nvmlPciInfoExt_v1_t *>malloc(sizeof(nvmlPciInfoExt_v1_t))
@@ -2189,6 +2222,11 @@ cdef class PciInfoExt_v1:
         cdef char *ptr = buf
         memcpy(<void *>(self._ptr[0].busId), <void *>ptr, 32)
 
+    @staticmethod
+    def from_buffer(buffer):
+        """Create a PciInfoExt_v1 instance with the memory from the given buffer, which must be exactly ``sizeof(nvmlPciInfoExt_v1_t)`` bytes long."""
+        return __from_buffer(buffer, sizeof(nvmlPciInfoExt_v1_t), PciInfoExt_v1)
+
     @staticmethod
     def from_data(data):
         """Create an PciInfoExt_v1 instance wrapping the given NumPy array.
@@ -2292,6 +2330,12 @@ cdef class PciInfo:
         other_ = other
         return (memcmp(<void *><intptr_t>(self._ptr), <void *><intptr_t>(other_._ptr), sizeof(nvmlPciInfo_t)) == 0)
 
+    def __getbuffer__(self, Py_buffer *buffer, int flags):  # export the struct at self._ptr as a flat byte buffer; `flags` is not consulted
+        __getbuffer(self, buffer, <void *>self._ptr, sizeof(nvmlPciInfo_t), self._readonly)
+
+    def __releasebuffer__(self, Py_buffer *buffer):
+        pass  # nothing to release: the export only points at self._ptr
+
     def __setitem__(self, key, val):
         if key == 0 and isinstance(val, _numpy.ndarray):
             self._ptr = <nvmlPciInfo_t *>malloc(sizeof(nvmlPciInfo_t))
@@ -2389,6 +2433,11 @@ cdef class PciInfo:
         cdef char *ptr = buf
         memcpy(<void *>(self._ptr[0].busId), <void *>ptr, 32)
 
+    @staticmethod
+    def from_buffer(buffer):
+        """Create a PciInfo instance with the memory from the given buffer, which must be exactly ``sizeof(nvmlPciInfo_t)`` bytes long."""
+        return __from_buffer(buffer, sizeof(nvmlPciInfo_t), PciInfo)
+
     @staticmethod
     def from_data(data):
         """Create an PciInfo instance wrapping the given NumPy array.
@@ -2487,6 +2536,12 @@ cdef class Utilization:
         other_ = other
         return (memcmp(<void *><intptr_t>(self._ptr), <void *><intptr_t>(other_._ptr), sizeof(nvmlUtilization_t)) == 0)
 
+    def __getbuffer__(self, Py_buffer *buffer, int flags):  # export the struct at self._ptr as a flat byte buffer; `flags` is not consulted
+        __getbuffer(self, buffer, <void *>self._ptr, sizeof(nvmlUtilization_t), self._readonly)
+
+    def __releasebuffer__(self, Py_buffer *buffer):
+        pass  # nothing to release: the export only points at self._ptr
+
     def __setitem__(self, key, val):
         if key == 0 and isinstance(val, _numpy.ndarray):
             self._ptr = <nvmlUtilization_t *>malloc(sizeof(nvmlUtilization_t))
@@ -2521,6 +2576,11 @@ cdef class Utilization:
             raise ValueError("This Utilization instance is read-only")
         self._ptr[0].memory = val
 
+    @staticmethod
+    def from_buffer(buffer):
+        """Create a Utilization instance with the memory from the given buffer, which must be exactly ``sizeof(nvmlUtilization_t)`` bytes long."""
+        return __from_buffer(buffer, sizeof(nvmlUtilization_t), Utilization)
+
     @staticmethod
     def from_data(data):
         """Create an Utilization instance wrapping the given NumPy array.
@@ -2620,6 +2680,12 @@ cdef class Memory:
         other_ = other
         return (memcmp(<void *><intptr_t>(self._ptr), <void *><intptr_t>(other_._ptr), sizeof(nvmlMemory_t)) == 0)
 
+    def __getbuffer__(self, Py_buffer *buffer, int flags):  # export the struct at self._ptr as a flat byte buffer; `flags` is not consulted
+        __getbuffer(self, buffer, <void *>self._ptr, sizeof(nvmlMemory_t), self._readonly)
+
+    def __releasebuffer__(self, Py_buffer *buffer):
+        pass  # nothing to release: the export only points at self._ptr
+
     def __setitem__(self, key, val):
         if key == 0 and isinstance(val, _numpy.ndarray):
             self._ptr = <nvmlMemory_t *>malloc(sizeof(nvmlMemory_t))
@@ -2665,6 +2731,11 @@ cdef class Memory:
             raise ValueError("This Memory instance is read-only")
         self._ptr[0].used = val
 
+    @staticmethod
+    def from_buffer(buffer):
+        """Create a Memory instance with the memory from the given buffer, which must be exactly ``sizeof(nvmlMemory_t)`` bytes long."""
+        return __from_buffer(buffer, sizeof(nvmlMemory_t), Memory)
+
     @staticmethod
     def from_data(data):
         """Create an Memory instance wrapping the given NumPy array.
@@ -2766,6 +2837,12 @@ cdef class Memory_v2:
         other_ = other
         return (memcmp(<void *><intptr_t>(self._ptr), <void *><intptr_t>(other_._ptr), sizeof(nvmlMemory_v2_t)) == 0)
 
+    def __getbuffer__(self, Py_buffer *buffer, int flags):  # export the struct at self._ptr as a flat byte buffer; `flags` is not consulted
+        __getbuffer(self, buffer, <void *>self._ptr, sizeof(nvmlMemory_v2_t), self._readonly)
+
+    def __releasebuffer__(self, Py_buffer *buffer):
+        pass  # nothing to release: the export only points at self._ptr
+
     def __setitem__(self, key, val):
         if key == 0 and isinstance(val, _numpy.ndarray):
             self._ptr = <nvmlMemory_v2_t *>malloc(sizeof(nvmlMemory_v2_t))
@@ -2833,6 +2910,11 @@ cdef class Memory_v2:
             raise ValueError("This Memory_v2 instance is read-only")
         self._ptr[0].used = val
 
+    @staticmethod
+    def from_buffer(buffer):
+        """Create a Memory_v2 instance with the memory from the given buffer, which must be exactly ``sizeof(nvmlMemory_v2_t)`` bytes long."""
+        return __from_buffer(buffer, sizeof(nvmlMemory_v2_t), Memory_v2)
+
     @staticmethod
     def from_data(data):
         """Create an Memory_v2 instance wrapping the given NumPy array.
@@ -2932,6 +3014,12 @@ cdef class BAR1Memory:
         other_ = other
         return (memcmp(<void *><intptr_t>(self._ptr), <void *><intptr_t>(other_._ptr), sizeof(nvmlBAR1Memory_t)) == 0)
 
+    def __getbuffer__(self, Py_buffer *buffer, int flags):  # export the struct at self._ptr as a flat byte buffer; `flags` is not consulted
+        __getbuffer(self, buffer, <void *>self._ptr, sizeof(nvmlBAR1Memory_t), self._readonly)
+
+    def __releasebuffer__(self, Py_buffer *buffer):
+        pass  # nothing to release: the export only points at self._ptr
+
     def __setitem__(self, key, val):
         if key == 0 and isinstance(val, _numpy.ndarray):
             self._ptr = <nvmlBAR1Memory_t *>malloc(sizeof(nvmlBAR1Memory_t))
@@ -2977,6 +3065,11 @@ cdef class BAR1Memory:
             raise ValueError("This BAR1Memory instance is read-only")
         self._ptr[0].bar1Used = val
 
+    @staticmethod
+    def from_buffer(buffer):
+        """Create a BAR1Memory instance with the memory from the given buffer, which must be exactly ``sizeof(nvmlBAR1Memory_t)`` bytes long."""
+        return __from_buffer(buffer, sizeof(nvmlBAR1Memory_t), BAR1Memory)
+
     @staticmethod
     def from_data(data):
         """Create an BAR1Memory instance wrapping the given NumPy array.
@@ -3081,6 +3174,12 @@ cdef class ProcessInfo:
             return False
         return bool((self_data == other._data).all())
 
+    def __getbuffer__(self, Py_buffer *buffer, int flags):  # delegate the export to self._data, forwarding the consumer's flags
+        cpython.PyObject_GetBuffer(self._data, buffer, flags)
+
+    def __releasebuffer__(self, Py_buffer *buffer):
+        cpython.PyBuffer_Release(buffer)  # NOTE(review): the view was filled by self._data's exporter; confirm this hook is reached and cannot re-enter
+
     @property
     def pid(self):
         """Union[~_numpy.uint32, int]: """
@@ -3144,6 +3243,11 @@ cdef class ProcessInfo:
     def __setitem__(self, key, val):
         self._data[key] = val
 
+    @staticmethod
+    def from_buffer(buffer):
+        """Create a ProcessInfo instance from the given buffer, interpreted via ``_numpy.frombuffer`` with dtype ``process_info_dtype``."""
+        return ProcessInfo.from_data(_numpy.frombuffer(buffer, dtype=process_info_dtype))
+
     @staticmethod
     def from_data(data):
         """Create an ProcessInfo instance wrapping the given NumPy array.
@@ -3252,6 +3356,12 @@ cdef class ProcessDetail_v1:
             return False
         return bool((self_data == other._data).all())
 
+    def __getbuffer__(self, Py_buffer *buffer, int flags):  # delegate the export to self._data, forwarding the consumer's flags
+        cpython.PyObject_GetBuffer(self._data, buffer, flags)
+
+    def __releasebuffer__(self, Py_buffer *buffer):
+        cpython.PyBuffer_Release(buffer)  # NOTE(review): the view was filled by self._data's exporter; confirm this hook is reached and cannot re-enter
+
     @property
     def pid(self):
         """Union[~_numpy.uint32, int]: Process ID."""
@@ -3326,6 +3436,11 @@ cdef class ProcessDetail_v1:
     def __setitem__(self, key, val):
         self._data[key] = val
 
+    @staticmethod
+    def from_buffer(buffer):
+        """Create a ProcessDetail_v1 instance from the given buffer, interpreted via ``_numpy.frombuffer`` with dtype ``process_detail_v1_dtype``."""
+        return ProcessDetail_v1.from_data(_numpy.frombuffer(buffer, dtype=process_detail_v1_dtype))
+
     @staticmethod
     def from_data(data):
         """Create an ProcessDetail_v1 instance wrapping the given NumPy array.
@@ -3434,6 +3549,12 @@ cdef class DeviceAttributes:
         other_ = other
         return (memcmp(<void *><intptr_t>(self._ptr), <void *><intptr_t>(other_._ptr), sizeof(nvmlDeviceAttributes_t)) == 0)
 
+    def __getbuffer__(self, Py_buffer *buffer, int flags):  # export the struct at self._ptr as a flat byte buffer; `flags` is not consulted
+        __getbuffer(self, buffer, <void *>self._ptr, sizeof(nvmlDeviceAttributes_t), self._readonly)
+
+    def __releasebuffer__(self, Py_buffer *buffer):
+        pass  # nothing to release: the export only points at self._ptr
+
     def __setitem__(self, key, val):
         if key == 0 and isinstance(val, _numpy.ndarray):
             self._ptr = <nvmlDeviceAttributes_t *>malloc(sizeof(nvmlDeviceAttributes_t))
@@ -3545,6 +3666,11 @@ cdef class DeviceAttributes:
             raise ValueError("This DeviceAttributes instance is read-only")
         self._ptr[0].memorySizeMB = val
 
+    @staticmethod
+    def from_buffer(buffer):
+        """Create a DeviceAttributes instance with the memory from the given buffer, which must be exactly ``sizeof(nvmlDeviceAttributes_t)`` bytes long."""
+        return __from_buffer(buffer, sizeof(nvmlDeviceAttributes_t), DeviceAttributes)
+
     @staticmethod
     def from_data(data):
         """Create an DeviceAttributes instance wrapping the given NumPy array.
@@ -3642,6 +3768,12 @@ cdef class C2cModeInfo_v1:
         other_ = other
         return (memcmp(<void *><intptr_t>(self._ptr), <void *><intptr_t>(other_._ptr), sizeof(nvmlC2cModeInfo_v1_t)) == 0)
 
+    def __getbuffer__(self, Py_buffer *buffer, int flags):  # export the struct at self._ptr as a flat byte buffer; `flags` is not consulted
+        __getbuffer(self, buffer, <void *>self._ptr, sizeof(nvmlC2cModeInfo_v1_t), self._readonly)
+
+    def __releasebuffer__(self, Py_buffer *buffer):
+        pass  # nothing to release: the export only points at self._ptr
+
     def __setitem__(self, key, val):
         if key == 0 and isinstance(val, _numpy.ndarray):
             self._ptr = <nvmlC2cModeInfo_v1_t *>malloc(sizeof(nvmlC2cModeInfo_v1_t))
@@ -3665,6 +3797,11 @@ cdef class C2cModeInfo_v1:
             raise ValueError("This C2cModeInfo_v1 instance is read-only")
         self._ptr[0].isC2cEnabled = val
 
+    @staticmethod
+    def from_buffer(buffer):
+        """Create a C2cModeInfo_v1 instance with the memory from the given buffer, which must be exactly ``sizeof(nvmlC2cModeInfo_v1_t)`` bytes long."""
+        return __from_buffer(buffer, sizeof(nvmlC2cModeInfo_v1_t), C2cModeInfo_v1)
+
     @staticmethod
     def from_data(data):
         """Create an C2cModeInfo_v1 instance wrapping the given NumPy array.
@@ -3766,6 +3903,12 @@ cdef class RowRemapperHistogramValues:
         other_ = other
         return (memcmp(<void *><intptr_t>(self._ptr), <void *><intptr_t>(other_._ptr), sizeof(nvmlRowRemapperHistogramValues_t)) == 0)
 
+    def __getbuffer__(self, Py_buffer *buffer, int flags):  # export the struct at self._ptr as a flat byte buffer; `flags` is not consulted
+        __getbuffer(self, buffer, <void *>self._ptr, sizeof(nvmlRowRemapperHistogramValues_t), self._readonly)
+
+    def __releasebuffer__(self, Py_buffer *buffer):
+        pass  # nothing to release: the export only points at self._ptr
+
     def __setitem__(self, key, val):
         if key == 0 and isinstance(val, _numpy.ndarray):
             self._ptr = <nvmlRowRemapperHistogramValues_t *>malloc(sizeof(nvmlRowRemapperHistogramValues_t))
@@ -3833,6 +3976,11 @@ cdef class RowRemapperHistogramValues:
             raise ValueError("This RowRemapperHistogramValues instance is read-only")
         self._ptr[0].none = val
 
+    @staticmethod
+    def from_buffer(buffer):
+        """Create a RowRemapperHistogramValues instance with the memory from the given buffer, which must be exactly ``sizeof(nvmlRowRemapperHistogramValues_t)`` bytes long."""
+        return __from_buffer(buffer, sizeof(nvmlRowRemapperHistogramValues_t), RowRemapperHistogramValues)
+
     @staticmethod
     def from_data(data):
         """Create an RowRemapperHistogramValues instance wrapping the given NumPy array.
@@ -3935,6 +4083,12 @@ cdef class BridgeChipInfo:
             return False
         return bool((self_data == other._data).all())
 
+    def __getbuffer__(self, Py_buffer *buffer, int flags):  # delegate the export to self._data, forwarding the consumer's flags
+        cpython.PyObject_GetBuffer(self._data, buffer, flags)
+
+    def __releasebuffer__(self, Py_buffer *buffer):
+        cpython.PyBuffer_Release(buffer)  # NOTE(review): the view was filled by self._data's exporter; confirm this hook is reached and cannot re-enter
+
     @property
     def type(self):
         """Union[~_numpy.int32, int]: """
@@ -3976,6 +4130,11 @@ cdef class BridgeChipInfo:
     def __setitem__(self, key, val):
         self._data[key] = val
 
+    @staticmethod
+    def from_buffer(buffer):
+        """Create a BridgeChipInfo instance from the given buffer, interpreted via ``_numpy.frombuffer`` with dtype ``bridge_chip_info_dtype``."""
+        return BridgeChipInfo.from_data(_numpy.frombuffer(buffer, dtype=bridge_chip_info_dtype))
+
     @staticmethod
     def from_data(data):
         """Create an BridgeChipInfo instance wrapping the given NumPy array.
@@ -4077,6 +4236,12 @@ cdef class Value:
         other_ = other
         return (memcmp(<void *><intptr_t>(self._ptr), <void *><intptr_t>(other_._ptr), sizeof(nvmlValue_t)) == 0)
 
+    def __getbuffer__(self, Py_buffer *buffer, int flags):  # export the struct at self._ptr as a flat byte buffer; `flags` is not consulted
+        __getbuffer(self, buffer, <void *>self._ptr, sizeof(nvmlValue_t), self._readonly)
+
+    def __releasebuffer__(self, Py_buffer *buffer):
+        pass  # nothing to release: the export only points at self._ptr
+
     def __setitem__(self, key, val):
         if key == 0 and isinstance(val, _numpy.ndarray):
             self._ptr = <nvmlValue_t *>malloc(sizeof(nvmlValue_t))
@@ -4166,6 +4331,11 @@ cdef class Value:
             raise ValueError("This Value instance is read-only")
         self._ptr[0].usVal = val
 
+    @staticmethod
+    def from_buffer(buffer):
+        """Create a Value instance with the memory from the given buffer, which must be exactly ``sizeof(nvmlValue_t)`` bytes long."""
+        return __from_buffer(buffer, sizeof(nvmlValue_t), Value)
+
     @staticmethod
     def from_data(data):
         """Create an Value instance wrapping the given NumPy array.
@@ -4267,6 +4437,12 @@ cdef class _py_anon_pod0:
         other_ = other
         return (memcmp(<void *><intptr_t>(self._ptr), <void *><intptr_t>(other_._ptr), sizeof(_anon_pod0)) == 0)
 
+    def __getbuffer__(self, Py_buffer *buffer, int flags):  # export the struct at self._ptr as a flat byte buffer; `flags` is not consulted
+        __getbuffer(self, buffer, <void *>self._ptr, sizeof(_anon_pod0), self._readonly)
+
+    def __releasebuffer__(self, Py_buffer *buffer):
+        pass  # nothing to release: the export only points at self._ptr
+
     def __setitem__(self, key, val):
         if key == 0 and isinstance(val, _numpy.ndarray):
             self._ptr = <_anon_pod0 *>malloc(sizeof(_anon_pod0))
@@ -4334,6 +4510,11 @@ cdef class _py_anon_pod0:
             raise ValueError("This _py_anon_pod0 instance is read-only")
         self._ptr[0].target = <nvmlThermalTarget_t><int>val
 
+    @staticmethod
+    def from_buffer(buffer):
+        """Create a _py_anon_pod0 instance with the memory from the given buffer, which must be exactly ``sizeof(_anon_pod0)`` bytes long."""
+        return __from_buffer(buffer, sizeof(_anon_pod0), _py_anon_pod0)
+
     @staticmethod
     def from_data(data):
         """Create an _py_anon_pod0 instance wrapping the given NumPy array.
@@ -4434,6 +4615,12 @@ cdef class CoolerInfo_v1:
         other_ = other
         return (memcmp(<void *><intptr_t>(self._ptr), <void *><intptr_t>(other_._ptr), sizeof(nvmlCoolerInfo_v1_t)) == 0)
 
+    def __getbuffer__(self, Py_buffer *buffer, int flags):  # export the struct at self._ptr as a flat byte buffer; `flags` is not consulted
+        __getbuffer(self, buffer, <void *>self._ptr, sizeof(nvmlCoolerInfo_v1_t), self._readonly)
+
+    def __releasebuffer__(self, Py_buffer *buffer):
+        pass  # nothing to release: the export only points at self._ptr
+
     def __setitem__(self, key, val):
         if key == 0 and isinstance(val, _numpy.ndarray):
             self._ptr = <nvmlCoolerInfo_v1_t *>malloc(sizeof(nvmlCoolerInfo_v1_t))
@@ -4490,6 +4677,11 @@ cdef class CoolerInfo_v1:
             raise ValueError("This CoolerInfo_v1 instance is read-only")
         self._ptr[0].target = <nvmlCoolerTarget_t><int>val
 
+    @staticmethod
+    def from_buffer(buffer):
+        """Create a CoolerInfo_v1 instance with the memory from the given buffer, which must be exactly ``sizeof(nvmlCoolerInfo_v1_t)`` bytes long."""
+        return __from_buffer(buffer, sizeof(nvmlCoolerInfo_v1_t), CoolerInfo_v1)
+
     @staticmethod
     def from_data(data):
         """Create an CoolerInfo_v1 instance wrapping the given NumPy array.
@@ -4592,6 +4784,12 @@ cdef class ClkMonFaultInfo:
             return False
         return bool((self_data == other._data).all())
 
+    def __getbuffer__(self, Py_buffer *buffer, int flags):  # delegate the export to self._data, forwarding the consumer's flags
+        cpython.PyObject_GetBuffer(self._data, buffer, flags)
+
+    def __releasebuffer__(self, Py_buffer *buffer):
+        cpython.PyBuffer_Release(buffer)  # NOTE(review): the view was filled by self._data's exporter; confirm this hook is reached and cannot re-enter
+
     @property
     def clk_api_domain(self):
         """Union[~_numpy.uint32, int]: """
@@ -4633,6 +4831,11 @@ cdef class ClkMonFaultInfo:
     def __setitem__(self, key, val):
         self._data[key] = val
 
+    @staticmethod
+    def from_buffer(buffer):
+        """Create a ClkMonFaultInfo instance from the given buffer, interpreted via ``_numpy.frombuffer`` with dtype ``clk_mon_fault_info_dtype``."""
+        return ClkMonFaultInfo.from_data(_numpy.frombuffer(buffer, dtype=clk_mon_fault_info_dtype))
+
     @staticmethod
     def from_data(data):
         """Create an ClkMonFaultInfo instance wrapping the given NumPy array.
@@ -4738,6 +4941,12 @@ cdef class ClockOffset_v1:
         other_ = other
         return (memcmp(<void *><intptr_t>(self._ptr), <void *><intptr_t>(other_._ptr), sizeof(nvmlClockOffset_v1_t)) == 0)
 
+    def __getbuffer__(self, Py_buffer *buffer, int flags):  # export the struct at self._ptr as a flat byte buffer; `flags` is not consulted
+        __getbuffer(self, buffer, <void *>self._ptr, sizeof(nvmlClockOffset_v1_t), self._readonly)
+
+    def __releasebuffer__(self, Py_buffer *buffer):
+        pass  # nothing to release: the export only points at self._ptr
+
     def __setitem__(self, key, val):
         if key == 0 and isinstance(val, _numpy.ndarray):
             self._ptr = <nvmlClockOffset_v1_t *>malloc(sizeof(nvmlClockOffset_v1_t))
@@ -4816,6 +5025,11 @@ cdef class ClockOffset_v1:
             raise ValueError("This ClockOffset_v1 instance is read-only")
         self._ptr[0].maxClockOffsetMHz = val
 
+    @staticmethod
+    def from_buffer(buffer):
+        """Create a ClockOffset_v1 instance with the memory from the given buffer, which must be exactly ``sizeof(nvmlClockOffset_v1_t)`` bytes long."""
+        return __from_buffer(buffer, sizeof(nvmlClockOffset_v1_t), ClockOffset_v1)
+
     @staticmethod
     def from_data(data):
         """Create an ClockOffset_v1 instance wrapping the given NumPy array.
@@ -4922,6 +5136,12 @@ cdef class ProcessUtilizationSample:
             return False
         return bool((self_data == other._data).all())
 
+    def __getbuffer__(self, Py_buffer *buffer, int flags):  # delegate the export to self._data, forwarding the consumer's flags
+        cpython.PyObject_GetBuffer(self._data, buffer, flags)
+
+    def __releasebuffer__(self, Py_buffer *buffer):
+        cpython.PyBuffer_Release(buffer)  # NOTE(review): the view was filled by self._data's exporter; confirm this hook is reached and cannot re-enter
+
     @property
     def pid(self):
         """Union[~_numpy.uint32, int]: """
@@ -5007,6 +5227,11 @@ cdef class ProcessUtilizationSample:
     def __setitem__(self, key, val):
         self._data[key] = val
 
+    @staticmethod
+    def from_buffer(buffer):
+        """Create a ProcessUtilizationSample instance from the given buffer, interpreted via ``_numpy.frombuffer`` with dtype ``process_utilization_sample_dtype``."""
+        return ProcessUtilizationSample.from_data(_numpy.frombuffer(buffer, dtype=process_utilization_sample_dtype))
+
     @staticmethod
     def from_data(data):
         """Create an ProcessUtilizationSample instance wrapping the given NumPy array.
@@ -5118,6 +5343,12 @@ cdef class ProcessUtilizationInfo_v1:
             return False
         return bool((self_data == other._data).all())
 
+    def __getbuffer__(self, Py_buffer *buffer, int flags):  # delegate the export to self._data, forwarding the consumer's flags
+        cpython.PyObject_GetBuffer(self._data, buffer, flags)
+
+    def __releasebuffer__(self, Py_buffer *buffer):
+        cpython.PyBuffer_Release(buffer)  # NOTE(review): the view was filled by self._data's exporter; confirm this hook is reached and cannot re-enter
+
     @property
     def time_stamp(self):
         """Union[~_numpy.uint64, int]: CPU Timestamp in microseconds."""
@@ -5225,6 +5456,11 @@ cdef class ProcessUtilizationInfo_v1:
     def __setitem__(self, key, val):
         self._data[key] = val
 
+    @staticmethod
+    def from_buffer(buffer):
+        """Create a ProcessUtilizationInfo_v1 instance from the given buffer, interpreted via ``_numpy.frombuffer`` with dtype ``process_utilization_info_v1_dtype``."""
+        return ProcessUtilizationInfo_v1.from_data(_numpy.frombuffer(buffer, dtype=process_utilization_info_v1_dtype))
+
     @staticmethod
     def from_data(data):
         """Create an ProcessUtilizationInfo_v1 instance wrapping the given NumPy array.
@@ -5337,6 +5573,12 @@ cdef class EccSramErrorStatus_v1:
         other_ = other
         return (memcmp(<void *><intptr_t>(self._ptr), <void *><intptr_t>(other_._ptr), sizeof(nvmlEccSramErrorStatus_v1_t)) == 0)
 
+    def __getbuffer__(self, Py_buffer *buffer, int flags):  # export the struct at self._ptr as a flat byte buffer; `flags` is not consulted
+        __getbuffer(self, buffer, <void *>self._ptr, sizeof(nvmlEccSramErrorStatus_v1_t), self._readonly)
+
+    def __releasebuffer__(self, Py_buffer *buffer):
+        pass  # nothing to release: the export only points at self._ptr
+
     def __setitem__(self, key, val):
         if key == 0 and isinstance(val, _numpy.ndarray):
             self._ptr = <nvmlEccSramErrorStatus_v1_t *>malloc(sizeof(nvmlEccSramErrorStatus_v1_t))
@@ -5492,6 +5734,11 @@ cdef class EccSramErrorStatus_v1:
             raise ValueError("This EccSramErrorStatus_v1 instance is read-only")
         self._ptr[0].bThresholdExceeded = val
 
+    @staticmethod
+    def from_buffer(buffer):
+        """Create an EccSramErrorStatus_v1 instance with the memory from the given buffer, which must be exactly ``sizeof(nvmlEccSramErrorStatus_v1_t)`` bytes long."""
+        return __from_buffer(buffer, sizeof(nvmlEccSramErrorStatus_v1_t), EccSramErrorStatus_v1)
+
     @staticmethod
     def from_data(data):
         """Create an EccSramErrorStatus_v1 instance wrapping the given NumPy array.
@@ -5596,6 +5843,12 @@ cdef class PlatformInfo_v1:
         other_ = other
         return (memcmp(<void *><intptr_t>(self._ptr), <void *><intptr_t>(other_._ptr), sizeof(nvmlPlatformInfo_v1_t)) == 0)
 
+    def __getbuffer__(self, Py_buffer *buffer, int flags):  # export the struct at self._ptr as a flat byte buffer; `flags` is not consulted
+        __getbuffer(self, buffer, <void *>self._ptr, sizeof(nvmlPlatformInfo_v1_t), self._readonly)
+
+    def __releasebuffer__(self, Py_buffer *buffer):
+        pass  # nothing to release: the export only points at self._ptr
+
     def __setitem__(self, key, val):
         if key == 0 and isinstance(val, _numpy.ndarray):
             self._ptr = <nvmlPlatformInfo_v1_t *>malloc(sizeof(nvmlPlatformInfo_v1_t))
@@ -5708,6 +5961,11 @@ cdef class PlatformInfo_v1:
             raise ValueError("This PlatformInfo_v1 instance is read-only")
         self._ptr[0].moduleId = val
 
+    @staticmethod
+    def from_buffer(buffer):
+        """Create a PlatformInfo_v1 instance with the memory from the given buffer, which must be exactly ``sizeof(nvmlPlatformInfo_v1_t)`` bytes long."""
+        return __from_buffer(buffer, sizeof(nvmlPlatformInfo_v1_t), PlatformInfo_v1)
+
     @staticmethod
     def from_data(data):
         """Create an PlatformInfo_v1 instance wrapping the given NumPy array.
@@ -5812,6 +6070,12 @@ cdef class PlatformInfo_v2:
         other_ = other
         return (memcmp(<void *><intptr_t>(self._ptr), <void *><intptr_t>(other_._ptr), sizeof(nvmlPlatformInfo_v2_t)) == 0)
 
+    def __getbuffer__(self, Py_buffer *buffer, int flags):  # export the struct at self._ptr as a flat byte buffer; `flags` is not consulted
+        __getbuffer(self, buffer, <void *>self._ptr, sizeof(nvmlPlatformInfo_v2_t), self._readonly)
+
+    def __releasebuffer__(self, Py_buffer *buffer):
+        pass  # nothing to release: the export only points at self._ptr
+
     def __setitem__(self, key, val):
         if key == 0 and isinstance(val, _numpy.ndarray):
             self._ptr = <nvmlPlatformInfo_v2_t *>malloc(sizeof(nvmlPlatformInfo_v2_t))
@@ -5924,6 +6188,11 @@ cdef class PlatformInfo_v2:
             raise ValueError("This PlatformInfo_v2 instance is read-only")
         self._ptr[0].moduleId = val
 
+    @staticmethod
+    def from_buffer(buffer):
+        """Create a PlatformInfo_v2 instance with the memory from the given buffer, which must be exactly ``sizeof(nvmlPlatformInfo_v2_t)`` bytes long."""
+        return __from_buffer(buffer, sizeof(nvmlPlatformInfo_v2_t), PlatformInfo_v2)
+
     @staticmethod
     def from_data(data):
         """Create an PlatformInfo_v2 instance wrapping the given NumPy array.
@@ -6024,6 +6293,12 @@ cdef class _py_anon_pod1:
         other_ = other
         return (memcmp(<void *><intptr_t>(self._ptr), <void *><intptr_t>(other_._ptr), sizeof(_anon_pod1)) == 0)
 
+    def __getbuffer__(self, Py_buffer *buffer, int flags):  # export the struct at self._ptr as a flat byte buffer; `flags` is not consulted
+        __getbuffer(self, buffer, <void *>self._ptr, sizeof(_anon_pod1), self._readonly)
+
+    def __releasebuffer__(self, Py_buffer *buffer):
+        pass  # nothing to release: the export only points at self._ptr
+
     def __setitem__(self, key, val):
         if key == 0 and isinstance(val, _numpy.ndarray):
             self._ptr = <_anon_pod1 *>malloc(sizeof(_anon_pod1))
@@ -6080,6 +6355,11 @@ cdef class _py_anon_pod1:
             raise ValueError("This _py_anon_pod1 instance is read-only")
         self._ptr[0].decThreshold = val
 
+    @staticmethod
+    def from_buffer(buffer):
+        """Create a _py_anon_pod1 instance with the memory from the given buffer, which must be exactly ``sizeof(_anon_pod1)`` bytes long."""
+        return __from_buffer(buffer, sizeof(_anon_pod1), _py_anon_pod1)
+
     @staticmethod
     def from_data(data):
         """Create an _py_anon_pod1 instance wrapping the given NumPy array.
@@ -6183,6 +6463,12 @@ cdef class VgpuPlacementList_v2:
         other_ = other
         return (memcmp(<void *><intptr_t>(self._ptr), <void *><intptr_t>(other_._ptr), sizeof(nvmlVgpuPlacementList_v2_t)) == 0)
 
+    def __getbuffer__(self, Py_buffer *buffer, int flags):  # buffer export via the __getbuffer helper; note `flags` is not forwarded
+        __getbuffer(self, buffer, <void *>self._ptr, sizeof(nvmlVgpuPlacementList_v2_t), self._readonly)
+
+    def __releasebuffer__(self, Py_buffer *buffer):
+        pass  # no-op: no cleanup is performed when a view is released
+
     def __setitem__(self, key, val):
         if key == 0 and isinstance(val, _numpy.ndarray):
             self._ptr = <nvmlVgpuPlacementList_v2_t *>malloc(sizeof(nvmlVgpuPlacementList_v2_t))
@@ -6247,6 +6533,11 @@ cdef class VgpuPlacementList_v2:
             raise ValueError("This VgpuPlacementList_v2 instance is read-only")
         self._ptr[0].mode = val
 
+    @staticmethod
+    def from_buffer(buffer):  # alternate constructor; construction is delegated to the __from_buffer helper
+        """Create a VgpuPlacementList_v2 instance with the memory from the given buffer."""
+        return __from_buffer(buffer, sizeof(nvmlVgpuPlacementList_v2_t), VgpuPlacementList_v2)
+
     @staticmethod
     def from_data(data):
         """Create an VgpuPlacementList_v2 instance wrapping the given NumPy array.
@@ -6346,6 +6637,12 @@ cdef class VgpuTypeBar1Info_v1:
         other_ = other
         return (memcmp(<void *><intptr_t>(self._ptr), <void *><intptr_t>(other_._ptr), sizeof(nvmlVgpuTypeBar1Info_v1_t)) == 0)
 
+    def __getbuffer__(self, Py_buffer *buffer, int flags):  # buffer export via the __getbuffer helper; note `flags` is not forwarded
+        __getbuffer(self, buffer, <void *>self._ptr, sizeof(nvmlVgpuTypeBar1Info_v1_t), self._readonly)
+
+    def __releasebuffer__(self, Py_buffer *buffer):
+        pass  # no-op: no cleanup is performed when a view is released
+
     def __setitem__(self, key, val):
         if key == 0 and isinstance(val, _numpy.ndarray):
             self._ptr = <nvmlVgpuTypeBar1Info_v1_t *>malloc(sizeof(nvmlVgpuTypeBar1Info_v1_t))
@@ -6380,6 +6677,11 @@ cdef class VgpuTypeBar1Info_v1:
             raise ValueError("This VgpuTypeBar1Info_v1 instance is read-only")
         self._ptr[0].bar1Size = val
 
+    @staticmethod
+    def from_buffer(buffer):  # alternate constructor; construction is delegated to the __from_buffer helper
+        """Create a VgpuTypeBar1Info_v1 instance with the memory from the given buffer."""
+        return __from_buffer(buffer, sizeof(nvmlVgpuTypeBar1Info_v1_t), VgpuTypeBar1Info_v1)
+
     @staticmethod
     def from_data(data):
         """Create an VgpuTypeBar1Info_v1 instance wrapping the given NumPy array.
@@ -6490,6 +6792,12 @@ cdef class VgpuProcessUtilizationInfo_v1:
             return False
         return bool((self_data == other._data).all())
 
+    def __getbuffer__(self, Py_buffer *buffer, int flags):  # buffer export: forwards the request and its flags to self._data
+        cpython.PyObject_GetBuffer(self._data, buffer, flags)
+
+    def __releasebuffer__(self, Py_buffer *buffer):  # releases via PyBuffer_Release; NOTE(review): confirm this hook is reached, since the view was exported by self._data
+        cpython.PyBuffer_Release(buffer)
+
     @property
     def process_name(self):
         """~_numpy.int8: (array of length 64).Name of process running within the vGPU VM."""
@@ -6617,6 +6925,11 @@ cdef class VgpuProcessUtilizationInfo_v1:
     def __setitem__(self, key, val):
         self._data[key] = val
 
+    @staticmethod
+    def from_buffer(buffer):  # alternate constructor: interprets the buffer with the class dtype, then reuses from_data
+        """Create a VgpuProcessUtilizationInfo_v1 instance with the memory from the given buffer."""
+        return VgpuProcessUtilizationInfo_v1.from_data(_numpy.frombuffer(buffer, dtype=vgpu_process_utilization_info_v1_dtype))
+
     @staticmethod
     def from_data(data):
         """Create an VgpuProcessUtilizationInfo_v1 instance wrapping the given NumPy array.
@@ -6718,6 +7031,12 @@ cdef class _py_anon_pod2:
         other_ = other
         return (memcmp(<void *><intptr_t>(self._ptr), <void *><intptr_t>(other_._ptr), sizeof(_anon_pod2)) == 0)
 
+    def __getbuffer__(self, Py_buffer *buffer, int flags):  # buffer export via the __getbuffer helper; note `flags` is not forwarded
+        __getbuffer(self, buffer, <void *>self._ptr, sizeof(_anon_pod2), self._readonly)
+
+    def __releasebuffer__(self, Py_buffer *buffer):
+        pass  # no-op: no cleanup is performed when a view is released
+
     def __setitem__(self, key, val):
         if key == 0 and isinstance(val, _numpy.ndarray):
             self._ptr = <_anon_pod2 *>malloc(sizeof(_anon_pod2))
@@ -6752,6 +7071,11 @@ cdef class _py_anon_pod2:
             raise ValueError("This _py_anon_pod2 instance is read-only")
         self._ptr[0].timeslice = val
 
+    @staticmethod
+    def from_buffer(buffer):  # alternate constructor; construction is delegated to the __from_buffer helper
+        """Create a _py_anon_pod2 instance with the memory from the given buffer."""
+        return __from_buffer(buffer, sizeof(_anon_pod2), _py_anon_pod2)
+
     @staticmethod
     def from_data(data):
         """Create an _py_anon_pod2 instance wrapping the given NumPy array.
@@ -6849,6 +7173,12 @@ cdef class _py_anon_pod3:
         other_ = other
         return (memcmp(<void *><intptr_t>(self._ptr), <void *><intptr_t>(other_._ptr), sizeof(_anon_pod3)) == 0)
 
+    def __getbuffer__(self, Py_buffer *buffer, int flags):  # buffer export via the __getbuffer helper; note `flags` is not forwarded
+        __getbuffer(self, buffer, <void *>self._ptr, sizeof(_anon_pod3), self._readonly)
+
+    def __releasebuffer__(self, Py_buffer *buffer):
+        pass  # no-op: no cleanup is performed when a view is released
+
     def __setitem__(self, key, val):
         if key == 0 and isinstance(val, _numpy.ndarray):
             self._ptr = <_anon_pod3 *>malloc(sizeof(_anon_pod3))
@@ -6872,6 +7202,11 @@ cdef class _py_anon_pod3:
             raise ValueError("This _py_anon_pod3 instance is read-only")
         self._ptr[0].timeslice = val
 
+    @staticmethod
+    def from_buffer(buffer):  # alternate constructor; construction is delegated to the __from_buffer helper
+        """Create a _py_anon_pod3 instance with the memory from the given buffer."""
+        return __from_buffer(buffer, sizeof(_anon_pod3), _py_anon_pod3)
+
     @staticmethod
     def from_data(data):
         """Create an _py_anon_pod3 instance wrapping the given NumPy array.
@@ -6978,6 +7313,12 @@ cdef class VgpuSchedulerLogEntry:
             return False
         return bool((self_data == other._data).all())
 
+    def __getbuffer__(self, Py_buffer *buffer, int flags):  # buffer export: forwards the request and its flags to self._data
+        cpython.PyObject_GetBuffer(self._data, buffer, flags)
+
+    def __releasebuffer__(self, Py_buffer *buffer):  # releases via PyBuffer_Release; NOTE(review): confirm this hook is reached, since the view was exported by self._data
+        cpython.PyBuffer_Release(buffer)
+
     @property
     def timestamp(self):
         """Union[~_numpy.uint64, int]: """
@@ -7063,6 +7404,11 @@ cdef class VgpuSchedulerLogEntry:
     def __setitem__(self, key, val):
         self._data[key] = val
 
+    @staticmethod
+    def from_buffer(buffer):  # alternate constructor: interprets the buffer with the class dtype, then reuses from_data
+        """Create a VgpuSchedulerLogEntry instance with the memory from the given buffer."""
+        return VgpuSchedulerLogEntry.from_data(_numpy.frombuffer(buffer, dtype=vgpu_scheduler_log_entry_dtype))
+
     @staticmethod
     def from_data(data):
         """Create an VgpuSchedulerLogEntry instance wrapping the given NumPy array.
@@ -7164,6 +7510,12 @@ cdef class _py_anon_pod4:
         other_ = other
         return (memcmp(<void *><intptr_t>(self._ptr), <void *><intptr_t>(other_._ptr), sizeof(_anon_pod4)) == 0)
 
+    def __getbuffer__(self, Py_buffer *buffer, int flags):  # buffer export via the __getbuffer helper; note `flags` is not forwarded
+        __getbuffer(self, buffer, <void *>self._ptr, sizeof(_anon_pod4), self._readonly)
+
+    def __releasebuffer__(self, Py_buffer *buffer):
+        pass  # no-op: no cleanup is performed when a view is released
+
     def __setitem__(self, key, val):
         if key == 0 and isinstance(val, _numpy.ndarray):
             self._ptr = <_anon_pod4 *>malloc(sizeof(_anon_pod4))
@@ -7198,6 +7550,11 @@ cdef class _py_anon_pod4:
             raise ValueError("This _py_anon_pod4 instance is read-only")
         self._ptr[0].frequency = val
 
+    @staticmethod
+    def from_buffer(buffer):  # alternate constructor; construction is delegated to the __from_buffer helper
+        """Create a _py_anon_pod4 instance with the memory from the given buffer."""
+        return __from_buffer(buffer, sizeof(_anon_pod4), _py_anon_pod4)
+
     @staticmethod
     def from_data(data):
         """Create an _py_anon_pod4 instance wrapping the given NumPy array.
@@ -7295,6 +7652,12 @@ cdef class _py_anon_pod5:
         other_ = other
         return (memcmp(<void *><intptr_t>(self._ptr), <void *><intptr_t>(other_._ptr), sizeof(_anon_pod5)) == 0)
 
+    def __getbuffer__(self, Py_buffer *buffer, int flags):  # buffer export via the __getbuffer helper; note `flags` is not forwarded
+        __getbuffer(self, buffer, <void *>self._ptr, sizeof(_anon_pod5), self._readonly)
+
+    def __releasebuffer__(self, Py_buffer *buffer):
+        pass  # no-op: no cleanup is performed when a view is released
+
     def __setitem__(self, key, val):
         if key == 0 and isinstance(val, _numpy.ndarray):
             self._ptr = <_anon_pod5 *>malloc(sizeof(_anon_pod5))
@@ -7318,6 +7681,11 @@ cdef class _py_anon_pod5:
             raise ValueError("This _py_anon_pod5 instance is read-only")
         self._ptr[0].timeslice = val
 
+    @staticmethod
+    def from_buffer(buffer):  # alternate constructor; construction is delegated to the __from_buffer helper
+        """Create a _py_anon_pod5 instance with the memory from the given buffer."""
+        return __from_buffer(buffer, sizeof(_anon_pod5), _py_anon_pod5)
+
     @staticmethod
     def from_data(data):
         """Create an _py_anon_pod5 instance wrapping the given NumPy array.
@@ -7422,6 +7790,12 @@ cdef class VgpuSchedulerCapabilities:
         other_ = other
         return (memcmp(<void *><intptr_t>(self._ptr), <void *><intptr_t>(other_._ptr), sizeof(nvmlVgpuSchedulerCapabilities_t)) == 0)
 
+    def __getbuffer__(self, Py_buffer *buffer, int flags):  # buffer export via the __getbuffer helper; note `flags` is not forwarded
+        __getbuffer(self, buffer, <void *>self._ptr, sizeof(nvmlVgpuSchedulerCapabilities_t), self._readonly)
+
+    def __releasebuffer__(self, Py_buffer *buffer):
+        pass  # no-op: no cleanup is performed when a view is released
+
     def __setitem__(self, key, val):
         if key == 0 and isinstance(val, _numpy.ndarray):
             self._ptr = <nvmlVgpuSchedulerCapabilities_t *>malloc(sizeof(nvmlVgpuSchedulerCapabilities_t))
@@ -7528,6 +7902,11 @@ cdef class VgpuSchedulerCapabilities:
             raise ValueError("This VgpuSchedulerCapabilities instance is read-only")
         self._ptr[0].minAvgFactorForARR = val
 
+    @staticmethod
+    def from_buffer(buffer):  # alternate constructor; construction is delegated to the __from_buffer helper
+        """Create a VgpuSchedulerCapabilities instance with the memory from the given buffer."""
+        return __from_buffer(buffer, sizeof(nvmlVgpuSchedulerCapabilities_t), VgpuSchedulerCapabilities)
+
     @staticmethod
     def from_data(data):
         """Create an VgpuSchedulerCapabilities instance wrapping the given NumPy array.
@@ -7631,6 +8010,12 @@ cdef class VgpuLicenseExpiry:
         other_ = other
         return (memcmp(<void *><intptr_t>(self._ptr), <void *><intptr_t>(other_._ptr), sizeof(nvmlVgpuLicenseExpiry_t)) == 0)
 
+    def __getbuffer__(self, Py_buffer *buffer, int flags):  # buffer export via the __getbuffer helper; note `flags` is not forwarded
+        __getbuffer(self, buffer, <void *>self._ptr, sizeof(nvmlVgpuLicenseExpiry_t), self._readonly)
+
+    def __releasebuffer__(self, Py_buffer *buffer):
+        pass  # no-op: no cleanup is performed when a view is released
+
     def __setitem__(self, key, val):
         if key == 0 and isinstance(val, _numpy.ndarray):
             self._ptr = <nvmlVgpuLicenseExpiry_t *>malloc(sizeof(nvmlVgpuLicenseExpiry_t))
@@ -7720,6 +8105,11 @@ cdef class VgpuLicenseExpiry:
             raise ValueError("This VgpuLicenseExpiry instance is read-only")
         self._ptr[0].status = val
 
+    @staticmethod
+    def from_buffer(buffer):  # alternate constructor; construction is delegated to the __from_buffer helper
+        """Create a VgpuLicenseExpiry instance with the memory from the given buffer."""
+        return __from_buffer(buffer, sizeof(nvmlVgpuLicenseExpiry_t), VgpuLicenseExpiry)
+
     @staticmethod
     def from_data(data):
         """Create an VgpuLicenseExpiry instance wrapping the given NumPy array.
@@ -7823,6 +8213,12 @@ cdef class GridLicenseExpiry:
         other_ = other
         return (memcmp(<void *><intptr_t>(self._ptr), <void *><intptr_t>(other_._ptr), sizeof(nvmlGridLicenseExpiry_t)) == 0)
 
+    def __getbuffer__(self, Py_buffer *buffer, int flags):  # buffer export via the __getbuffer helper; note `flags` is not forwarded
+        __getbuffer(self, buffer, <void *>self._ptr, sizeof(nvmlGridLicenseExpiry_t), self._readonly)
+
+    def __releasebuffer__(self, Py_buffer *buffer):
+        pass  # no-op: no cleanup is performed when a view is released
+
     def __setitem__(self, key, val):
         if key == 0 and isinstance(val, _numpy.ndarray):
             self._ptr = <nvmlGridLicenseExpiry_t *>malloc(sizeof(nvmlGridLicenseExpiry_t))
@@ -7912,6 +8308,11 @@ cdef class GridLicenseExpiry:
             raise ValueError("This GridLicenseExpiry instance is read-only")
         self._ptr[0].status = val
 
+    @staticmethod
+    def from_buffer(buffer):  # alternate constructor; construction is delegated to the __from_buffer helper
+        """Create a GridLicenseExpiry instance with the memory from the given buffer."""
+        return __from_buffer(buffer, sizeof(nvmlGridLicenseExpiry_t), GridLicenseExpiry)
+
     @staticmethod
     def from_data(data):
         """Create an GridLicenseExpiry instance wrapping the given NumPy array.
@@ -8011,6 +8412,12 @@ cdef class VgpuTypeIdInfo_v1:
         other_ = other
         return (memcmp(<void *><intptr_t>(self._ptr), <void *><intptr_t>(other_._ptr), sizeof(nvmlVgpuTypeIdInfo_v1_t)) == 0)
 
+    def __getbuffer__(self, Py_buffer *buffer, int flags):  # buffer export via the __getbuffer helper; note `flags` is not forwarded
+        __getbuffer(self, buffer, <void *>self._ptr, sizeof(nvmlVgpuTypeIdInfo_v1_t), self._readonly)
+
+    def __releasebuffer__(self, Py_buffer *buffer):
+        pass  # no-op: no cleanup is performed when a view is released
+
     def __setitem__(self, key, val):
         if key == 0 and isinstance(val, _numpy.ndarray):
             self._ptr = <nvmlVgpuTypeIdInfo_v1_t *>malloc(sizeof(nvmlVgpuTypeIdInfo_v1_t))
@@ -8053,6 +8460,11 @@ cdef class VgpuTypeIdInfo_v1:
         self._ptr[0].vgpuCount = len(val)
         self._refs["vgpu_type_ids"] = arr
 
+    @staticmethod
+    def from_buffer(buffer):  # alternate constructor; construction is delegated to the __from_buffer helper
+        """Create a VgpuTypeIdInfo_v1 instance with the memory from the given buffer."""
+        return __from_buffer(buffer, sizeof(nvmlVgpuTypeIdInfo_v1_t), VgpuTypeIdInfo_v1)
+
     @staticmethod
     def from_data(data):
         """Create an VgpuTypeIdInfo_v1 instance wrapping the given NumPy array.
@@ -8152,6 +8564,12 @@ cdef class ActiveVgpuInstanceInfo_v1:
         other_ = other
         return (memcmp(<void *><intptr_t>(self._ptr), <void *><intptr_t>(other_._ptr), sizeof(nvmlActiveVgpuInstanceInfo_v1_t)) == 0)
 
+    def __getbuffer__(self, Py_buffer *buffer, int flags):  # buffer export via the __getbuffer helper; note `flags` is not forwarded
+        __getbuffer(self, buffer, <void *>self._ptr, sizeof(nvmlActiveVgpuInstanceInfo_v1_t), self._readonly)
+
+    def __releasebuffer__(self, Py_buffer *buffer):
+        pass  # no-op: no cleanup is performed when a view is released
+
     def __setitem__(self, key, val):
         if key == 0 and isinstance(val, _numpy.ndarray):
             self._ptr = <nvmlActiveVgpuInstanceInfo_v1_t *>malloc(sizeof(nvmlActiveVgpuInstanceInfo_v1_t))
@@ -8194,6 +8612,11 @@ cdef class ActiveVgpuInstanceInfo_v1:
         self._ptr[0].vgpuCount = len(val)
         self._refs["vgpu_instances"] = arr
 
+    @staticmethod
+    def from_buffer(buffer):  # alternate constructor; construction is delegated to the __from_buffer helper
+        """Create an ActiveVgpuInstanceInfo_v1 instance with the memory from the given buffer."""
+        return __from_buffer(buffer, sizeof(nvmlActiveVgpuInstanceInfo_v1_t), ActiveVgpuInstanceInfo_v1)
+
     @staticmethod
     def from_data(data):
         """Create an ActiveVgpuInstanceInfo_v1 instance wrapping the given NumPy array.
@@ -8297,6 +8720,12 @@ cdef class VgpuCreatablePlacementInfo_v1:
         other_ = other
         return (memcmp(<void *><intptr_t>(self._ptr), <void *><intptr_t>(other_._ptr), sizeof(nvmlVgpuCreatablePlacementInfo_v1_t)) == 0)
 
+    def __getbuffer__(self, Py_buffer *buffer, int flags):  # buffer export via the __getbuffer helper; note `flags` is not forwarded
+        __getbuffer(self, buffer, <void *>self._ptr, sizeof(nvmlVgpuCreatablePlacementInfo_v1_t), self._readonly)
+
+    def __releasebuffer__(self, Py_buffer *buffer):
+        pass  # no-op: no cleanup is performed when a view is released
+
     def __setitem__(self, key, val):
         if key == 0 and isinstance(val, _numpy.ndarray):
             self._ptr = <nvmlVgpuCreatablePlacementInfo_v1_t *>malloc(sizeof(nvmlVgpuCreatablePlacementInfo_v1_t))
@@ -8361,6 +8790,11 @@ cdef class VgpuCreatablePlacementInfo_v1:
         self._ptr[0].placementSize = len(val)
         self._refs["placement_ids"] = arr
 
+    @staticmethod
+    def from_buffer(buffer):  # alternate constructor; construction is delegated to the __from_buffer helper
+        """Create a VgpuCreatablePlacementInfo_v1 instance with the memory from the given buffer."""
+        return __from_buffer(buffer, sizeof(nvmlVgpuCreatablePlacementInfo_v1_t), VgpuCreatablePlacementInfo_v1)
+
     @staticmethod
     def from_data(data):
         """Create an VgpuCreatablePlacementInfo_v1 instance wrapping the given NumPy array.
@@ -8464,6 +8898,12 @@ cdef class HwbcEntry:
             return False
         return bool((self_data == other._data).all())
 
+    def __getbuffer__(self, Py_buffer *buffer, int flags):  # buffer export: forwards the request and its flags to self._data
+        cpython.PyObject_GetBuffer(self._data, buffer, flags)
+
+    def __releasebuffer__(self, Py_buffer *buffer):  # releases via PyBuffer_Release; NOTE(review): confirm this hook is reached, since the view was exported by self._data
+        cpython.PyBuffer_Release(buffer)
+
     @property
     def hwbc_id(self):
         """Union[~_numpy.uint32, int]: """
@@ -8503,6 +8943,11 @@ cdef class HwbcEntry:
     def __setitem__(self, key, val):
         self._data[key] = val
 
+    @staticmethod
+    def from_buffer(buffer):  # alternate constructor: interprets the buffer with the class dtype, then reuses from_data
+        """Create an HwbcEntry instance with the memory from the given buffer."""
+        return HwbcEntry.from_data(_numpy.frombuffer(buffer, dtype=hwbc_entry_dtype))
+
     @staticmethod
     def from_data(data):
         """Create an HwbcEntry instance wrapping the given NumPy array.
@@ -8604,6 +9049,12 @@ cdef class LedState:
         other_ = other
         return (memcmp(<void *><intptr_t>(self._ptr), <void *><intptr_t>(other_._ptr), sizeof(nvmlLedState_t)) == 0)
 
+    def __getbuffer__(self, Py_buffer *buffer, int flags):  # buffer export via the __getbuffer helper; note `flags` is not forwarded
+        __getbuffer(self, buffer, <void *>self._ptr, sizeof(nvmlLedState_t), self._readonly)
+
+    def __releasebuffer__(self, Py_buffer *buffer):
+        pass  # no-op: no cleanup is performed when a view is released
+
     def __setitem__(self, key, val):
         if key == 0 and isinstance(val, _numpy.ndarray):
             self._ptr = <nvmlLedState_t *>malloc(sizeof(nvmlLedState_t))
@@ -8642,6 +9093,11 @@ cdef class LedState:
             raise ValueError("This LedState instance is read-only")
         self._ptr[0].color = <nvmlLedColor_t><int>val
 
+    @staticmethod
+    def from_buffer(buffer):  # alternate constructor; construction is delegated to the __from_buffer helper
+        """Create a LedState instance with the memory from the given buffer."""
+        return __from_buffer(buffer, sizeof(nvmlLedState_t), LedState)
+
     @staticmethod
     def from_data(data):
         """Create an LedState instance wrapping the given NumPy array.
@@ -8742,6 +9198,12 @@ cdef class UnitInfo:
         other_ = other
         return (memcmp(<void *><intptr_t>(self._ptr), <void *><intptr_t>(other_._ptr), sizeof(nvmlUnitInfo_t)) == 0)
 
+    def __getbuffer__(self, Py_buffer *buffer, int flags):  # buffer export via the __getbuffer helper; note `flags` is not forwarded
+        __getbuffer(self, buffer, <void *>self._ptr, sizeof(nvmlUnitInfo_t), self._readonly)
+
+    def __releasebuffer__(self, Py_buffer *buffer):
+        pass  # no-op: no cleanup is performed when a view is released
+
     def __setitem__(self, key, val):
         if key == 0 and isinstance(val, _numpy.ndarray):
             self._ptr = <nvmlUnitInfo_t *>malloc(sizeof(nvmlUnitInfo_t))
@@ -8814,6 +9276,11 @@ cdef class UnitInfo:
         cdef char *ptr = buf
         memcpy(<void *>(self._ptr[0].firmwareVersion), <void *>ptr, 96)
 
+    @staticmethod
+    def from_buffer(buffer):  # alternate constructor; construction is delegated to the __from_buffer helper
+        """Create a UnitInfo instance with the memory from the given buffer."""
+        return __from_buffer(buffer, sizeof(nvmlUnitInfo_t), UnitInfo)
+
     @staticmethod
     def from_data(data):
         """Create an UnitInfo instance wrapping the given NumPy array.
@@ -8914,6 +9381,12 @@ cdef class PSUInfo:
         other_ = other
         return (memcmp(<void *><intptr_t>(self._ptr), <void *><intptr_t>(other_._ptr), sizeof(nvmlPSUInfo_t)) == 0)
 
+    def __getbuffer__(self, Py_buffer *buffer, int flags):  # buffer export via the __getbuffer helper; note `flags` is not forwarded
+        __getbuffer(self, buffer, <void *>self._ptr, sizeof(nvmlPSUInfo_t), self._readonly)
+
+    def __releasebuffer__(self, Py_buffer *buffer):
+        pass  # no-op: no cleanup is performed when a view is released
+
     def __setitem__(self, key, val):
         if key == 0 and isinstance(val, _numpy.ndarray):
             self._ptr = <nvmlPSUInfo_t *>malloc(sizeof(nvmlPSUInfo_t))
@@ -8974,6 +9447,11 @@ cdef class PSUInfo:
             raise ValueError("This PSUInfo instance is read-only")
         self._ptr[0].power = val
 
+    @staticmethod
+    def from_buffer(buffer):  # alternate constructor; construction is delegated to the __from_buffer helper
+        """Create a PSUInfo instance with the memory from the given buffer."""
+        return __from_buffer(buffer, sizeof(nvmlPSUInfo_t), PSUInfo)
+
     @staticmethod
     def from_data(data):
         """Create an PSUInfo instance wrapping the given NumPy array.
@@ -9076,6 +9554,12 @@ cdef class UnitFanInfo:
             return False
         return bool((self_data == other._data).all())
 
+    def __getbuffer__(self, Py_buffer *buffer, int flags):  # buffer export: forwards the request and its flags to self._data
+        cpython.PyObject_GetBuffer(self._data, buffer, flags)
+
+    def __releasebuffer__(self, Py_buffer *buffer):  # releases via PyBuffer_Release; NOTE(review): confirm this hook is reached, since the view was exported by self._data
+        cpython.PyBuffer_Release(buffer)
+
     @property
     def speed(self):
         """Union[~_numpy.uint32, int]: """
@@ -9117,6 +9601,11 @@ cdef class UnitFanInfo:
     def __setitem__(self, key, val):
         self._data[key] = val
 
+    @staticmethod
+    def from_buffer(buffer):  # alternate constructor: interprets the buffer with the class dtype, then reuses from_data
+        """Create a UnitFanInfo instance with the memory from the given buffer."""
+        return UnitFanInfo.from_data(_numpy.frombuffer(buffer, dtype=unit_fan_info_dtype))
+
     @staticmethod
     def from_data(data):
         """Create an UnitFanInfo instance wrapping the given NumPy array.
@@ -9221,6 +9710,12 @@ cdef class EventData:
         other_ = other
         return (memcmp(<void *><intptr_t>(self._ptr), <void *><intptr_t>(other_._ptr), sizeof(nvmlEventData_t)) == 0)
 
+    def __getbuffer__(self, Py_buffer *buffer, int flags):  # buffer export via the __getbuffer helper; note `flags` is not forwarded
+        __getbuffer(self, buffer, <void *>self._ptr, sizeof(nvmlEventData_t), self._readonly)
+
+    def __releasebuffer__(self, Py_buffer *buffer):
+        pass  # no-op: no cleanup is performed when a view is released
+
     def __setitem__(self, key, val):
         if key == 0 and isinstance(val, _numpy.ndarray):
             self._ptr = <nvmlEventData_t *>malloc(sizeof(nvmlEventData_t))
@@ -9288,6 +9783,11 @@ cdef class EventData:
             raise ValueError("This EventData instance is read-only")
         self._ptr[0].computeInstanceId = val
 
+    @staticmethod
+    def from_buffer(buffer):  # alternate constructor; construction is delegated to the __from_buffer helper
+        """Create an EventData instance with the memory from the given buffer."""
+        return __from_buffer(buffer, sizeof(nvmlEventData_t), EventData)
+
     @staticmethod
     def from_data(data):
         """Create an EventData instance wrapping the given NumPy array.
@@ -9390,6 +9890,12 @@ cdef class SystemEventData_v1:
             return False
         return bool((self_data == other._data).all())
 
+    def __getbuffer__(self, Py_buffer *buffer, int flags):  # buffer export: forwards the request and its flags to self._data
+        cpython.PyObject_GetBuffer(self._data, buffer, flags)
+
+    def __releasebuffer__(self, Py_buffer *buffer):  # releases via PyBuffer_Release; NOTE(review): confirm this hook is reached, since the view was exported by self._data
+        cpython.PyBuffer_Release(buffer)
+
     @property
     def event_type(self):
         """Union[~_numpy.uint64, int]: Information about what specific system event occurred."""
@@ -9431,6 +9937,11 @@ cdef class SystemEventData_v1:
     def __setitem__(self, key, val):
         self._data[key] = val
 
+    @staticmethod
+    def from_buffer(buffer):  # alternate constructor: interprets the buffer with the class dtype, then reuses from_data
+        """Create a SystemEventData_v1 instance with the memory from the given buffer."""
+        return SystemEventData_v1.from_data(_numpy.frombuffer(buffer, dtype=system_event_data_v1_dtype))
+
     @staticmethod
     def from_data(data):
         """Create an SystemEventData_v1 instance wrapping the given NumPy array.
@@ -9537,6 +10048,12 @@ cdef class AccountingStats:
         other_ = other
         return (memcmp(<void *><intptr_t>(self._ptr), <void *><intptr_t>(other_._ptr), sizeof(nvmlAccountingStats_t)) == 0)
 
+    def __getbuffer__(self, Py_buffer *buffer, int flags):  # buffer export via the __getbuffer helper; note `flags` is not forwarded
+        __getbuffer(self, buffer, <void *>self._ptr, sizeof(nvmlAccountingStats_t), self._readonly)
+
+    def __releasebuffer__(self, Py_buffer *buffer):
+        pass  # no-op: no cleanup is performed when a view is released
+
     def __setitem__(self, key, val):
         if key == 0 and isinstance(val, _numpy.ndarray):
             self._ptr = <nvmlAccountingStats_t *>malloc(sizeof(nvmlAccountingStats_t))
@@ -9615,6 +10132,11 @@ cdef class AccountingStats:
             raise ValueError("This AccountingStats instance is read-only")
         self._ptr[0].isRunning = val
 
+    @staticmethod
+    def from_buffer(buffer):  # alternate constructor; construction is delegated to the __from_buffer helper
+        """Create an AccountingStats instance with the memory from the given buffer."""
+        return __from_buffer(buffer, sizeof(nvmlAccountingStats_t), AccountingStats)
+
     @staticmethod
     def from_data(data):
         """Create an AccountingStats instance wrapping the given NumPy array.
@@ -9723,6 +10245,12 @@ cdef class EncoderSessionInfo:
             return False
         return bool((self_data == other._data).all())
 
+    def __getbuffer__(self, Py_buffer *buffer, int flags):  # buffer export: forwards the request and its flags to self._data
+        cpython.PyObject_GetBuffer(self._data, buffer, flags)
+
+    def __releasebuffer__(self, Py_buffer *buffer):  # releases via PyBuffer_Release; NOTE(review): confirm this hook is reached, since the view was exported by self._data
+        cpython.PyBuffer_Release(buffer)
+
     @property
     def session_id(self):
         """Union[~_numpy.uint32, int]: """
@@ -9830,6 +10358,11 @@ cdef class EncoderSessionInfo:
     def __setitem__(self, key, val):
         self._data[key] = val
 
+    @staticmethod
+    def from_buffer(buffer):  # alternate constructor: interprets the buffer with the class dtype, then reuses from_data
+        """Create an EncoderSessionInfo instance with the memory from the given buffer."""
+        return EncoderSessionInfo.from_data(_numpy.frombuffer(buffer, dtype=encoder_session_info_dtype))
+
     @staticmethod
     def from_data(data):
         """Create an EncoderSessionInfo instance wrapping the given NumPy array.
@@ -9932,6 +10465,12 @@ cdef class FBCStats:
         other_ = other
         return (memcmp(<void *><intptr_t>(self._ptr), <void *><intptr_t>(other_._ptr), sizeof(nvmlFBCStats_t)) == 0)
 
+    def __getbuffer__(self, Py_buffer *buffer, int flags):  # buffer export via the __getbuffer helper; note `flags` is not forwarded
+        __getbuffer(self, buffer, <void *>self._ptr, sizeof(nvmlFBCStats_t), self._readonly)
+
+    def __releasebuffer__(self, Py_buffer *buffer):
+        pass  # no-op: no cleanup is performed when a view is released
+
     def __setitem__(self, key, val):
         if key == 0 and isinstance(val, _numpy.ndarray):
             self._ptr = <nvmlFBCStats_t *>malloc(sizeof(nvmlFBCStats_t))
@@ -9977,6 +10516,11 @@ cdef class FBCStats:
             raise ValueError("This FBCStats instance is read-only")
         self._ptr[0].averageLatency = val
 
+    @staticmethod
+    def from_buffer(buffer):  # alternate constructor; construction is delegated to the __from_buffer helper
+        """Create an FBCStats instance with the memory from the given buffer."""
+        return __from_buffer(buffer, sizeof(nvmlFBCStats_t), FBCStats)
+
     @staticmethod
     def from_data(data):
         """Create an FBCStats instance wrapping the given NumPy array.
@@ -10089,6 +10633,12 @@ cdef class FBCSessionInfo:
             return False
         return bool((self_data == other._data).all())
 
+    def __getbuffer__(self, Py_buffer *buffer, int flags):  # buffer export: forwards the request and its flags to self._data
+        cpython.PyObject_GetBuffer(self._data, buffer, flags)
+
+    def __releasebuffer__(self, Py_buffer *buffer):  # releases via PyBuffer_Release; NOTE(review): confirm this hook is reached, since the view was exported by self._data
+        cpython.PyBuffer_Release(buffer)
+
     @property
     def session_id(self):
         """Union[~_numpy.uint32, int]: """
@@ -10240,6 +10790,11 @@ cdef class FBCSessionInfo:
     def __setitem__(self, key, val):
         self._data[key] = val
 
+    @staticmethod
+    def from_buffer(buffer):  # alternate constructor: interprets the buffer with the class dtype, then reuses from_data
+        """Create an FBCSessionInfo instance with the memory from the given buffer."""
+        return FBCSessionInfo.from_data(_numpy.frombuffer(buffer, dtype=fbc_session_info_dtype))
+
     @staticmethod
     def from_data(data):
         """Create an FBCSessionInfo instance wrapping the given NumPy array.
@@ -10341,6 +10896,12 @@ cdef class ConfComputeSystemCaps:
         other_ = other
         return (memcmp(<void *><intptr_t>(self._ptr), <void *><intptr_t>(other_._ptr), sizeof(nvmlConfComputeSystemCaps_t)) == 0)
 
+    def __getbuffer__(self, Py_buffer *buffer, int flags):  # buffer export via the __getbuffer helper; note `flags` is not forwarded
+        __getbuffer(self, buffer, <void *>self._ptr, sizeof(nvmlConfComputeSystemCaps_t), self._readonly)
+
+    def __releasebuffer__(self, Py_buffer *buffer):
+        pass  # no-op: no cleanup is performed when a view is released
+
     def __setitem__(self, key, val):
         if key == 0 and isinstance(val, _numpy.ndarray):
             self._ptr = <nvmlConfComputeSystemCaps_t *>malloc(sizeof(nvmlConfComputeSystemCaps_t))
@@ -10375,6 +10936,11 @@ cdef class ConfComputeSystemCaps:
             raise ValueError("This ConfComputeSystemCaps instance is read-only")
         self._ptr[0].gpusCaps = val
 
+    @staticmethod
+    def from_buffer(buffer):  # alternate constructor; construction is delegated to the __from_buffer helper
+        """Create a ConfComputeSystemCaps instance with the memory from the given buffer."""
+        return __from_buffer(buffer, sizeof(nvmlConfComputeSystemCaps_t), ConfComputeSystemCaps)
+
     @staticmethod
     def from_data(data):
         """Create an ConfComputeSystemCaps instance wrapping the given NumPy array.
@@ -10474,6 +11040,12 @@ cdef class ConfComputeSystemState:
         other_ = other
         return (memcmp(<void *><intptr_t>(self._ptr), <void *><intptr_t>(other_._ptr), sizeof(nvmlConfComputeSystemState_t)) == 0)
 
+    def __getbuffer__(self, Py_buffer *buffer, int flags):  # buffer export via the __getbuffer helper; note `flags` is not forwarded
+        __getbuffer(self, buffer, <void *>self._ptr, sizeof(nvmlConfComputeSystemState_t), self._readonly)
+
+    def __releasebuffer__(self, Py_buffer *buffer):
+        pass  # no-op: no cleanup is performed when a view is released
+
     def __setitem__(self, key, val):
         if key == 0 and isinstance(val, _numpy.ndarray):
             self._ptr = <nvmlConfComputeSystemState_t *>malloc(sizeof(nvmlConfComputeSystemState_t))
@@ -10519,6 +11091,11 @@ cdef class ConfComputeSystemState:
             raise ValueError("This ConfComputeSystemState instance is read-only")
         self._ptr[0].devToolsMode = val
 
+    @staticmethod
+    def from_buffer(buffer):  # alternate constructor; construction is delegated to the __from_buffer helper
+        """Create a ConfComputeSystemState instance with the memory from the given buffer."""
+        return __from_buffer(buffer, sizeof(nvmlConfComputeSystemState_t), ConfComputeSystemState)
+
     @staticmethod
     def from_data(data):
         """Create an ConfComputeSystemState instance wrapping the given NumPy array.
@@ -10620,6 +11197,12 @@ cdef class SystemConfComputeSettings_v1:
         other_ = other
         return (memcmp(<void *><intptr_t>(self._ptr), <void *><intptr_t>(other_._ptr), sizeof(nvmlSystemConfComputeSettings_v1_t)) == 0)
 
+    def __getbuffer__(self, Py_buffer *buffer, int flags):  # buffer export via the __getbuffer helper; note `flags` is not forwarded
+        __getbuffer(self, buffer, <void *>self._ptr, sizeof(nvmlSystemConfComputeSettings_v1_t), self._readonly)
+
+    def __releasebuffer__(self, Py_buffer *buffer):
+        pass  # no-op: no cleanup is performed when a view is released
+
     def __setitem__(self, key, val):
         if key == 0 and isinstance(val, _numpy.ndarray):
             self._ptr = <nvmlSystemConfComputeSettings_v1_t *>malloc(sizeof(nvmlSystemConfComputeSettings_v1_t))
@@ -10687,6 +11270,11 @@ cdef class SystemConfComputeSettings_v1:
             raise ValueError("This SystemConfComputeSettings_v1 instance is read-only")
         self._ptr[0].multiGpuMode = val
 
+    @staticmethod
+    def from_buffer(buffer):  # alternate constructor; construction is delegated to the __from_buffer helper
+        """Create a SystemConfComputeSettings_v1 instance with the memory from the given buffer."""
+        return __from_buffer(buffer, sizeof(nvmlSystemConfComputeSettings_v1_t), SystemConfComputeSettings_v1)
+
     @staticmethod
     def from_data(data):
         """Create an SystemConfComputeSettings_v1 instance wrapping the given NumPy array.
@@ -10785,6 +11373,12 @@ cdef class ConfComputeMemSizeInfo:
         other_ = other
         return (memcmp(<void *><intptr_t>(self._ptr), <void *><intptr_t>(other_._ptr), sizeof(nvmlConfComputeMemSizeInfo_t)) == 0)
 
+    def __getbuffer__(self, Py_buffer *buffer, int flags):  # buffer export via the __getbuffer helper; note `flags` is not forwarded
+        __getbuffer(self, buffer, <void *>self._ptr, sizeof(nvmlConfComputeMemSizeInfo_t), self._readonly)
+
+    def __releasebuffer__(self, Py_buffer *buffer):
+        pass  # no-op: no cleanup is performed when a view is released
+
     def __setitem__(self, key, val):
         if key == 0 and isinstance(val, _numpy.ndarray):
             self._ptr = <nvmlConfComputeMemSizeInfo_t *>malloc(sizeof(nvmlConfComputeMemSizeInfo_t))
@@ -10819,6 +11413,11 @@ cdef class ConfComputeMemSizeInfo:
             raise ValueError("This ConfComputeMemSizeInfo instance is read-only")
         self._ptr[0].unprotectedMemSizeKib = val
 
+    @staticmethod
+    def from_buffer(buffer):  # alternate constructor; construction is delegated to the __from_buffer helper
+        """Create a ConfComputeMemSizeInfo instance with the memory from the given buffer."""
+        return __from_buffer(buffer, sizeof(nvmlConfComputeMemSizeInfo_t), ConfComputeMemSizeInfo)
+
     @staticmethod
     def from_data(data):
         """Create an ConfComputeMemSizeInfo instance wrapping the given NumPy array.
@@ -10919,6 +11518,12 @@ cdef class ConfComputeGpuCertificate:
         other_ = other
         return (memcmp(<void *><intptr_t>(self._ptr), <void *><intptr_t>(other_._ptr), sizeof(nvmlConfComputeGpuCertificate_t)) == 0)
 
+    def __getbuffer__(self, Py_buffer *buffer, int flags):
+        __getbuffer(self, buffer, <void *>self._ptr, sizeof(nvmlConfComputeGpuCertificate_t), self._readonly)
+
+    def __releasebuffer__(self, Py_buffer *buffer):
+        pass
+
     def __setitem__(self, key, val):
         if key == 0 and isinstance(val, _numpy.ndarray):
             self._ptr = <nvmlConfComputeGpuCertificate_t *>malloc(sizeof(nvmlConfComputeGpuCertificate_t))
@@ -10975,6 +11580,11 @@ cdef class ConfComputeGpuCertificate:
         arr[:] = _numpy.asarray(val, dtype=_numpy.uint8)
         memcpy(<void *>(&(self._ptr[0].attestationCertChain)), <void *>(arr.data), sizeof(unsigned char) * len(val))
 
+    @staticmethod
+    def from_buffer(buffer):
+        """Create a ConfComputeGpuCertificate instance with the memory from the given buffer."""
+        return __from_buffer(buffer, sizeof(nvmlConfComputeGpuCertificate_t), ConfComputeGpuCertificate)
+
     @staticmethod
     def from_data(data):
         """Create an ConfComputeGpuCertificate instance wrapping the given NumPy array.
@@ -11077,6 +11687,12 @@ cdef class ConfComputeGpuAttestationReport:
         other_ = other
         return (memcmp(<void *><intptr_t>(self._ptr), <void *><intptr_t>(other_._ptr), sizeof(nvmlConfComputeGpuAttestationReport_t)) == 0)
 
+    def __getbuffer__(self, Py_buffer *buffer, int flags):
+        __getbuffer(self, buffer, <void *>self._ptr, sizeof(nvmlConfComputeGpuAttestationReport_t), self._readonly)
+
+    def __releasebuffer__(self, Py_buffer *buffer):
+        pass
+
     def __setitem__(self, key, val):
         if key == 0 and isinstance(val, _numpy.ndarray):
             self._ptr = <nvmlConfComputeGpuAttestationReport_t *>malloc(sizeof(nvmlConfComputeGpuAttestationReport_t))
@@ -11161,6 +11777,11 @@ cdef class ConfComputeGpuAttestationReport:
         arr[:] = _numpy.asarray(val, dtype=_numpy.uint8)
         memcpy(<void *>(&(self._ptr[0].cecAttestationReport)), <void *>(arr.data), sizeof(unsigned char) * len(val))
 
+    @staticmethod
+    def from_buffer(buffer):
+        """Create a ConfComputeGpuAttestationReport instance with the memory from the given buffer."""
+        return __from_buffer(buffer, sizeof(nvmlConfComputeGpuAttestationReport_t), ConfComputeGpuAttestationReport)
+
     @staticmethod
     def from_data(data):
         """Create an ConfComputeGpuAttestationReport instance wrapping the given NumPy array.
@@ -11263,6 +11884,12 @@ cdef class GpuFabricInfo_v2:
         other_ = other
         return (memcmp(<void *><intptr_t>(self._ptr), <void *><intptr_t>(other_._ptr), sizeof(nvmlGpuFabricInfo_v2_t)) == 0)
 
+    def __getbuffer__(self, Py_buffer *buffer, int flags):
+        __getbuffer(self, buffer, <void *>self._ptr, sizeof(nvmlGpuFabricInfo_v2_t), self._readonly)
+
+    def __releasebuffer__(self, Py_buffer *buffer):
+        pass
+
     def __setitem__(self, key, val):
         if key == 0 and isinstance(val, _numpy.ndarray):
             self._ptr = <nvmlGpuFabricInfo_v2_t *>malloc(sizeof(nvmlGpuFabricInfo_v2_t))
@@ -11347,6 +11974,11 @@ cdef class GpuFabricInfo_v2:
             raise ValueError("This GpuFabricInfo_v2 instance is read-only")
         self._ptr[0].healthMask = val
 
+    @staticmethod
+    def from_buffer(buffer):
+        """Create a GpuFabricInfo_v2 instance with the memory from the given buffer."""
+        return __from_buffer(buffer, sizeof(nvmlGpuFabricInfo_v2_t), GpuFabricInfo_v2)
+
     @staticmethod
     def from_data(data):
         """Create an GpuFabricInfo_v2 instance wrapping the given NumPy array.
@@ -11446,6 +12078,12 @@ cdef class NvlinkSupportedBwModes_v1:
         other_ = other
         return (memcmp(<void *><intptr_t>(self._ptr), <void *><intptr_t>(other_._ptr), sizeof(nvmlNvlinkSupportedBwModes_v1_t)) == 0)
 
+    def __getbuffer__(self, Py_buffer *buffer, int flags):
+        __getbuffer(self, buffer, <void *>self._ptr, sizeof(nvmlNvlinkSupportedBwModes_v1_t), self._readonly)
+
+    def __releasebuffer__(self, Py_buffer *buffer):
+        pass
+
     def __setitem__(self, key, val):
         if key == 0 and isinstance(val, _numpy.ndarray):
             self._ptr = <nvmlNvlinkSupportedBwModes_v1_t *>malloc(sizeof(nvmlNvlinkSupportedBwModes_v1_t))
@@ -11491,6 +12129,11 @@ cdef class NvlinkSupportedBwModes_v1:
         arr[:] = _numpy.asarray(val, dtype=_numpy.uint8)
         memcpy(<void *>(&(self._ptr[0].bwModes)), <void *>(arr.data), sizeof(unsigned char) * len(val))
 
+    @staticmethod
+    def from_buffer(buffer):
+        """Create an NvlinkSupportedBwModes_v1 instance with the memory from the given buffer."""
+        return __from_buffer(buffer, sizeof(nvmlNvlinkSupportedBwModes_v1_t), NvlinkSupportedBwModes_v1)
+
     @staticmethod
     def from_data(data):
         """Create an NvlinkSupportedBwModes_v1 instance wrapping the given NumPy array.
@@ -11590,6 +12233,12 @@ cdef class NvlinkGetBwMode_v1:
         other_ = other
         return (memcmp(<void *><intptr_t>(self._ptr), <void *><intptr_t>(other_._ptr), sizeof(nvmlNvlinkGetBwMode_v1_t)) == 0)
 
+    def __getbuffer__(self, Py_buffer *buffer, int flags):
+        __getbuffer(self, buffer, <void *>self._ptr, sizeof(nvmlNvlinkGetBwMode_v1_t), self._readonly)
+
+    def __releasebuffer__(self, Py_buffer *buffer):
+        pass
+
     def __setitem__(self, key, val):
         if key == 0 and isinstance(val, _numpy.ndarray):
             self._ptr = <nvmlNvlinkGetBwMode_v1_t *>malloc(sizeof(nvmlNvlinkGetBwMode_v1_t))
@@ -11635,6 +12284,11 @@ cdef class NvlinkGetBwMode_v1:
             raise ValueError("This NvlinkGetBwMode_v1 instance is read-only")
         self._ptr[0].bwMode = val
 
+    @staticmethod
+    def from_buffer(buffer):
+        """Create an NvlinkGetBwMode_v1 instance with the memory from the given buffer."""
+        return __from_buffer(buffer, sizeof(nvmlNvlinkGetBwMode_v1_t), NvlinkGetBwMode_v1)
+
     @staticmethod
     def from_data(data):
         """Create an NvlinkGetBwMode_v1 instance wrapping the given NumPy array.
@@ -11734,6 +12388,12 @@ cdef class NvlinkSetBwMode_v1:
         other_ = other
         return (memcmp(<void *><intptr_t>(self._ptr), <void *><intptr_t>(other_._ptr), sizeof(nvmlNvlinkSetBwMode_v1_t)) == 0)
 
+    def __getbuffer__(self, Py_buffer *buffer, int flags):
+        __getbuffer(self, buffer, <void *>self._ptr, sizeof(nvmlNvlinkSetBwMode_v1_t), self._readonly)
+
+    def __releasebuffer__(self, Py_buffer *buffer):
+        pass
+
     def __setitem__(self, key, val):
         if key == 0 and isinstance(val, _numpy.ndarray):
             self._ptr = <nvmlNvlinkSetBwMode_v1_t *>malloc(sizeof(nvmlNvlinkSetBwMode_v1_t))
@@ -11779,6 +12439,11 @@ cdef class NvlinkSetBwMode_v1:
             raise ValueError("This NvlinkSetBwMode_v1 instance is read-only")
         self._ptr[0].bwMode = val
 
+    @staticmethod
+    def from_buffer(buffer):
+        """Create an NvlinkSetBwMode_v1 instance with the memory from the given buffer."""
+        return __from_buffer(buffer, sizeof(nvmlNvlinkSetBwMode_v1_t), NvlinkSetBwMode_v1)
+
     @staticmethod
     def from_data(data):
         """Create an NvlinkSetBwMode_v1 instance wrapping the given NumPy array.
@@ -11877,6 +12542,12 @@ cdef class VgpuVersion:
         other_ = other
         return (memcmp(<void *><intptr_t>(self._ptr), <void *><intptr_t>(other_._ptr), sizeof(nvmlVgpuVersion_t)) == 0)
 
+    def __getbuffer__(self, Py_buffer *buffer, int flags):
+        __getbuffer(self, buffer, <void *>self._ptr, sizeof(nvmlVgpuVersion_t), self._readonly)
+
+    def __releasebuffer__(self, Py_buffer *buffer):
+        pass
+
     def __setitem__(self, key, val):
         if key == 0 and isinstance(val, _numpy.ndarray):
             self._ptr = <nvmlVgpuVersion_t *>malloc(sizeof(nvmlVgpuVersion_t))
@@ -11911,6 +12582,11 @@ cdef class VgpuVersion:
             raise ValueError("This VgpuVersion instance is read-only")
         self._ptr[0].maxVersion = val
 
+    @staticmethod
+    def from_buffer(buffer):
+        """Create a VgpuVersion instance with the memory from the given buffer."""
+        return __from_buffer(buffer, sizeof(nvmlVgpuVersion_t), VgpuVersion)
+
     @staticmethod
     def from_data(data):
         """Create an VgpuVersion instance wrapping the given NumPy array.
@@ -12017,6 +12693,12 @@ cdef class VgpuMetadata:
         other_ = other
         return (memcmp(<void *><intptr_t>(self._ptr), <void *><intptr_t>(other_._ptr), sizeof(nvmlVgpuMetadata_t)) == 0)
 
+    def __getbuffer__(self, Py_buffer *buffer, int flags):
+        __getbuffer(self, buffer, <void *>self._ptr, sizeof(nvmlVgpuMetadata_t), self._readonly)
+
+    def __releasebuffer__(self, Py_buffer *buffer):
+        pass
+
     def __setitem__(self, key, val):
         if key == 0 and isinstance(val, _numpy.ndarray):
             self._ptr = <nvmlVgpuMetadata_t *>malloc(sizeof(nvmlVgpuMetadata_t))
@@ -12140,6 +12822,11 @@ cdef class VgpuMetadata:
         cdef char *ptr = buf
         memcpy(<void *>(self._ptr[0].opaqueData), <void *>ptr, 4)
 
+    @staticmethod
+    def from_buffer(buffer):
+        """Create a VgpuMetadata instance with the memory from the given buffer."""
+        return __from_buffer(buffer, sizeof(nvmlVgpuMetadata_t), VgpuMetadata)
+
     @staticmethod
     def from_data(data):
         """Create an VgpuMetadata instance wrapping the given NumPy array.
@@ -12238,6 +12925,12 @@ cdef class VgpuPgpuCompatibility:
         other_ = other
         return (memcmp(<void *><intptr_t>(self._ptr), <void *><intptr_t>(other_._ptr), sizeof(nvmlVgpuPgpuCompatibility_t)) == 0)
 
+    def __getbuffer__(self, Py_buffer *buffer, int flags):
+        __getbuffer(self, buffer, <void *>self._ptr, sizeof(nvmlVgpuPgpuCompatibility_t), self._readonly)
+
+    def __releasebuffer__(self, Py_buffer *buffer):
+        pass
+
     def __setitem__(self, key, val):
         if key == 0 and isinstance(val, _numpy.ndarray):
             self._ptr = <nvmlVgpuPgpuCompatibility_t *>malloc(sizeof(nvmlVgpuPgpuCompatibility_t))
@@ -12272,6 +12965,11 @@ cdef class VgpuPgpuCompatibility:
             raise ValueError("This VgpuPgpuCompatibility instance is read-only")
         self._ptr[0].compatibilityLimitCode = <nvmlVgpuPgpuCompatibilityLimitCode_t><int>val
 
+    @staticmethod
+    def from_buffer(buffer):
+        """Create a VgpuPgpuCompatibility instance with the memory from the given buffer."""
+        return __from_buffer(buffer, sizeof(nvmlVgpuPgpuCompatibility_t), VgpuPgpuCompatibility)
+
     @staticmethod
     def from_data(data):
         """Create an VgpuPgpuCompatibility instance wrapping the given NumPy array.
@@ -12374,6 +13072,12 @@ cdef class GpuInstancePlacement:
             return False
         return bool((self_data == other._data).all())
 
+    def __getbuffer__(self, Py_buffer *buffer, int flags):
+        cpython.PyObject_GetBuffer(self._data, buffer, flags)
+
+    def __releasebuffer__(self, Py_buffer *buffer):
+        cpython.PyBuffer_Release(buffer)
+
     @property
     def start(self):
         """Union[~_numpy.uint32, int]: """
@@ -12415,6 +13119,11 @@ cdef class GpuInstancePlacement:
     def __setitem__(self, key, val):
         self._data[key] = val
 
+    @staticmethod
+    def from_buffer(buffer):
+        """Create a GpuInstancePlacement instance with the memory from the given buffer."""
+        return GpuInstancePlacement.from_data(_numpy.frombuffer(buffer, dtype=gpu_instance_placement_dtype))
+
     @staticmethod
     def from_data(data):
         """Create an GpuInstancePlacement instance wrapping the given NumPy array.
@@ -12527,6 +13236,12 @@ cdef class GpuInstanceProfileInfo_v3:
         other_ = other
         return (memcmp(<void *><intptr_t>(self._ptr), <void *><intptr_t>(other_._ptr), sizeof(nvmlGpuInstanceProfileInfo_v3_t)) == 0)
 
+    def __getbuffer__(self, Py_buffer *buffer, int flags):
+        __getbuffer(self, buffer, <void *>self._ptr, sizeof(nvmlGpuInstanceProfileInfo_v3_t), self._readonly)
+
+    def __releasebuffer__(self, Py_buffer *buffer):
+        pass
+
     def __setitem__(self, key, val):
         if key == 0 and isinstance(val, _numpy.ndarray):
             self._ptr = <nvmlGpuInstanceProfileInfo_v3_t *>malloc(sizeof(nvmlGpuInstanceProfileInfo_v3_t))
@@ -12686,6 +13401,11 @@ cdef class GpuInstanceProfileInfo_v3:
             raise ValueError("This GpuInstanceProfileInfo_v3 instance is read-only")
         self._ptr[0].capabilities = val
 
+    @staticmethod
+    def from_buffer(buffer):
+        """Create a GpuInstanceProfileInfo_v3 instance with the memory from the given buffer."""
+        return __from_buffer(buffer, sizeof(nvmlGpuInstanceProfileInfo_v3_t), GpuInstanceProfileInfo_v3)
+
     @staticmethod
     def from_data(data):
         """Create an GpuInstanceProfileInfo_v3 instance wrapping the given NumPy array.
@@ -12788,6 +13508,12 @@ cdef class ComputeInstancePlacement:
             return False
         return bool((self_data == other._data).all())
 
+    def __getbuffer__(self, Py_buffer *buffer, int flags):
+        cpython.PyObject_GetBuffer(self._data, buffer, flags)
+
+    def __releasebuffer__(self, Py_buffer *buffer):
+        cpython.PyBuffer_Release(buffer)
+
     @property
     def start(self):
         """Union[~_numpy.uint32, int]: """
@@ -12829,6 +13555,11 @@ cdef class ComputeInstancePlacement:
     def __setitem__(self, key, val):
         self._data[key] = val
 
+    @staticmethod
+    def from_buffer(buffer):
+        """Create a ComputeInstancePlacement instance with the memory from the given buffer."""
+        return ComputeInstancePlacement.from_data(_numpy.frombuffer(buffer, dtype=compute_instance_placement_dtype))
+
     @staticmethod
     def from_data(data):
         """Create an ComputeInstancePlacement instance wrapping the given NumPy array.
@@ -12939,6 +13670,12 @@ cdef class ComputeInstanceProfileInfo_v2:
         other_ = other
         return (memcmp(<void *><intptr_t>(self._ptr), <void *><intptr_t>(other_._ptr), sizeof(nvmlComputeInstanceProfileInfo_v2_t)) == 0)
 
+    def __getbuffer__(self, Py_buffer *buffer, int flags):
+        __getbuffer(self, buffer, <void *>self._ptr, sizeof(nvmlComputeInstanceProfileInfo_v2_t), self._readonly)
+
+    def __releasebuffer__(self, Py_buffer *buffer):
+        pass
+
     def __setitem__(self, key, val):
         if key == 0 and isinstance(val, _numpy.ndarray):
             self._ptr = <nvmlComputeInstanceProfileInfo_v2_t *>malloc(sizeof(nvmlComputeInstanceProfileInfo_v2_t))
@@ -13076,6 +13813,11 @@ cdef class ComputeInstanceProfileInfo_v2:
         cdef char *ptr = buf
         memcpy(<void *>(self._ptr[0].name), <void *>ptr, 96)
 
+    @staticmethod
+    def from_buffer(buffer):
+        """Create a ComputeInstanceProfileInfo_v2 instance with the memory from the given buffer."""
+        return __from_buffer(buffer, sizeof(nvmlComputeInstanceProfileInfo_v2_t), ComputeInstanceProfileInfo_v2)
+
     @staticmethod
     def from_data(data):
         """Create an ComputeInstanceProfileInfo_v2 instance wrapping the given NumPy array.
@@ -13184,6 +13926,12 @@ cdef class ComputeInstanceProfileInfo_v3:
         other_ = other
         return (memcmp(<void *><intptr_t>(self._ptr), <void *><intptr_t>(other_._ptr), sizeof(nvmlComputeInstanceProfileInfo_v3_t)) == 0)
 
+    def __getbuffer__(self, Py_buffer *buffer, int flags):
+        __getbuffer(self, buffer, <void *>self._ptr, sizeof(nvmlComputeInstanceProfileInfo_v3_t), self._readonly)
+
+    def __releasebuffer__(self, Py_buffer *buffer):
+        pass
+
     def __setitem__(self, key, val):
         if key == 0 and isinstance(val, _numpy.ndarray):
             self._ptr = <nvmlComputeInstanceProfileInfo_v3_t *>malloc(sizeof(nvmlComputeInstanceProfileInfo_v3_t))
@@ -13332,6 +14080,11 @@ cdef class ComputeInstanceProfileInfo_v3:
             raise ValueError("This ComputeInstanceProfileInfo_v3 instance is read-only")
         self._ptr[0].capabilities = val
 
+    @staticmethod
+    def from_buffer(buffer):
+        """Create a ComputeInstanceProfileInfo_v3 instance with the memory from the given buffer."""
+        return __from_buffer(buffer, sizeof(nvmlComputeInstanceProfileInfo_v3_t), ComputeInstanceProfileInfo_v3)
+
     @staticmethod
     def from_data(data):
         """Create an ComputeInstanceProfileInfo_v3 instance wrapping the given NumPy array.
@@ -13430,6 +14183,12 @@ cdef class DeviceAddressingMode_v1:
         other_ = other
         return (memcmp(<void *><intptr_t>(self._ptr), <void *><intptr_t>(other_._ptr), sizeof(nvmlDeviceAddressingMode_v1_t)) == 0)
 
+    def __getbuffer__(self, Py_buffer *buffer, int flags):
+        __getbuffer(self, buffer, <void *>self._ptr, sizeof(nvmlDeviceAddressingMode_v1_t), self._readonly)
+
+    def __releasebuffer__(self, Py_buffer *buffer):
+        pass
+
     def __setitem__(self, key, val):
         if key == 0 and isinstance(val, _numpy.ndarray):
             self._ptr = <nvmlDeviceAddressingMode_v1_t *>malloc(sizeof(nvmlDeviceAddressingMode_v1_t))
@@ -13464,6 +14223,11 @@ cdef class DeviceAddressingMode_v1:
             raise ValueError("This DeviceAddressingMode_v1 instance is read-only")
         self._ptr[0].value = val
 
+    @staticmethod
+    def from_buffer(buffer):
+        """Create a DeviceAddressingMode_v1 instance with the memory from the given buffer."""
+        return __from_buffer(buffer, sizeof(nvmlDeviceAddressingMode_v1_t), DeviceAddressingMode_v1)
+
     @staticmethod
     def from_data(data):
         """Create an DeviceAddressingMode_v1 instance wrapping the given NumPy array.
@@ -13563,6 +14327,12 @@ cdef class RepairStatus_v1:
         other_ = other
         return (memcmp(<void *><intptr_t>(self._ptr), <void *><intptr_t>(other_._ptr), sizeof(nvmlRepairStatus_v1_t)) == 0)
 
+    def __getbuffer__(self, Py_buffer *buffer, int flags):
+        __getbuffer(self, buffer, <void *>self._ptr, sizeof(nvmlRepairStatus_v1_t), self._readonly)
+
+    def __releasebuffer__(self, Py_buffer *buffer):
+        pass
+
     def __setitem__(self, key, val):
         if key == 0 and isinstance(val, _numpy.ndarray):
             self._ptr = <nvmlRepairStatus_v1_t *>malloc(sizeof(nvmlRepairStatus_v1_t))
@@ -13608,6 +14378,11 @@ cdef class RepairStatus_v1:
             raise ValueError("This RepairStatus_v1 instance is read-only")
         self._ptr[0].bTpcRepairPending = val
 
+    @staticmethod
+    def from_buffer(buffer):
+        """Create a RepairStatus_v1 instance with the memory from the given buffer."""
+        return __from_buffer(buffer, sizeof(nvmlRepairStatus_v1_t), RepairStatus_v1)
+
     @staticmethod
     def from_data(data):
         """Create an RepairStatus_v1 instance wrapping the given NumPy array.
@@ -13707,6 +14482,12 @@ cdef class DevicePowerMizerModes_v1:
         other_ = other
         return (memcmp(<void *><intptr_t>(self._ptr), <void *><intptr_t>(other_._ptr), sizeof(nvmlDevicePowerMizerModes_v1_t)) == 0)
 
+    def __getbuffer__(self, Py_buffer *buffer, int flags):
+        __getbuffer(self, buffer, <void *>self._ptr, sizeof(nvmlDevicePowerMizerModes_v1_t), self._readonly)
+
+    def __releasebuffer__(self, Py_buffer *buffer):
+        pass
+
     def __setitem__(self, key, val):
         if key == 0 and isinstance(val, _numpy.ndarray):
             self._ptr = <nvmlDevicePowerMizerModes_v1_t *>malloc(sizeof(nvmlDevicePowerMizerModes_v1_t))
@@ -13752,6 +14533,11 @@ cdef class DevicePowerMizerModes_v1:
             raise ValueError("This DevicePowerMizerModes_v1 instance is read-only")
         self._ptr[0].supportedPowerMizerModes = val
 
+    @staticmethod
+    def from_buffer(buffer):
+        """Create a DevicePowerMizerModes_v1 instance with the memory from the given buffer."""
+        return __from_buffer(buffer, sizeof(nvmlDevicePowerMizerModes_v1_t), DevicePowerMizerModes_v1)
+
     @staticmethod
     def from_data(data):
         """Create an DevicePowerMizerModes_v1 instance wrapping the given NumPy array.
@@ -13859,6 +14645,12 @@ cdef class EccSramUniqueUncorrectedErrorEntry_v1:
             return False
         return bool((self_data == other._data).all())
 
+    def __getbuffer__(self, Py_buffer *buffer, int flags):
+        cpython.PyObject_GetBuffer(self._data, buffer, flags)
+
+    def __releasebuffer__(self, Py_buffer *buffer):
+        cpython.PyBuffer_Release(buffer)
+
     @property
     def unit(self):
         """Union[~_numpy.uint32, int]: the SRAM unit index"""
@@ -13955,6 +14747,11 @@ cdef class EccSramUniqueUncorrectedErrorEntry_v1:
     def __setitem__(self, key, val):
         self._data[key] = val
 
+    @staticmethod
+    def from_buffer(buffer):
+        """Create an EccSramUniqueUncorrectedErrorEntry_v1 instance with the memory from the given buffer."""
+        return EccSramUniqueUncorrectedErrorEntry_v1.from_data(_numpy.frombuffer(buffer, dtype=ecc_sram_unique_uncorrected_error_entry_v1_dtype))
+
     @staticmethod
     def from_data(data):
         """Create an EccSramUniqueUncorrectedErrorEntry_v1 instance wrapping the given NumPy array.
@@ -14061,6 +14858,12 @@ cdef class GpuFabricInfo_v3:
         other_ = other
         return (memcmp(<void *><intptr_t>(self._ptr), <void *><intptr_t>(other_._ptr), sizeof(nvmlGpuFabricInfo_v3_t)) == 0)
 
+    def __getbuffer__(self, Py_buffer *buffer, int flags):
+        __getbuffer(self, buffer, <void *>self._ptr, sizeof(nvmlGpuFabricInfo_v3_t), self._readonly)
+
+    def __releasebuffer__(self, Py_buffer *buffer):
+        pass
+
     def __setitem__(self, key, val):
         if key == 0 and isinstance(val, _numpy.ndarray):
             self._ptr = <nvmlGpuFabricInfo_v3_t *>malloc(sizeof(nvmlGpuFabricInfo_v3_t))
@@ -14156,6 +14959,11 @@ cdef class GpuFabricInfo_v3:
             raise ValueError("This GpuFabricInfo_v3 instance is read-only")
         self._ptr[0].healthSummary = val
 
+    @staticmethod
+    def from_buffer(buffer):
+        """Create a GpuFabricInfo_v3 instance with the memory from the given buffer."""
+        return __from_buffer(buffer, sizeof(nvmlGpuFabricInfo_v3_t), GpuFabricInfo_v3)
+
     @staticmethod
     def from_data(data):
         """Create an GpuFabricInfo_v3 instance wrapping the given NumPy array.
@@ -14254,6 +15062,12 @@ cdef class NvLinkInfo_v1:
         other_ = other
         return (memcmp(<void *><intptr_t>(self._ptr), <void *><intptr_t>(other_._ptr), sizeof(nvmlNvLinkInfo_v1_t)) == 0)
 
+    def __getbuffer__(self, Py_buffer *buffer, int flags):
+        __getbuffer(self, buffer, <void *>self._ptr, sizeof(nvmlNvLinkInfo_v1_t), self._readonly)
+
+    def __releasebuffer__(self, Py_buffer *buffer):
+        pass
+
     def __setitem__(self, key, val):
         if key == 0 and isinstance(val, _numpy.ndarray):
             self._ptr = <nvmlNvLinkInfo_v1_t *>malloc(sizeof(nvmlNvLinkInfo_v1_t))
@@ -14288,6 +15102,11 @@ cdef class NvLinkInfo_v1:
             raise ValueError("This NvLinkInfo_v1 instance is read-only")
         self._ptr[0].isNvleEnabled = val
 
+    @staticmethod
+    def from_buffer(buffer):
+        """Create an NvLinkInfo_v1 instance with the memory from the given buffer."""
+        return __from_buffer(buffer, sizeof(nvmlNvLinkInfo_v1_t), NvLinkInfo_v1)
+
     @staticmethod
     def from_data(data):
         """Create an NvLinkInfo_v1 instance wrapping the given NumPy array.
@@ -14388,6 +15207,12 @@ cdef class NvlinkFirmwareVersion:
         other_ = other
         return (memcmp(<void *><intptr_t>(self._ptr), <void *><intptr_t>(other_._ptr), sizeof(nvmlNvlinkFirmwareVersion_t)) == 0)
 
+    def __getbuffer__(self, Py_buffer *buffer, int flags):
+        __getbuffer(self, buffer, <void *>self._ptr, sizeof(nvmlNvlinkFirmwareVersion_t), self._readonly)
+
+    def __releasebuffer__(self, Py_buffer *buffer):
+        pass
+
     def __setitem__(self, key, val):
         if key == 0 and isinstance(val, _numpy.ndarray):
             self._ptr = <nvmlNvlinkFirmwareVersion_t *>malloc(sizeof(nvmlNvlinkFirmwareVersion_t))
@@ -14444,6 +15269,11 @@ cdef class NvlinkFirmwareVersion:
             raise ValueError("This NvlinkFirmwareVersion instance is read-only")
         self._ptr[0].subMinor = val
 
+    @staticmethod
+    def from_buffer(buffer):
+        """Create an NvlinkFirmwareVersion instance with the memory from the given buffer."""
+        return __from_buffer(buffer, sizeof(nvmlNvlinkFirmwareVersion_t), NvlinkFirmwareVersion)
+
     @staticmethod
     def from_data(data):
         """Create an NvlinkFirmwareVersion instance wrapping the given NumPy array.
@@ -14541,6 +15371,12 @@ cdef class PRMCounterInput_v1:
         other_ = other
         return (memcmp(<void *><intptr_t>(self._ptr), <void *><intptr_t>(other_._ptr), sizeof(nvmlPRMCounterInput_v1_t)) == 0)
 
+    def __getbuffer__(self, Py_buffer *buffer, int flags):
+        __getbuffer(self, buffer, <void *>self._ptr, sizeof(nvmlPRMCounterInput_v1_t), self._readonly)
+
+    def __releasebuffer__(self, Py_buffer *buffer):
+        pass
+
     def __setitem__(self, key, val):
         if key == 0 and isinstance(val, _numpy.ndarray):
             self._ptr = <nvmlPRMCounterInput_v1_t *>malloc(sizeof(nvmlPRMCounterInput_v1_t))
@@ -14564,6 +15400,11 @@ cdef class PRMCounterInput_v1:
             raise ValueError("This PRMCounterInput_v1 instance is read-only")
         self._ptr[0].localPort = val
 
+    @staticmethod
+    def from_buffer(buffer):
+        """Create a PRMCounterInput_v1 instance with the memory from the given buffer."""
+        return __from_buffer(buffer, sizeof(nvmlPRMCounterInput_v1_t), PRMCounterInput_v1)
+
     @staticmethod
     def from_data(data):
         """Create an PRMCounterInput_v1 instance wrapping the given NumPy array.
@@ -14662,6 +15503,12 @@ cdef class ExcludedDeviceInfo:
         other_ = other
         return (memcmp(<void *><intptr_t>(self._ptr), <void *><intptr_t>(other_._ptr), sizeof(nvmlExcludedDeviceInfo_t)) == 0)
 
+    def __getbuffer__(self, Py_buffer *buffer, int flags):
+        __getbuffer(self, buffer, <void *>self._ptr, sizeof(nvmlExcludedDeviceInfo_t), self._readonly)
+
+    def __releasebuffer__(self, Py_buffer *buffer):
+        pass
+
     def __setitem__(self, key, val):
         if key == 0 and isinstance(val, _numpy.ndarray):
             self._ptr = <nvmlExcludedDeviceInfo_t *>malloc(sizeof(nvmlExcludedDeviceInfo_t))
@@ -14701,6 +15548,11 @@ cdef class ExcludedDeviceInfo:
         cdef char *ptr = buf
         memcpy(<void *>(self._ptr[0].uuid), <void *>ptr, 80)
 
+    @staticmethod
+    def from_buffer(buffer):
+        """Create an ExcludedDeviceInfo instance with the memory from the given buffer."""
+        return __from_buffer(buffer, sizeof(nvmlExcludedDeviceInfo_t), ExcludedDeviceInfo)
+
     @staticmethod
     def from_data(data):
         """Create an ExcludedDeviceInfo instance wrapping the given NumPy array.
@@ -14803,6 +15655,12 @@ cdef class ProcessDetailList_v1:
         other_ = other
         return (memcmp(<void *><intptr_t>(self._ptr), <void *><intptr_t>(other_._ptr), sizeof(nvmlProcessDetailList_v1_t)) == 0)
 
+    def __getbuffer__(self, Py_buffer *buffer, int flags):
+        __getbuffer(self, buffer, <void *>self._ptr, sizeof(nvmlProcessDetailList_v1_t), self._readonly)
+
+    def __releasebuffer__(self, Py_buffer *buffer):
+        pass
+
     def __setitem__(self, key, val):
         if key == 0 and isinstance(val, _numpy.ndarray):
             self._ptr = <nvmlProcessDetailList_v1_t *>malloc(sizeof(nvmlProcessDetailList_v1_t))
@@ -14853,6 +15711,11 @@ cdef class ProcessDetailList_v1:
         self._ptr[0].numProcArrayEntries = len(arr)
         self._refs["proc_array"] = arr
 
+    @staticmethod
+    def from_buffer(buffer):
+        """Create a ProcessDetailList_v1 instance with the memory from the given buffer."""
+        return __from_buffer(buffer, sizeof(nvmlProcessDetailList_v1_t), ProcessDetailList_v1)
+
     @staticmethod
     def from_data(data):
         """Create an ProcessDetailList_v1 instance wrapping the given NumPy array.
@@ -14952,6 +15815,12 @@ cdef class BridgeChipHierarchy:
         other_ = other
         return (memcmp(<void *><intptr_t>(self._ptr), <void *><intptr_t>(other_._ptr), sizeof(nvmlBridgeChipHierarchy_t)) == 0)
 
+    def __getbuffer__(self, Py_buffer *buffer, int flags):
+        __getbuffer(self, buffer, <void *>self._ptr, sizeof(nvmlBridgeChipHierarchy_t), self._readonly)
+
+    def __releasebuffer__(self, Py_buffer *buffer):
+        pass
+
     def __setitem__(self, key, val):
         if key == 0 and isinstance(val, _numpy.ndarray):
             self._ptr = <nvmlBridgeChipHierarchy_t *>malloc(sizeof(nvmlBridgeChipHierarchy_t))
@@ -14981,6 +15850,11 @@ cdef class BridgeChipHierarchy:
             return
         memcpy(<void *>&(self._ptr[0].bridgeChipInfo), <void *>(val_._get_ptr()), sizeof(nvmlBridgeChipInfo_t) * self._ptr[0].bridgeCount)
 
+    @staticmethod
+    def from_buffer(buffer):
+        """Create a BridgeChipHierarchy instance with the memory from the given buffer."""
+        return __from_buffer(buffer, sizeof(nvmlBridgeChipHierarchy_t), BridgeChipHierarchy)
+
     @staticmethod
     def from_data(data):
         """Create an BridgeChipHierarchy instance wrapping the given NumPy array.
@@ -15083,6 +15957,12 @@ cdef class Sample:
             return False
         return bool((self_data == other._data).all())
 
+    def __getbuffer__(self, Py_buffer *buffer, int flags):
+        cpython.PyObject_GetBuffer(self._data, buffer, flags)
+
+    def __releasebuffer__(self, Py_buffer *buffer):
+        cpython.PyBuffer_Release(buffer)
+
     @property
     def time_stamp(self):
         """Union[~_numpy.uint64, int]: """
@@ -15122,6 +16002,11 @@ cdef class Sample:
     def __setitem__(self, key, val):
         self._data[key] = val
 
+    @staticmethod
+    def from_buffer(buffer):
+        """Create a Sample instance with the memory from the given buffer."""
+        return Sample.from_data(_numpy.frombuffer(buffer, dtype=sample_dtype))
+
     @staticmethod
     def from_data(data):
         """Create an Sample instance wrapping the given NumPy array.
@@ -15231,6 +16116,12 @@ cdef class VgpuInstanceUtilizationSample:
             return False
         return bool((self_data == other._data).all())
 
+    def __getbuffer__(self, Py_buffer *buffer, int flags):
+        cpython.PyObject_GetBuffer(self._data, buffer, flags)
+
+    def __releasebuffer__(self, Py_buffer *buffer):
+        cpython.PyBuffer_Release(buffer)
+
     @property
     def vgpu_instance(self):
         """Union[~_numpy.uint32, int]: """
@@ -15308,6 +16199,11 @@ cdef class VgpuInstanceUtilizationSample:
     def __setitem__(self, key, val):
         self._data[key] = val
 
+    @staticmethod
+    def from_buffer(buffer):
+        """Create a VgpuInstanceUtilizationSample instance with the memory from the given buffer."""
+        return VgpuInstanceUtilizationSample.from_data(_numpy.frombuffer(buffer, dtype=vgpu_instance_utilization_sample_dtype))
+
     @staticmethod
     def from_data(data):
         """Create an VgpuInstanceUtilizationSample instance wrapping the given NumPy array.
@@ -15419,6 +16315,12 @@ cdef class VgpuInstanceUtilizationInfo_v1:
             return False
         return bool((self_data == other._data).all())
 
+    def __getbuffer__(self, Py_buffer *buffer, int flags):
+        cpython.PyObject_GetBuffer(self._data, buffer, flags)
+
+    def __releasebuffer__(self, Py_buffer *buffer):
+        cpython.PyBuffer_Release(buffer)
+
     @property
     def time_stamp(self):
         """Union[~_numpy.uint64, int]: CPU Timestamp in microseconds."""
@@ -15514,6 +16416,11 @@ cdef class VgpuInstanceUtilizationInfo_v1:
     def __setitem__(self, key, val):
         self._data[key] = val
 
+    @staticmethod
+    def from_buffer(buffer):
+        """Create a VgpuInstanceUtilizationInfo_v1 instance with the memory from the given buffer."""
+        return VgpuInstanceUtilizationInfo_v1.from_data(_numpy.frombuffer(buffer, dtype=vgpu_instance_utilization_info_v1_dtype))
+
     @staticmethod
     def from_data(data):
         """Create an VgpuInstanceUtilizationInfo_v1 instance wrapping the given NumPy array.
@@ -15624,6 +16531,12 @@ cdef class FieldValue:
             return False
         return bool((self_data == other._data).all())
 
+    def __getbuffer__(self, Py_buffer *buffer, int flags):
+        cpython.PyObject_GetBuffer(self._data, buffer, flags)
+
+    def __releasebuffer__(self, Py_buffer *buffer):
+        cpython.PyBuffer_Release(buffer)
+
     @property
     def field_id(self):
         """Union[~_numpy.uint32, int]: """
@@ -15718,6 +16631,11 @@ cdef class FieldValue:
     def __setitem__(self, key, val):
         self._data[key] = val
 
+    @staticmethod
+    def from_buffer(buffer):
+        """Create a FieldValue instance with the memory from the given buffer."""
+        return FieldValue.from_data(_numpy.frombuffer(buffer, dtype=field_value_dtype))
+
     @staticmethod
     def from_data(data):
         """Create an FieldValue instance wrapping the given NumPy array.
@@ -15820,6 +16738,12 @@ cdef class PRMCounterValue_v1:
         other_ = other
         return (memcmp(<void *><intptr_t>(self._ptr), <void *><intptr_t>(other_._ptr), sizeof(nvmlPRMCounterValue_v1_t)) == 0)
 
+    def __getbuffer__(self, Py_buffer *buffer, int flags):
+        __getbuffer(self, buffer, <void *>self._ptr, sizeof(nvmlPRMCounterValue_v1_t), self._readonly)
+
+    def __releasebuffer__(self, Py_buffer *buffer):
+        pass
+
     def __setitem__(self, key, val):
         if key == 0 and isinstance(val, _numpy.ndarray):
             self._ptr = <nvmlPRMCounterValue_v1_t *>malloc(sizeof(nvmlPRMCounterValue_v1_t))
@@ -15866,6 +16790,11 @@ cdef class PRMCounterValue_v1:
             raise ValueError("This PRMCounterValue_v1 instance is read-only")
         self._ptr[0].outputType = <nvmlValueType_t><int>val
 
+    @staticmethod
+    def from_buffer(buffer):
+        """Create a PRMCounterValue_v1 instance with the memory from the given buffer."""
+        return __from_buffer(buffer, sizeof(nvmlPRMCounterValue_v1_t), PRMCounterValue_v1)
+
     @staticmethod
     def from_data(data):
         """Create an PRMCounterValue_v1 instance wrapping the given NumPy array.
@@ -15964,6 +16893,12 @@ cdef class GpuThermalSettings:
         other_ = other
         return (memcmp(<void *><intptr_t>(self._ptr), <void *><intptr_t>(other_._ptr), sizeof(nvmlGpuThermalSettings_t)) == 0)
 
+    def __getbuffer__(self, Py_buffer *buffer, int flags):
+        __getbuffer(self, buffer, <void *>self._ptr, sizeof(nvmlGpuThermalSettings_t), self._readonly)
+
+    def __releasebuffer__(self, Py_buffer *buffer):
+        pass
+
     def __setitem__(self, key, val):
         if key == 0 and isinstance(val, _numpy.ndarray):
             self._ptr = <nvmlGpuThermalSettings_t *>malloc(sizeof(nvmlGpuThermalSettings_t))
@@ -16001,6 +16936,11 @@ cdef class GpuThermalSettings:
             raise ValueError("This GpuThermalSettings instance is read-only")
         self._ptr[0].count = val
 
+    @staticmethod
+    def from_buffer(buffer):
+        """Create a GpuThermalSettings instance with the memory from the given buffer."""
+        return __from_buffer(buffer, sizeof(nvmlGpuThermalSettings_t), GpuThermalSettings)
+
     @staticmethod
     def from_data(data):
         """Create an GpuThermalSettings instance wrapping the given NumPy array.
@@ -16100,6 +17040,12 @@ cdef class ClkMonStatus:
         other_ = other
         return (memcmp(<void *><intptr_t>(self._ptr), <void *><intptr_t>(other_._ptr), sizeof(nvmlClkMonStatus_t)) == 0)
 
+    def __getbuffer__(self, Py_buffer *buffer, int flags):
+        __getbuffer(self, buffer, <void *>self._ptr, sizeof(nvmlClkMonStatus_t), self._readonly)
+
+    def __releasebuffer__(self, Py_buffer *buffer):
+        pass
+
     def __setitem__(self, key, val):
         if key == 0 and isinstance(val, _numpy.ndarray):
             self._ptr = <nvmlClkMonStatus_t *>malloc(sizeof(nvmlClkMonStatus_t))
@@ -16140,6 +17086,11 @@ cdef class ClkMonStatus:
             raise ValueError("This ClkMonStatus instance is read-only")
         self._ptr[0].bGlobalStatus = val
 
+    @staticmethod
+    def from_buffer(buffer):
+        """Create a ClkMonStatus instance with the memory from the given buffer."""
+        return __from_buffer(buffer, sizeof(nvmlClkMonStatus_t), ClkMonStatus)
+
     @staticmethod
     def from_data(data):
         """Create an ClkMonStatus instance wrapping the given NumPy array.
@@ -16242,6 +17193,12 @@ cdef class ProcessesUtilizationInfo_v1:
         other_ = other
         return (memcmp(<void *><intptr_t>(self._ptr), <void *><intptr_t>(other_._ptr), sizeof(nvmlProcessesUtilizationInfo_v1_t)) == 0)
 
+    def __getbuffer__(self, Py_buffer *buffer, int flags):
+        __getbuffer(self, buffer, <void *>self._ptr, sizeof(nvmlProcessesUtilizationInfo_v1_t), self._readonly)
+
+    def __releasebuffer__(self, Py_buffer *buffer):
+        pass
+
     def __setitem__(self, key, val):
         if key == 0 and isinstance(val, _numpy.ndarray):
             self._ptr = <nvmlProcessesUtilizationInfo_v1_t *>malloc(sizeof(nvmlProcessesUtilizationInfo_v1_t))
@@ -16292,6 +17249,11 @@ cdef class ProcessesUtilizationInfo_v1:
         self._ptr[0].processSamplesCount = len(arr)
         self._refs["proc_util_array"] = arr
 
+    @staticmethod
+    def from_buffer(buffer):
+        """Create a ProcessesUtilizationInfo_v1 instance with the memory from the given buffer."""
+        return __from_buffer(buffer, sizeof(nvmlProcessesUtilizationInfo_v1_t), ProcessesUtilizationInfo_v1)
+
     @staticmethod
     def from_data(data):
         """Create an ProcessesUtilizationInfo_v1 instance wrapping the given NumPy array.
@@ -16391,6 +17353,12 @@ cdef class GpuDynamicPstatesInfo:
         other_ = other
         return (memcmp(<void *><intptr_t>(self._ptr), <void *><intptr_t>(other_._ptr), sizeof(nvmlGpuDynamicPstatesInfo_t)) == 0)
 
+    def __getbuffer__(self, Py_buffer *buffer, int flags):
+        __getbuffer(self, buffer, <void *>self._ptr, sizeof(nvmlGpuDynamicPstatesInfo_t), self._readonly)
+
+    def __releasebuffer__(self, Py_buffer *buffer):
+        pass
+
     def __setitem__(self, key, val):
         if key == 0 and isinstance(val, _numpy.ndarray):
             self._ptr = <nvmlGpuDynamicPstatesInfo_t *>malloc(sizeof(nvmlGpuDynamicPstatesInfo_t))
@@ -16428,6 +17396,11 @@ cdef class GpuDynamicPstatesInfo:
             raise ValueError("This GpuDynamicPstatesInfo instance is read-only")
         self._ptr[0].flags = val
 
+    @staticmethod
+    def from_buffer(buffer):
+        """Create a GpuDynamicPstatesInfo instance with the memory from the given buffer."""
+        return __from_buffer(buffer, sizeof(nvmlGpuDynamicPstatesInfo_t), GpuDynamicPstatesInfo)
+
     @staticmethod
     def from_data(data):
         """Create an GpuDynamicPstatesInfo instance wrapping the given NumPy array.
@@ -16530,6 +17503,12 @@ cdef class VgpuProcessesUtilizationInfo_v1:
         other_ = other
         return (memcmp(<void *><intptr_t>(self._ptr), <void *><intptr_t>(other_._ptr), sizeof(nvmlVgpuProcessesUtilizationInfo_v1_t)) == 0)
 
+    def __getbuffer__(self, Py_buffer *buffer, int flags):
+        __getbuffer(self, buffer, <void *>self._ptr, sizeof(nvmlVgpuProcessesUtilizationInfo_v1_t), self._readonly)
+
+    def __releasebuffer__(self, Py_buffer *buffer):
+        pass
+
     def __setitem__(self, key, val):
         if key == 0 and isinstance(val, _numpy.ndarray):
             self._ptr = <nvmlVgpuProcessesUtilizationInfo_v1_t *>malloc(sizeof(nvmlVgpuProcessesUtilizationInfo_v1_t))
@@ -16580,6 +17559,11 @@ cdef class VgpuProcessesUtilizationInfo_v1:
         self._ptr[0].vgpuProcessCount = len(arr)
         self._refs["vgpu_proc_util_array"] = arr
 
+    @staticmethod
+    def from_buffer(buffer):
+        """Create a VgpuProcessesUtilizationInfo_v1 instance with the memory from the given buffer."""
+        return __from_buffer(buffer, sizeof(nvmlVgpuProcessesUtilizationInfo_v1_t), VgpuProcessesUtilizationInfo_v1)
+
     @staticmethod
     def from_data(data):
         """Create an VgpuProcessesUtilizationInfo_v1 instance wrapping the given NumPy array.
@@ -16674,6 +17658,12 @@ cdef class VgpuSchedulerParams:
         other_ = other
         return (memcmp(<void *><intptr_t>(self._ptr), <void *><intptr_t>(other_._ptr), sizeof(nvmlVgpuSchedulerParams_t)) == 0)
 
+    def __getbuffer__(self, Py_buffer *buffer, int flags):
+        __getbuffer(self, buffer, <void *>self._ptr, sizeof(nvmlVgpuSchedulerParams_t), self._readonly)
+
+    def __releasebuffer__(self, Py_buffer *buffer):
+        pass
+
     def __setitem__(self, key, val):
         if key == 0 and isinstance(val, _numpy.ndarray):
             self._ptr = <nvmlVgpuSchedulerParams_t *>malloc(sizeof(nvmlVgpuSchedulerParams_t))
@@ -16710,6 +17700,11 @@ cdef class VgpuSchedulerParams:
         cdef _py_anon_pod3 val_ = val
         memcpy(<void *>&(self._ptr[0].vgpuSchedData), <void *>(val_._get_ptr()), sizeof(_anon_pod3) * 1)
 
+    @staticmethod
+    def from_buffer(buffer):
+        """Create a VgpuSchedulerParams instance with the memory from the given buffer."""
+        return __from_buffer(buffer, sizeof(nvmlVgpuSchedulerParams_t), VgpuSchedulerParams)
+
     @staticmethod
     def from_data(data):
         """Create an VgpuSchedulerParams instance wrapping the given NumPy array.
@@ -16803,6 +17798,12 @@ cdef class VgpuSchedulerSetParams:
         other_ = other
         return (memcmp(<void *><intptr_t>(self._ptr), <void *><intptr_t>(other_._ptr), sizeof(nvmlVgpuSchedulerSetParams_t)) == 0)
 
+    def __getbuffer__(self, Py_buffer *buffer, int flags):
+        __getbuffer(self, buffer, <void *>self._ptr, sizeof(nvmlVgpuSchedulerSetParams_t), self._readonly)
+
+    def __releasebuffer__(self, Py_buffer *buffer):
+        pass
+
     def __setitem__(self, key, val):
         if key == 0 and isinstance(val, _numpy.ndarray):
             self._ptr = <nvmlVgpuSchedulerSetParams_t *>malloc(sizeof(nvmlVgpuSchedulerSetParams_t))
@@ -16839,6 +17840,11 @@ cdef class VgpuSchedulerSetParams:
         cdef _py_anon_pod5 val_ = val
         memcpy(<void *>&(self._ptr[0].vgpuSchedData), <void *>(val_._get_ptr()), sizeof(_anon_pod5) * 1)
 
+    @staticmethod
+    def from_buffer(buffer):
+        """Create a VgpuSchedulerSetParams instance with the memory from the given buffer."""
+        return __from_buffer(buffer, sizeof(nvmlVgpuSchedulerSetParams_t), VgpuSchedulerSetParams)
+
     @staticmethod
     def from_data(data):
         """Create an VgpuSchedulerSetParams instance wrapping the given NumPy array.
@@ -16938,6 +17944,12 @@ cdef class VgpuLicenseInfo:
         other_ = other
         return (memcmp(<void *><intptr_t>(self._ptr), <void *><intptr_t>(other_._ptr), sizeof(nvmlVgpuLicenseInfo_t)) == 0)
 
+    def __getbuffer__(self, Py_buffer *buffer, int flags):
+        __getbuffer(self, buffer, <void *>self._ptr, sizeof(nvmlVgpuLicenseInfo_t), self._readonly)
+
+    def __releasebuffer__(self, Py_buffer *buffer):
+        pass
+
     def __setitem__(self, key, val):
         if key == 0 and isinstance(val, _numpy.ndarray):
             self._ptr = <nvmlVgpuLicenseInfo_t *>malloc(sizeof(nvmlVgpuLicenseInfo_t))
@@ -16984,6 +17996,11 @@ cdef class VgpuLicenseInfo:
             raise ValueError("This VgpuLicenseInfo instance is read-only")
         self._ptr[0].currentState = val
 
+    @staticmethod
+    def from_buffer(buffer):
+        """Create a VgpuLicenseInfo instance with the memory from the given buffer."""
+        return __from_buffer(buffer, sizeof(nvmlVgpuLicenseInfo_t), VgpuLicenseInfo)
+
     @staticmethod
     def from_data(data):
         """Create an VgpuLicenseInfo instance wrapping the given NumPy array.
@@ -17090,6 +18107,12 @@ cdef class GridLicensableFeature:
             return False
         return bool((self_data == other._data).all())
 
+    def __getbuffer__(self, Py_buffer *buffer, int flags):
+        cpython.PyObject_GetBuffer(self._data, buffer, flags)
+
+    def __releasebuffer__(self, Py_buffer *buffer):
+        cpython.PyBuffer_Release(buffer)
+
     @property
     def feature_code(self):
         """Union[~_numpy.int32, int]: """
@@ -17169,6 +18192,11 @@ cdef class GridLicensableFeature:
     def __setitem__(self, key, val):
         self._data[key] = val
 
+    @staticmethod
+    def from_buffer(buffer):
+        """Create a GridLicensableFeature instance with the memory from the given buffer."""
+        return GridLicensableFeature.from_data(_numpy.frombuffer(buffer, dtype=grid_licensable_feature_dtype))
+
     @staticmethod
     def from_data(data):
         """Create an GridLicensableFeature instance wrapping the given NumPy array.
@@ -17270,6 +18298,12 @@ cdef class UnitFanSpeeds:
         other_ = other
         return (memcmp(<void *><intptr_t>(self._ptr), <void *><intptr_t>(other_._ptr), sizeof(nvmlUnitFanSpeeds_t)) == 0)
 
+    def __getbuffer__(self, Py_buffer *buffer, int flags):
+        __getbuffer(self, buffer, <void *>self._ptr, sizeof(nvmlUnitFanSpeeds_t), self._readonly)
+
+    def __releasebuffer__(self, Py_buffer *buffer):
+        pass
+
     def __setitem__(self, key, val):
         if key == 0 and isinstance(val, _numpy.ndarray):
             self._ptr = <nvmlUnitFanSpeeds_t *>malloc(sizeof(nvmlUnitFanSpeeds_t))
@@ -17307,6 +18341,11 @@ cdef class UnitFanSpeeds:
             raise ValueError("This UnitFanSpeeds instance is read-only")
         self._ptr[0].count = val
 
+    @staticmethod
+    def from_buffer(buffer):
+        """Create a UnitFanSpeeds instance with the memory from the given buffer."""
+        return __from_buffer(buffer, sizeof(nvmlUnitFanSpeeds_t), UnitFanSpeeds)
+
     @staticmethod
     def from_data(data):
         """Create an UnitFanSpeeds instance wrapping the given NumPy array.
@@ -17411,6 +18450,12 @@ cdef class VgpuPgpuMetadata:
         other_ = other
         return (memcmp(<void *><intptr_t>(self._ptr), <void *><intptr_t>(other_._ptr), sizeof(nvmlVgpuPgpuMetadata_t)) == 0)
 
+    def __getbuffer__(self, Py_buffer *buffer, int flags):
+        __getbuffer(self, buffer, <void *>self._ptr, sizeof(nvmlVgpuPgpuMetadata_t), self._readonly)
+
+    def __releasebuffer__(self, Py_buffer *buffer):
+        pass
+
     def __setitem__(self, key, val):
         if key == 0 and isinstance(val, _numpy.ndarray):
             self._ptr = <nvmlVgpuPgpuMetadata_t *>malloc(sizeof(nvmlVgpuPgpuMetadata_t))
@@ -17509,6 +18554,11 @@ cdef class VgpuPgpuMetadata:
         cdef char *ptr = buf
         memcpy(<void *>(self._ptr[0].opaqueData), <void *>ptr, 4)
 
+    @staticmethod
+    def from_buffer(buffer):
+        """Create a VgpuPgpuMetadata instance with the memory from the given buffer."""
+        return __from_buffer(buffer, sizeof(nvmlVgpuPgpuMetadata_t), VgpuPgpuMetadata)
+
     @staticmethod
     def from_data(data):
         """Create an VgpuPgpuMetadata instance wrapping the given NumPy array.
@@ -17609,6 +18659,12 @@ cdef class GpuInstanceInfo:
         other_ = other
         return (memcmp(<void *><intptr_t>(self._ptr), <void *><intptr_t>(other_._ptr), sizeof(nvmlGpuInstanceInfo_t)) == 0)
 
+    def __getbuffer__(self, Py_buffer *buffer, int flags):
+        __getbuffer(self, buffer, <void *>self._ptr, sizeof(nvmlGpuInstanceInfo_t), self._readonly)
+
+    def __releasebuffer__(self, Py_buffer *buffer):
+        pass
+
     def __setitem__(self, key, val):
         if key == 0 and isinstance(val, _numpy.ndarray):
             self._ptr = <nvmlGpuInstanceInfo_t *>malloc(sizeof(nvmlGpuInstanceInfo_t))
@@ -17666,6 +18722,11 @@ cdef class GpuInstanceInfo:
             raise ValueError("This GpuInstanceInfo instance is read-only")
         self._ptr[0].profileId = val
 
+    @staticmethod
+    def from_buffer(buffer):
+        """Create a GpuInstanceInfo instance with the memory from the given buffer."""
+        return __from_buffer(buffer, sizeof(nvmlGpuInstanceInfo_t), GpuInstanceInfo)
+
     @staticmethod
     def from_data(data):
         """Create an GpuInstanceInfo instance wrapping the given NumPy array.
@@ -17767,6 +18828,12 @@ cdef class ComputeInstanceInfo:
         other_ = other
         return (memcmp(<void *><intptr_t>(self._ptr), <void *><intptr_t>(other_._ptr), sizeof(nvmlComputeInstanceInfo_t)) == 0)
 
+    def __getbuffer__(self, Py_buffer *buffer, int flags):
+        __getbuffer(self, buffer, <void *>self._ptr, sizeof(nvmlComputeInstanceInfo_t), self._readonly)
+
+    def __releasebuffer__(self, Py_buffer *buffer):
+        pass
+
     def __setitem__(self, key, val):
         if key == 0 and isinstance(val, _numpy.ndarray):
             self._ptr = <nvmlComputeInstanceInfo_t *>malloc(sizeof(nvmlComputeInstanceInfo_t))
@@ -17835,6 +18902,11 @@ cdef class ComputeInstanceInfo:
             raise ValueError("This ComputeInstanceInfo instance is read-only")
         self._ptr[0].profileId = val
 
+    @staticmethod
+    def from_buffer(buffer):
+        """Create a ComputeInstanceInfo instance with the memory from the given buffer."""
+        return __from_buffer(buffer, sizeof(nvmlComputeInstanceInfo_t), ComputeInstanceInfo)
+
     @staticmethod
     def from_data(data):
         """Create an ComputeInstanceInfo instance wrapping the given NumPy array.
@@ -17936,6 +19008,12 @@ cdef class EccSramUniqueUncorrectedErrorCounts_v1:
         other_ = other
         return (memcmp(<void *><intptr_t>(self._ptr), <void *><intptr_t>(other_._ptr), sizeof(nvmlEccSramUniqueUncorrectedErrorCounts_v1_t)) == 0)
 
+    def __getbuffer__(self, Py_buffer *buffer, int flags):
+        __getbuffer(self, buffer, <void *>self._ptr, sizeof(nvmlEccSramUniqueUncorrectedErrorCounts_v1_t), self._readonly)
+
+    def __releasebuffer__(self, Py_buffer *buffer):
+        pass
+
     def __setitem__(self, key, val):
         if key == 0 and isinstance(val, _numpy.ndarray):
             self._ptr = <nvmlEccSramUniqueUncorrectedErrorCounts_v1_t *>malloc(sizeof(nvmlEccSramUniqueUncorrectedErrorCounts_v1_t))
@@ -17975,6 +19053,11 @@ cdef class EccSramUniqueUncorrectedErrorCounts_v1:
         self._ptr[0].entryCount = len(arr)
         self._refs["entries"] = arr
 
+    @staticmethod
+    def from_buffer(buffer):
+        """Create an EccSramUniqueUncorrectedErrorCounts_v1 instance with the memory from the given buffer."""
+        return __from_buffer(buffer, sizeof(nvmlEccSramUniqueUncorrectedErrorCounts_v1_t), EccSramUniqueUncorrectedErrorCounts_v1)
+
     @staticmethod
     def from_data(data):
         """Create an EccSramUniqueUncorrectedErrorCounts_v1 instance wrapping the given NumPy array.
@@ -18074,6 +19157,12 @@ cdef class NvlinkFirmwareInfo:
         other_ = other
         return (memcmp(<void *><intptr_t>(self._ptr), <void *><intptr_t>(other_._ptr), sizeof(nvmlNvlinkFirmwareInfo_t)) == 0)
 
+    def __getbuffer__(self, Py_buffer *buffer, int flags):
+        __getbuffer(self, buffer, <void *>self._ptr, sizeof(nvmlNvlinkFirmwareInfo_t), self._readonly)
+
+    def __releasebuffer__(self, Py_buffer *buffer):
+        pass
+
     def __setitem__(self, key, val):
         if key == 0 and isinstance(val, _numpy.ndarray):
             self._ptr = <nvmlNvlinkFirmwareInfo_t *>malloc(sizeof(nvmlNvlinkFirmwareInfo_t))
@@ -18111,6 +19200,11 @@ cdef class NvlinkFirmwareInfo:
             raise ValueError("This NvlinkFirmwareInfo instance is read-only")
         self._ptr[0].numValidEntries = val
 
+    @staticmethod
+    def from_buffer(buffer):
+        """Create an NvlinkFirmwareInfo instance with the memory from the given buffer."""
+        return __from_buffer(buffer, sizeof(nvmlNvlinkFirmwareInfo_t), NvlinkFirmwareInfo)
+
     @staticmethod
     def from_data(data):
         """Create an NvlinkFirmwareInfo instance wrapping the given NumPy array.
@@ -18214,6 +19308,12 @@ cdef class VgpuInstancesUtilizationInfo_v1:
         other_ = other
         return (memcmp(<void *><intptr_t>(self._ptr), <void *><intptr_t>(other_._ptr), sizeof(nvmlVgpuInstancesUtilizationInfo_v1_t)) == 0)
 
+    def __getbuffer__(self, Py_buffer *buffer, int flags):
+        __getbuffer(self, buffer, <void *>self._ptr, sizeof(nvmlVgpuInstancesUtilizationInfo_v1_t), self._readonly)
+
+    def __releasebuffer__(self, Py_buffer *buffer):
+        pass
+
     def __setitem__(self, key, val):
         if key == 0 and isinstance(val, _numpy.ndarray):
             self._ptr = <nvmlVgpuInstancesUtilizationInfo_v1_t *>malloc(sizeof(nvmlVgpuInstancesUtilizationInfo_v1_t))
@@ -18275,6 +19375,11 @@ cdef class VgpuInstancesUtilizationInfo_v1:
         self._ptr[0].vgpuInstanceCount = len(arr)
         self._refs["vgpu_util_array"] = arr
 
+    @staticmethod
+    def from_buffer(buffer):
+        """Create a VgpuInstancesUtilizationInfo_v1 instance with the memory from the given buffer."""
+        return __from_buffer(buffer, sizeof(nvmlVgpuInstancesUtilizationInfo_v1_t), VgpuInstancesUtilizationInfo_v1)
+
     @staticmethod
     def from_data(data):
         """Create an VgpuInstancesUtilizationInfo_v1 instance wrapping the given NumPy array.
@@ -18379,6 +19484,12 @@ cdef class PRMCounter_v1:
             return False
         return bool((self_data == other._data).all())
 
+    def __getbuffer__(self, Py_buffer *buffer, int flags):
+        cpython.PyObject_GetBuffer(self._data, buffer, flags)
+
+    def __releasebuffer__(self, Py_buffer *buffer):
+        cpython.PyBuffer_Release(buffer)
+
     @property
     def counter_id(self):
         """Union[~_numpy.uint32, int]: Counter ID, one of nvmlPRMCounterId_t."""
@@ -18427,6 +19538,11 @@ cdef class PRMCounter_v1:
     def __setitem__(self, key, val):
         self._data[key] = val
 
+    @staticmethod
+    def from_buffer(buffer):
+        """Create a PRMCounter_v1 instance with the memory from the given buffer."""
+        return PRMCounter_v1.from_data(_numpy.frombuffer(buffer, dtype=prm_counter_v1_dtype))
+
     @staticmethod
     def from_data(data):
         """Create an PRMCounter_v1 instance wrapping the given NumPy array.
@@ -18532,6 +19648,12 @@ cdef class VgpuSchedulerLog:
         other_ = other
         return (memcmp(<void *><intptr_t>(self._ptr), <void *><intptr_t>(other_._ptr), sizeof(nvmlVgpuSchedulerLog_t)) == 0)
 
+    def __getbuffer__(self, Py_buffer *buffer, int flags):
+        __getbuffer(self, buffer, <void *>self._ptr, sizeof(nvmlVgpuSchedulerLog_t), self._readonly)
+
+    def __releasebuffer__(self, Py_buffer *buffer):
+        pass
+
     def __setitem__(self, key, val):
         if key == 0 and isinstance(val, _numpy.ndarray):
             self._ptr = <nvmlVgpuSchedulerLog_t *>malloc(sizeof(nvmlVgpuSchedulerLog_t))
@@ -18614,6 +19736,11 @@ cdef class VgpuSchedulerLog:
             raise ValueError("This VgpuSchedulerLog instance is read-only")
         self._ptr[0].entriesCount = val
 
+    @staticmethod
+    def from_buffer(buffer):
+        """Create a VgpuSchedulerLog instance with the memory from the given buffer."""
+        return __from_buffer(buffer, sizeof(nvmlVgpuSchedulerLog_t), VgpuSchedulerLog)
+
     @staticmethod
     def from_data(data):
         """Create an VgpuSchedulerLog instance wrapping the given NumPy array.
@@ -18713,6 +19840,12 @@ cdef class VgpuSchedulerGetState:
         other_ = other
         return (memcmp(<void *><intptr_t>(self._ptr), <void *><intptr_t>(other_._ptr), sizeof(nvmlVgpuSchedulerGetState_t)) == 0)
 
+    def __getbuffer__(self, Py_buffer *buffer, int flags):
+        __getbuffer(self, buffer, <void *>self._ptr, sizeof(nvmlVgpuSchedulerGetState_t), self._readonly)
+
+    def __releasebuffer__(self, Py_buffer *buffer):
+        pass
+
     def __setitem__(self, key, val):
         if key == 0 and isinstance(val, _numpy.ndarray):
             self._ptr = <nvmlVgpuSchedulerGetState_t *>malloc(sizeof(nvmlVgpuSchedulerGetState_t))
@@ -18759,6 +19892,11 @@ cdef class VgpuSchedulerGetState:
             raise ValueError("This VgpuSchedulerGetState instance is read-only")
         self._ptr[0].arrMode = val
 
+    @staticmethod
+    def from_buffer(buffer):
+        """Create a VgpuSchedulerGetState instance with the memory from the given buffer."""
+        return __from_buffer(buffer, sizeof(nvmlVgpuSchedulerGetState_t), VgpuSchedulerGetState)
+
     @staticmethod
     def from_data(data):
         """Create an VgpuSchedulerGetState instance wrapping the given NumPy array.
@@ -18860,6 +19998,12 @@ cdef class VgpuSchedulerStateInfo_v1:
         other_ = other
         return (memcmp(<void *><intptr_t>(self._ptr), <void *><intptr_t>(other_._ptr), sizeof(nvmlVgpuSchedulerStateInfo_v1_t)) == 0)
 
+    def __getbuffer__(self, Py_buffer *buffer, int flags):
+        __getbuffer(self, buffer, <void *>self._ptr, sizeof(nvmlVgpuSchedulerStateInfo_v1_t), self._readonly)
+
+    def __releasebuffer__(self, Py_buffer *buffer):
+        pass
+
     def __setitem__(self, key, val):
         if key == 0 and isinstance(val, _numpy.ndarray):
             self._ptr = <nvmlVgpuSchedulerStateInfo_v1_t *>malloc(sizeof(nvmlVgpuSchedulerStateInfo_v1_t))
@@ -18928,6 +20072,11 @@ cdef class VgpuSchedulerStateInfo_v1:
             raise ValueError("This VgpuSchedulerStateInfo_v1 instance is read-only")
         self._ptr[0].arrMode = val
 
+    @staticmethod
+    def from_buffer(buffer):
+        """Create a VgpuSchedulerStateInfo_v1 instance with the memory from the given buffer."""
+        return __from_buffer(buffer, sizeof(nvmlVgpuSchedulerStateInfo_v1_t), VgpuSchedulerStateInfo_v1)
+
     @staticmethod
     def from_data(data):
         """Create an VgpuSchedulerStateInfo_v1 instance wrapping the given NumPy array.
@@ -19031,6 +20180,12 @@ cdef class VgpuSchedulerLogInfo_v1:
         other_ = other
         return (memcmp(<void *><intptr_t>(self._ptr), <void *><intptr_t>(other_._ptr), sizeof(nvmlVgpuSchedulerLogInfo_v1_t)) == 0)
 
+    def __getbuffer__(self, Py_buffer *buffer, int flags):
+        __getbuffer(self, buffer, <void *>self._ptr, sizeof(nvmlVgpuSchedulerLogInfo_v1_t), self._readonly)
+
+    def __releasebuffer__(self, Py_buffer *buffer):
+        pass
+
     def __setitem__(self, key, val):
         if key == 0 and isinstance(val, _numpy.ndarray):
             self._ptr = <nvmlVgpuSchedulerLogInfo_v1_t *>malloc(sizeof(nvmlVgpuSchedulerLogInfo_v1_t))
@@ -19124,6 +20279,11 @@ cdef class VgpuSchedulerLogInfo_v1:
             raise ValueError("This VgpuSchedulerLogInfo_v1 instance is read-only")
         self._ptr[0].entriesCount = val
 
+    @staticmethod
+    def from_buffer(buffer):
+        """Create a VgpuSchedulerLogInfo_v1 instance with the memory from the given buffer."""
+        return __from_buffer(buffer, sizeof(nvmlVgpuSchedulerLogInfo_v1_t), VgpuSchedulerLogInfo_v1)
+
     @staticmethod
     def from_data(data):
         """Create an VgpuSchedulerLogInfo_v1 instance wrapping the given NumPy array.
@@ -19225,6 +20385,12 @@ cdef class VgpuSchedulerState_v1:
         other_ = other
         return (memcmp(<void *><intptr_t>(self._ptr), <void *><intptr_t>(other_._ptr), sizeof(nvmlVgpuSchedulerState_v1_t)) == 0)
 
+    def __getbuffer__(self, Py_buffer *buffer, int flags):
+        __getbuffer(self, buffer, <void *>self._ptr, sizeof(nvmlVgpuSchedulerState_v1_t), self._readonly)
+
+    def __releasebuffer__(self, Py_buffer *buffer):
+        pass
+
     def __setitem__(self, key, val):
         if key == 0 and isinstance(val, _numpy.ndarray):
             self._ptr = <nvmlVgpuSchedulerState_v1_t *>malloc(sizeof(nvmlVgpuSchedulerState_v1_t))
@@ -19293,6 +20459,11 @@ cdef class VgpuSchedulerState_v1:
             raise ValueError("This VgpuSchedulerState_v1 instance is read-only")
         self._ptr[0].enableARRMode = val
 
+    @staticmethod
+    def from_buffer(buffer):
+        """Create a VgpuSchedulerState_v1 instance with the memory from the given buffer."""
+        return __from_buffer(buffer, sizeof(nvmlVgpuSchedulerState_v1_t), VgpuSchedulerState_v1)
+
     @staticmethod
     def from_data(data):
         """Create an VgpuSchedulerState_v1 instance wrapping the given NumPy array.
@@ -19392,6 +20563,12 @@ cdef class GridLicensableFeatures:
         other_ = other
         return (memcmp(<void *><intptr_t>(self._ptr), <void *><intptr_t>(other_._ptr), sizeof(nvmlGridLicensableFeatures_t)) == 0)
 
+    def __getbuffer__(self, Py_buffer *buffer, int flags):
+        __getbuffer(self, buffer, <void *>self._ptr, sizeof(nvmlGridLicensableFeatures_t), self._readonly)
+
+    def __releasebuffer__(self, Py_buffer *buffer):
+        pass
+
     def __setitem__(self, key, val):
         if key == 0 and isinstance(val, _numpy.ndarray):
             self._ptr = <nvmlGridLicensableFeatures_t *>malloc(sizeof(nvmlGridLicensableFeatures_t))
@@ -19432,6 +20609,11 @@ cdef class GridLicensableFeatures:
             raise ValueError("This GridLicensableFeatures instance is read-only")
         self._ptr[0].isGridLicenseSupported = val
 
+    @staticmethod
+    def from_buffer(buffer):
+        """Create a GridLicensableFeatures instance with the memory from the given buffer."""
+        return __from_buffer(buffer, sizeof(nvmlGridLicensableFeatures_t), GridLicensableFeatures)
+
     @staticmethod
     def from_data(data):
         """Create an GridLicensableFeatures instance wrapping the given NumPy array.
@@ -19531,6 +20713,12 @@ cdef class NvLinkInfo_v2:
         other_ = other
         return (memcmp(<void *><intptr_t>(self._ptr), <void *><intptr_t>(other_._ptr), sizeof(nvmlNvLinkInfo_v2_t)) == 0)
 
+    def __getbuffer__(self, Py_buffer *buffer, int flags):
+        __getbuffer(self, buffer, <void *>self._ptr, sizeof(nvmlNvLinkInfo_v2_t), self._readonly)
+
+    def __releasebuffer__(self, Py_buffer *buffer):
+        pass
+
     def __setitem__(self, key, val):
         if key == 0 and isinstance(val, _numpy.ndarray):
             self._ptr = <nvmlNvLinkInfo_v2_t *>malloc(sizeof(nvmlNvLinkInfo_v2_t))
@@ -19577,6 +20765,11 @@ cdef class NvLinkInfo_v2:
             raise ValueError("This NvLinkInfo_v2 instance is read-only")
         self._ptr[0].isNvleEnabled = val
 
+    @staticmethod
+    def from_buffer(buffer):
+        """Create an NvLinkInfo_v2 instance with the memory from the given buffer."""
+        return __from_buffer(buffer, sizeof(nvmlNvLinkInfo_v2_t), NvLinkInfo_v2)
+
     @staticmethod
     def from_data(data):
         """Create an NvLinkInfo_v2 instance wrapping the given NumPy array.
@@ -26227,4 +27420,4 @@ cpdef str vgpu_type_get_name(unsigned int vgpu_type_id):
     with nogil:
         __status__ = nvmlVgpuTypeGetName(<nvmlVgpuTypeId_t>vgpu_type_id, vgpu_type_name, <unsigned int*>size)
     check_status(__status__)
-    return cpython.PyUnicode_FromString(vgpu_type_name)
+    return cpython.PyUnicode_FromStringAndSize(vgpu_type_name, size[0])
diff --git a/cuda_bindings/cuda/bindings/nvrtc.pxd.in b/cuda_bindings/cuda/bindings/nvrtc.pxd.in
index fbda11a161..cb2b0c260a 100644
--- a/cuda_bindings/cuda/bindings/nvrtc.pxd.in
+++ b/cuda_bindings/cuda/bindings/nvrtc.pxd.in
@@ -1,7 +1,7 @@
-# SPDX-FileCopyrightText: Copyright (c) 2021-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 2021-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
-# This code was automatically generated with version 13.1.0. Do not modify it directly.
+# This code was automatically generated with version 13.1.0, generator version 49a8141. Do not modify it directly.
 cimport cuda.bindings.cynvrtc as cynvrtc
 
 include "_lib/utils.pxd"
diff --git a/cuda_bindings/cuda/bindings/nvrtc.pyx.in b/cuda_bindings/cuda/bindings/nvrtc.pyx.in
index 3586d33f7a..3cb0381b63 100644
--- a/cuda_bindings/cuda/bindings/nvrtc.pyx.in
+++ b/cuda_bindings/cuda/bindings/nvrtc.pyx.in
@@ -1,7 +1,7 @@
 # SPDX-FileCopyrightText: Copyright (c) 2021-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
-# This code was automatically generated with version 13.1.0, generator version fd3f910. Do not modify it directly.
+# This code was automatically generated with version 13.1.0, generator version 49a8141. Do not modify it directly.
 from typing import Any, Optional
 import cython
 import ctypes
diff --git a/cuda_bindings/cuda/bindings/nvvm.pxd b/cuda_bindings/cuda/bindings/nvvm.pxd
index d18c880860..fd8bbbdcf9 100644
--- a/cuda_bindings/cuda/bindings/nvvm.pxd
+++ b/cuda_bindings/cuda/bindings/nvvm.pxd
@@ -2,7 +2,7 @@
 #
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 #
-# This code was automatically generated across versions from 12.0.1 to 13.1.1. Do not modify it directly.
+# This code was automatically generated across versions from 12.0.1 to 13.1.1, generator version 0.3.1.dev1322+g646ce84ec. Do not modify it directly.
 
 from libc.stdint cimport intptr_t
 
diff --git a/cuda_bindings/cuda/bindings/nvvm.pyx b/cuda_bindings/cuda/bindings/nvvm.pyx
index 2f55020235..81ca09754a 100644
--- a/cuda_bindings/cuda/bindings/nvvm.pyx
+++ b/cuda_bindings/cuda/bindings/nvvm.pyx
@@ -1,8 +1,8 @@
-# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 #
-# This code was automatically generated across versions from 12.0.1 to 13.1.1. Do not modify it directly.
+# This code was automatically generated across versions from 12.0.1 to 13.1.1, generator version 0.3.1.dev1322+g646ce84ec. Do not modify it directly.
 
 cimport cython  # NOQA
 
diff --git a/cuda_bindings/cuda/bindings/runtime.pxd.in b/cuda_bindings/cuda/bindings/runtime.pxd.in
index 6d88763abb..91ecd45b31 100644
--- a/cuda_bindings/cuda/bindings/runtime.pxd.in
+++ b/cuda_bindings/cuda/bindings/runtime.pxd.in
@@ -1,7 +1,7 @@
 # SPDX-FileCopyrightText: Copyright (c) 2021-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
-# This code was automatically generated with version 13.1.0. Do not modify it directly.
+# This code was automatically generated with version 13.1.0, generator version 49a8141. Do not modify it directly.
 cimport cuda.bindings.cyruntime as cyruntime
 
 include "_lib/utils.pxd"
diff --git a/cuda_bindings/cuda/bindings/runtime.pyx.in b/cuda_bindings/cuda/bindings/runtime.pyx.in
index f4473554eb..0fe497fb41 100644
--- a/cuda_bindings/cuda/bindings/runtime.pyx.in
+++ b/cuda_bindings/cuda/bindings/runtime.pyx.in
@@ -1,7 +1,7 @@
 # SPDX-FileCopyrightText: Copyright (c) 2021-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
-# This code was automatically generated with version 13.1.0, generator version c185cc3. Do not modify it directly.
+# This code was automatically generated with version 13.1.0, generator version 49a8141. Do not modify it directly.
 from typing import Any, Optional
 import cython
 import ctypes
@@ -41333,10 +41333,10 @@ def sizeof(objType):
     {{if True}}
     if objType == VdpOutputSurface:
         return sizeof(cyruntime.VdpOutputSurface){{endif}}
-    {{if 'cudaStreamAttrValue' in found_types}}
+    {{if True}}
     if objType == cudaStreamAttrValue:
         return sizeof(cyruntime.cudaStreamAttrValue){{endif}}
-    {{if 'cudaKernelNodeAttrValue' in found_types}}
+    {{if True}}
     if objType == cudaKernelNodeAttrValue:
         return sizeof(cyruntime.cudaKernelNodeAttrValue){{endif}}
     {{if True}}
diff --git a/cuda_core/cuda/core/_memory/_managed_memory_resource.pxd b/cuda_core/cuda/core/_memory/_managed_memory_resource.pxd
index 5a73a57ee9..8dd0bbbeb1 100644
--- a/cuda_core/cuda/core/_memory/_managed_memory_resource.pxd
+++ b/cuda_core/cuda/core/_memory/_managed_memory_resource.pxd
@@ -6,4 +6,6 @@ from cuda.core._memory._memory_pool cimport _MemPool
 
 
 cdef class ManagedMemoryResource(_MemPool):
-    pass
+    cdef:
+        str _pref_loc_type
+        int _pref_loc_id
diff --git a/cuda_core/cuda/core/_memory/_managed_memory_resource.pyx b/cuda_core/cuda/core/_memory/_managed_memory_resource.pyx
index 64f523087c..4f24bd8d11 100644
--- a/cuda_core/cuda/core/_memory/_managed_memory_resource.pyx
+++ b/cuda_core/cuda/core/_memory/_managed_memory_resource.pyx
@@ -26,12 +26,35 @@ cdef class ManagedMemoryResourceOptions:
     Attributes
     ----------
     preferred_location : int | None, optional
-        The preferred device location for the managed memory.
-        Use a device ID (0, 1, 2, ...) for device preference, -1 for CPU/host,
-        or None to let the driver decide.
-        (Default to None)
+        A location identifier (device ordinal or NUMA node ID) whose
+        meaning depends on ``preferred_location_type``.
+        (Default to ``None``)
+
+    preferred_location_type : ``"device"`` | ``"host"`` | ``"host_numa"`` | None, optional
+        Controls how ``preferred_location`` is interpreted.
+
+        When set to ``None`` (the default), legacy behavior is used:
+        ``preferred_location`` is interpreted as a device ordinal,
+        ``-1`` for host, or ``None`` for no preference.
+
+        When set explicitly, the type determines both the kind of
+        preferred location and the valid values for
+        ``preferred_location``:
+
+        - ``"device"``: prefer a specific GPU. ``preferred_location``
+          must be a device ordinal (``>= 0``).
+        - ``"host"``: prefer host memory (OS-managed NUMA placement).
+          ``preferred_location`` must be ``None``.
+        - ``"host_numa"``: prefer a specific host NUMA node.
+          ``preferred_location`` must be a NUMA node ID (``>= 0``),
+          or ``None`` to derive the NUMA node from the current CUDA
+          device's ``host_numa_id`` attribute (requires an active
+          CUDA context).
+
+        (Default to ``None``)
     """
     preferred_location: int | None = None
+    preferred_location_type: str | None = None
 
 
 cdef class ManagedMemoryResource(_MemPool):
@@ -68,9 +91,26 @@ cdef class ManagedMemoryResource(_MemPool):
 
     @property
     def device_id(self) -> int:
-        """Return -1. Managed memory migrates automatically and is not tied to a specific device."""
+        """The preferred device ordinal, or -1 if the preferred location is not a device."""
+        if self._pref_loc_type == "device":
+            return self._pref_loc_id
         return -1
 
+    @property
+    def preferred_location(self) -> tuple | None:
+        """The preferred location for managed memory allocations.
+
+        Returns ``None`` if no preferred location is set (driver decides),
+        or a tuple ``(type, id)`` where *type* is one of ``"device"``,
+        ``"host"``, or ``"host_numa"``, and *id* is the device ordinal,
+        ``None`` (for ``"host"``), or the NUMA node ID, respectively.
+        """
+        if self._pref_loc_type is None:  # no preferred location type was stored
+            return None
+        if self._pref_loc_type == "host":
+            return ("host", None)  # the stored id is a -1 placeholder for "host"; expose None instead
+        return (self._pref_loc_type, self._pref_loc_id)
+
     @property
     def is_device_accessible(self) -> bool:
         """Return True. This memory resource provides device-accessible buffers."""
@@ -82,40 +122,121 @@ cdef class ManagedMemoryResource(_MemPool):
         return True
 
 
-cdef inline _MMR_init(ManagedMemoryResource self, options):
-    cdef ManagedMemoryResourceOptions opts = check_or_create_options(
-        ManagedMemoryResourceOptions, options, "ManagedMemoryResource options",
-        keep_none=True
-    )
-    cdef int location_id = -1
-    cdef object preferred_location = None
-    cdef cydriver.CUmemLocationType loc_type
-
-    if opts is not None:
-        preferred_location = opts.preferred_location
-        if preferred_location is not None:
-            location_id = preferred_location
+IF CUDA_CORE_BUILD_MAJOR >= 13:
+    cdef tuple _VALID_LOCATION_TYPES = ("device", "host", "host_numa")
+
+
+    cdef _resolve_preferred_location(ManagedMemoryResourceOptions opts):
+        """Resolve preferred-location options into driver and stored values.
+
+        Returns (CUmemLocationType, loc_id, pref_loc_type_str, pref_loc_id);
+        *opts* may be None. Raises ValueError for invalid option combinations
+        and RuntimeError when the host NUMA ID cannot be auto-derived."""
+        cdef object pref_loc = opts.preferred_location if opts is not None else None
+        cdef object pref_type = opts.preferred_location_type if opts is not None else None
 
+        if pref_type is not None and pref_type not in _VALID_LOCATION_TYPES:
+            raise ValueError(
+                f"preferred_location_type must be one of {_VALID_LOCATION_TYPES!r} "
+                f"or None, got {pref_type!r}"
+            )
+
+        if pref_type is None:
+            # Legacy behavior: None = no preference, -1 = host, >= 0 = device ordinal
+            if pref_loc is None:
+                return (
+                    cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_NONE,
+                    -1, None, -1,
+                )
+            if pref_loc == -1:
+                return (
+                    cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_HOST,
+                    -1, "host", -1,
+                )
+            if pref_loc < 0:  # any other negative value is rejected
+                raise ValueError(
+                    f"preferred_location must be a device ordinal (>= 0), -1 for "
+                    f"host, or None for no preference, got {pref_loc}"
+                )
+            return (
+                cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE,
+                pref_loc, "device", pref_loc,
+            )
+
+        if pref_type == "device":
+            if pref_loc is None or pref_loc < 0:  # explicit "device" has no -1/None shorthand
+                raise ValueError(
+                    f"preferred_location must be a device ordinal (>= 0) when "
+                    f"preferred_location_type is 'device', got {pref_loc!r}"
+                )
+            return (
+                cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE,
+                pref_loc, "device", pref_loc,
+            )
+
+        if pref_type == "host":
+            if pref_loc is not None:  # "host" takes no id
+                raise ValueError(
+                    f"preferred_location must be None when "
+                    f"preferred_location_type is 'host', got {pref_loc!r}"
+                )
+            return (
+                cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_HOST,
+                -1, "host", -1,
+            )
+
+        # pref_type == "host_numa" (the only remaining valid type)
+        if pref_loc is None:  # derive the NUMA node from the current device
+            from .._device import Device
+            dev = Device()
+            numa_id = dev.properties.host_numa_id
+            if numa_id < 0:  # a negative id is treated as "could not be determined"
+                raise RuntimeError(
+                    "Cannot determine host NUMA ID for the current CUDA device. "
+                    "The system may not support NUMA, or no CUDA context is "
+                    "active. Set preferred_location to an explicit NUMA node ID "
+                    "or call Device.set_current() first."
+                )
+            return (
+                cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_HOST_NUMA,
+                numa_id, "host_numa", numa_id,
+            )
+        if pref_loc < 0:
+            raise ValueError(
+                f"preferred_location must be a NUMA node ID (>= 0) or None "
+                f"when preferred_location_type is 'host_numa', got {pref_loc}"
+            )
+        return (
+            cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_HOST_NUMA,
+            pref_loc, "host_numa", pref_loc,
+        )
+
+
+cdef inline _MMR_init(ManagedMemoryResource self, options):
     IF CUDA_CORE_BUILD_MAJOR >= 13:
-        if preferred_location is None:
-            loc_type = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_NONE
-        elif location_id == -1:
-            loc_type = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_HOST
-        else:
-            loc_type = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE
+        cdef ManagedMemoryResourceOptions opts = check_or_create_options(
+            ManagedMemoryResourceOptions, options, "ManagedMemoryResource options",
+            keep_none=True
+        )
+        cdef cydriver.CUmemLocationType loc_type
+        cdef int loc_id
+
+        loc_type, loc_id, self._pref_loc_type, self._pref_loc_id = (
+            _resolve_preferred_location(opts)
+        )
 
         if opts is None:
             MP_init_current_pool(
                 self,
                 loc_type,
-                location_id,
+                loc_id,
                 cydriver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_MANAGED,
             )
         else:
             MP_init_create_pool(
                 self,
                 loc_type,
-                location_id,
+                loc_id,
                 cydriver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_MANAGED,
                 False,
                 0,
diff --git a/cuda_core/cuda/core/_program.pxd b/cuda_core/cuda/core/_program.pxd
index 02d436d3f3..7a6717059b 100644
--- a/cuda_core/cuda/core/_program.pxd
+++ b/cuda_core/cuda/core/_program.pxd
@@ -16,3 +16,5 @@ cdef class Program:
         object _compile_lock  # Per-instance lock for compile-time mutation
         bint _use_libdevice      # Flag for libdevice loading
         bint _libdevice_added
+        bytes _nvrtc_code       # Source code for NVRTC retry (PCH auto-resize)
+        str _pch_status         # PCH creation outcome after compile
diff --git a/cuda_core/cuda/core/_program.pyx b/cuda_core/cuda/core/_program.pyx
index 68c0476b09..0b1fa93279 100644
--- a/cuda_core/cuda/core/_program.pyx
+++ b/cuda_core/cuda/core/_program.pyx
@@ -105,6 +105,32 @@ cdef class Program:
         """
         return Program_compile(self, target_type, name_expressions, logs)
 
+    @property
+    def pch_status(self) -> str | None:
+        """PCH creation outcome from the most recent :meth:`compile` call.
+
+        Possible values:
+
+        * ``"created"`` — PCH file was written successfully.
+        * ``"not_attempted"`` — PCH creation was not attempted (e.g. the
+          compiler decided not to, or automatic PCH processing skipped it).
+        * ``"failed"`` — PCH creation failed (incl. heap exhaustion after the retry).
+        * ``None`` — PCH was not requested, the program has not been
+          compiled yet, the backend is not NVRTC (e.g. PTX or NVVM),
+          or the NVRTC bindings are too old to report status.
+
+        When ``create_pch`` is set in :class:`ProgramOptions` and the PCH
+        heap is too small, :meth:`compile` automatically resizes the heap
+        and retries, so ``"created"`` should be the common outcome.
+
+        .. note::
+
+           PCH is only supported for ``code_type="c++"`` programs that
+           use the NVRTC backend. For PTX and NVVM programs this property
+           always returns ``None``.
+        """
+        return self._pch_status  # initialized to None in Program_init; updated by the NVRTC compile path
+
     @property
     def backend(self) -> str:
         """Return this Program instance's underlying backend."""
@@ -477,6 +503,8 @@ def _find_libdevice_path():
     return find_bitcode_lib("device")
 
 
+
+
 cdef inline bint _process_define_macro_inner(list options, object macro) except? -1:
     """Process a single define macro, returning True if successful."""
     if isinstance(macro, str):
@@ -548,6 +576,8 @@ cdef inline int Program_init(Program self, object code, str code_type, object op
     self._use_libdevice = False
     self._libdevice_added = False
 
+    self._pch_status = None
+
     if code_type == "c++":
         assert_type(code, str)
         if options.extra_sources is not None:
@@ -562,6 +592,7 @@ cdef inline int Program_init(Program self, object code, str code_type, object op
             HANDLE_RETURN_NVRTC(NULL, cynvrtc.nvrtcCreateProgram(
                 &nvrtc_prog, code_ptr, name_ptr, 0, NULL, NULL))
         self._h_nvrtc = create_nvrtc_program_handle(nvrtc_prog)
+        self._nvrtc_code = code_bytes
         self._backend = "NVRTC"
         self._linker = None
 
@@ -649,9 +680,15 @@ cdef inline int Program_init(Program self, object code, str code_type, object op
     return 0
 
 
-cdef object Program_compile_nvrtc(Program self, str target_type, object name_expressions, object logs):
-    """Compile using NVRTC backend and return ObjectCode."""
-    cdef cynvrtc.nvrtcProgram prog = as_cu(self._h_nvrtc)
+cdef object _nvrtc_compile_and_extract(
+    cynvrtc.nvrtcProgram prog, str target_type, object name_expressions,
+    object logs, list options_list, str name,
+):
+    """Run nvrtcCompileProgram on *prog* and extract the output.
+
+    This is the inner compile+extract loop, factored out so the PCH
+    auto-retry path can call it on a fresh program handle.
+    """
     cdef size_t output_size = 0
     cdef size_t logsize = 0
     cdef vector[const char*] options_vec
@@ -669,7 +706,6 @@ cdef object Program_compile_nvrtc(Program self, str target_type, object name_exp
             HANDLE_RETURN_NVRTC(prog, cynvrtc.nvrtcAddNameExpression(prog, name_ptr))
 
     # Build options array
-    options_list = self._options.as_bytes("nvrtc", target_type)
     options_vec.resize(len(options_list))
     for i in range(len(options_list)):
         options_vec[i] = <const char*>(<bytes>options_list[i])
@@ -716,7 +752,84 @@ cdef object Program_compile_nvrtc(Program self, str target_type, object name_exp
                 HANDLE_RETURN_NVRTC(prog, cynvrtc.nvrtcGetProgramLog(prog, data_ptr))
             logs.write(log.decode("utf-8", errors="backslashreplace"))
 
-    return ObjectCode._init(bytes(data), target_type, symbol_mapping=symbol_mapping, name=self._options.name)
+    return ObjectCode._init(bytes(data), target_type, symbol_mapping=symbol_mapping, name=name)
+
+
+cdef int _nvrtc_pch_apis_cached = -1  # -1 = unchecked
+
+cdef bint _has_nvrtc_pch_apis():  # memoized check that `nvrtc` exposes nvrtcGetPCHCreateStatus
+    global _nvrtc_pch_apis_cached
+    if _nvrtc_pch_apis_cached < 0:  # -1 means the probe has not run yet
+        _nvrtc_pch_apis_cached = hasattr(nvrtc, "nvrtcGetPCHCreateStatus")
+    return _nvrtc_pch_apis_cached
+
+
+cdef str _PCH_STATUS_CREATED = "created"
+cdef str _PCH_STATUS_NOT_ATTEMPTED = "not_attempted"
+cdef str _PCH_STATUS_FAILED = "failed"
+
+
+cdef str _read_pch_status(cynvrtc.nvrtcProgram prog):
+    """Map nvrtcGetPCHCreateStatus(prog) to a status string; None means the PCH heap was exhausted."""
+    cdef cynvrtc.nvrtcResult err
+    with nogil:
+        err = cynvrtc.nvrtcGetPCHCreateStatus(prog)
+    if err == cynvrtc.nvrtcResult.NVRTC_SUCCESS:
+        return _PCH_STATUS_CREATED
+    if err == cynvrtc.nvrtcResult.NVRTC_ERROR_PCH_CREATE_HEAP_EXHAUSTED:
+        return None  # sentinel: caller should auto-retry
+    if err == cynvrtc.nvrtcResult.NVRTC_ERROR_NO_PCH_CREATE_ATTEMPTED:
+        return _PCH_STATUS_NOT_ATTEMPTED
+    return _PCH_STATUS_FAILED  # every other result code is reported as a failure
+
+
+cdef object Program_compile_nvrtc(Program self, str target_type, object name_expressions, object logs):
+    """Compile with NVRTC and return ObjectCode; retries once with a resized PCH heap if it was exhausted."""
+    cdef cynvrtc.nvrtcProgram prog = as_cu(self._h_nvrtc)
+    cdef list options_list = self._options.as_bytes("nvrtc", target_type)
+
+    result = _nvrtc_compile_and_extract(
+        prog, target_type, name_expressions, logs, options_list, self._options.name,
+    )
+
+    cdef bint pch_creation_possible = self._options.create_pch or self._options.pch
+    if not pch_creation_possible or not _has_nvrtc_pch_apis():
+        self._pch_status = None  # nothing to report: no PCH option set, or the status API was not found
+        return result
+
+    try:
+        status = _read_pch_status(prog)
+    except RuntimeError as e:
+        raise RuntimeError(
+            "PCH was requested but the runtime libnvrtc does not support "
+            "PCH APIs. Update to CUDA toolkit 12.8 or newer."
+        ) from e
+
+    if status is not None:  # definitive outcome; None is the heap-exhausted sentinel
+        self._pch_status = status
+        return result
+
+    # Heap exhausted — auto-resize and retry with a fresh program
+    cdef size_t required = 0
+    with nogil:
+        HANDLE_RETURN_NVRTC(prog, cynvrtc.nvrtcGetPCHHeapSizeRequired(prog, &required))
+        HANDLE_RETURN_NVRTC(NULL, cynvrtc.nvrtcSetPCHHeapSize(required))
+
+    cdef cynvrtc.nvrtcProgram retry_prog
+    cdef const char* code_ptr = <const char*>self._nvrtc_code
+    cdef const char* name_ptr = <const char*>self._options._name
+    with nogil:
+        HANDLE_RETURN_NVRTC(NULL, cynvrtc.nvrtcCreateProgram(
+            &retry_prog, code_ptr, name_ptr, 0, NULL, NULL))
+    self._h_nvrtc = create_nvrtc_program_handle(retry_prog)  # the retry program replaces the stored handle
+
+    result = _nvrtc_compile_and_extract(
+        retry_prog, target_type, name_expressions, logs, options_list, self._options.name,
+    )
+
+    status = _read_pch_status(retry_prog)
+    self._pch_status = status if status is not None else _PCH_STATUS_FAILED  # still exhausted -> "failed"
+    return result
 
 
 cdef object Program_compile_nvvm(Program self, str target_type, object logs):
diff --git a/cuda_core/cuda/core/_stream.pyx b/cuda_core/cuda/core/_stream.pyx
index c6c25874c8..bada70c7b9 100644
--- a/cuda_core/cuda/core/_stream.pyx
+++ b/cuda_core/cuda/core/_stream.pyx
@@ -470,18 +470,14 @@ cdef Stream Stream_accept(arg, bint allow_stream_protocol=False):
         return <Stream>(arg)
     elif isinstance(arg, GraphBuilder):
         return <Stream>(arg.stream)
-    elif allow_stream_protocol:
-        try:
-            stream = Stream._init(arg)
-        except:
-            pass
-        else:
-            warnings.warn(
-                "Passing foreign stream objects to this function via the "
-                "stream protocol is deprecated. Convert the object explicitly "
-                "using Stream(obj) instead.",
-                stacklevel=2,
-                category=DeprecationWarning,
-            )
-            return <Stream>(stream)
+    elif allow_stream_protocol and hasattr(arg, "__cuda_stream__"):
+        stream = Stream._init(arg)
+        warnings.warn(
+            "Passing foreign stream objects to this function via the "
+            "stream protocol is deprecated. Convert the object explicitly "
+            "using Stream(obj) instead.",
+            stacklevel=2,
+            category=DeprecationWarning,
+        )
+        return <Stream>(stream)
     raise TypeError(f"Stream or GraphBuilder expected, got {type(arg).__name__}")
diff --git a/cuda_core/docs/source/release/0.6.0-notes.rst b/cuda_core/docs/source/release/0.6.0-notes.rst
index b7d6188cc2..654eb7641b 100644
--- a/cuda_core/docs/source/release/0.6.0-notes.rst
+++ b/cuda_core/docs/source/release/0.6.0-notes.rst
@@ -54,6 +54,11 @@ New features
 - Added CUDA version compatibility check at import time to detect mismatches between
   ``cuda.core`` and the installed ``cuda-bindings`` version.
 
+- ``Program.compile()`` now automatically resizes the NVRTC PCH heap and
+  retries when precompiled header creation fails due to heap exhaustion.
+  The ``pch_status`` property reports the PCH creation outcome
+  (``"created"``, ``"not_attempted"``, ``"failed"``, or ``None``).
+
 
 Fixes and enhancements
 ----------------------
diff --git a/cuda_core/docs/source/release/0.7.x-notes.rst b/cuda_core/docs/source/release/0.7.x-notes.rst
new file mode 100644
index 0000000000..032f5a7005
--- /dev/null
+++ b/cuda_core/docs/source/release/0.7.x-notes.rst
@@ -0,0 +1,57 @@
+.. SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+.. SPDX-License-Identifier: Apache-2.0
+
+.. currentmodule:: cuda.core
+
+``cuda.core`` 0.7.x Release Notes
+=================================
+
+
+Highlights
+----------
+
+None.
+
+
+Breaking Changes
+----------------
+
+None.
+
+
+New features
+------------
+
+- Added ``preferred_location_type`` option to :class:`ManagedMemoryResourceOptions`
+  for explicit control over the preferred location kind (``"device"``,
+  ``"host"``, or ``"host_numa"``). This enables NUMA-aware managed memory
+  pool placement. The existing ``preferred_location`` parameter retains full
+  backwards compatibility when ``preferred_location_type`` is not set.
+
+- Added :attr:`ManagedMemoryResource.preferred_location` property to query the
+  resolved preferred location of a managed memory pool. Returns ``None`` for no
+  preference, or a tuple such as ``("device", 0)``, ``("host", None)``, or
+  ``("host_numa", 3)``.
+
+- Added ``numa_id`` option to :class:`PinnedMemoryResourceOptions` for explicit
+  control over host NUMA node placement. When ``ipc_enabled=True`` and
+  ``numa_id`` is not set, the NUMA node is automatically derived from the
+  current CUDA device.
+
+- Added :attr:`PinnedMemoryResource.numa_id` property to query the host NUMA
+  node ID used for pool placement. Returns ``-1`` for OS-managed placement.
+
+
+New examples
+------------
+
+None.
+
+
+Fixes and enhancements
+----------------------
+
+- Fixed IPC-enabled pinned memory pools using a hardcoded NUMA node ID of ``0``
+  instead of the NUMA node closest to the active CUDA device. On multi-NUMA
+  systems where the device is attached to a non-zero host NUMA node, this could
+  cause pool creation or allocation failures. (:issue:`1603`)
diff --git a/cuda_core/pyproject.toml b/cuda_core/pyproject.toml
index b08f435031..a2828a9274 100644
--- a/cuda_core/pyproject.toml
+++ b/cuda_core/pyproject.toml
@@ -57,12 +57,12 @@ cu13 = ["cuda-bindings[all]==13.*"]
 [dependency-groups]
 test = ["cython>=3.2,<3.3", "setuptools", "pytest>=6.2.4", "pytest-randomly", "pytest-repeat", "pytest-rerunfailures"]
 ml-dtypes = ["ml-dtypes>=0.5.4,<0.6.0"]
-test-cu12 = [ {include-group = "ml-dtypes" }, "cuda-core[test]", "cupy-cuda12x; python_version < '3.14'", "cuda-toolkit[cudart]==12.*"]  # runtime headers needed by CuPy
-test-cu13 = [ {include-group = "ml-dtypes" }, "cuda-core[test]", "cupy-cuda13x; python_version < '3.14'", "cuda-toolkit[cudart]==13.*"]  # runtime headers needed by CuPy
+test-cu12 = [ {include-group = "ml-dtypes" }, {include-group = "test" }, "cupy-cuda12x; python_version < '3.14'", "cuda-toolkit[cudart]==12.*"]  # runtime headers needed by CuPy
+test-cu13 = [ {include-group = "ml-dtypes" }, {include-group = "test" }, "cupy-cuda13x; python_version < '3.14'", "cuda-toolkit[cudart]==13.*"]  # runtime headers needed by CuPy
 # free threaded build, cupy doesn't support free-threaded builds yet, so avoid installing it for now
 # TODO: cupy should support free threaded builds
-test-cu12-ft = [ {include-group = "ml-dtypes" }, "cuda-core[test]", "cuda-toolkit[cudart]==12.*"]
-test-cu13-ft = [ {include-group = "ml-dtypes" }, "cuda-core[test]", "cuda-toolkit[cudart]==13.*"]
+test-cu12-ft = [ {include-group = "ml-dtypes" }, {include-group = "test" }, "cuda-toolkit[cudart]==12.*"]
+test-cu13-ft = [ {include-group = "ml-dtypes" }, {include-group = "test" }, "cuda-toolkit[cudart]==13.*"]
 
 [project.urls]
 homepage = "https://nvidia.github.io/cuda-python/"
diff --git a/cuda_core/tests/conftest.py b/cuda_core/tests/conftest.py
index 4e1500b491..df20d76aed 100644
--- a/cuda_core/tests/conftest.py
+++ b/cuda_core/tests/conftest.py
@@ -57,6 +57,12 @@ def skip_if_managed_memory_unsupported(device):
             pytest.skip("Device does not support managed memory pool operations")
     except AttributeError:
         pytest.skip("ManagedMemoryResource requires CUDA 13.0 or later")
+    try:
+        ManagedMemoryResource()
+    except RuntimeError as e:
+        if "requires CUDA 13.0" in str(e):
+            pytest.skip("ManagedMemoryResource requires CUDA 13.0 or later")
+        raise
 
 
 def create_managed_memory_resource_or_skip(*args, **kwargs):
diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py
index 8933dcba09..0f63d9b9f8 100644
--- a/cuda_core/tests/test_memory.py
+++ b/cuda_core/tests/test_memory.py
@@ -998,6 +998,155 @@ def test_managed_memory_resource_with_options(init_cuda):
     src_buffer.close()
 
 
+def test_managed_memory_resource_preferred_location_default(init_cuda):
+    """preferred_location property returns None when no preference is set."""
+    device = Device()
+    skip_if_managed_memory_unsupported(device)
+    device.set_current()
+
+    mr = create_managed_memory_resource_or_skip()  # no options passed
+    assert mr.preferred_location is None
+
+
+def test_managed_memory_resource_preferred_location_device(init_cuda):
+    """preferred_location returns ("device", ordinal) for device preference."""
+    device = Device()
+    skip_if_managed_memory_unsupported(device)
+    device.set_current()
+
+    # Legacy style: bare device ordinal, preferred_location_type left unset
+    opts = ManagedMemoryResourceOptions(preferred_location=device.device_id)
+    mr = create_managed_memory_resource_or_skip(opts)
+    assert mr.preferred_location == ("device", device.device_id)
+
+    # Explicit style: same ordinal with preferred_location_type="device"
+    opts = ManagedMemoryResourceOptions(
+        preferred_location=device.device_id,
+        preferred_location_type="device",
+    )
+    mr = create_managed_memory_resource_or_skip(opts)
+    assert mr.preferred_location == ("device", device.device_id)
+
+
+def test_managed_memory_resource_preferred_location_host(init_cuda):
+    """preferred_location returns ("host", None) for host preference."""
+    device = Device()
+    skip_if_managed_memory_unsupported(device)
+    device.set_current()
+
+    # Legacy style: -1 selects host
+    opts = ManagedMemoryResourceOptions(preferred_location=-1)
+    mr = create_managed_memory_resource_or_skip(opts)
+    assert mr.preferred_location == ("host", None)
+
+    # Explicit style: type only, preferred_location stays None
+    opts = ManagedMemoryResourceOptions(preferred_location_type="host")
+    mr = create_managed_memory_resource_or_skip(opts)
+    assert mr.preferred_location == ("host", None)
+
+
+def test_managed_memory_resource_preferred_location_host_numa(init_cuda):
+    """preferred_location returns ("host_numa", id) for NUMA preference."""
+    device = Device()
+    skip_if_managed_memory_unsupported(device)
+    device.set_current()
+
+    numa_id = device.properties.host_numa_id
+    if numa_id < 0:  # a negative host_numa_id is taken to mean NUMA is unavailable
+        pytest.skip("System does not support NUMA")
+
+    # Auto-resolved from current device (preferred_location left as None)
+    opts = ManagedMemoryResourceOptions(preferred_location_type="host_numa")
+    mr = create_managed_memory_resource_or_skip(opts)
+    assert mr.preferred_location == ("host_numa", numa_id)
+
+    # Explicit NUMA node ID (same node, passed explicitly)
+    opts = ManagedMemoryResourceOptions(
+        preferred_location=numa_id,
+        preferred_location_type="host_numa",
+    )
+    mr = create_managed_memory_resource_or_skip(opts)
+    assert mr.preferred_location == ("host_numa", numa_id)
+
+
+def test_managed_memory_resource_preferred_location_validation(init_cuda):
+    """Invalid preferred_location combinations raise errors."""
+    device = Device()
+    skip_if_managed_memory_unsupported(device)
+    device.set_current()
+
+    # Invalid preferred_location_type (not "device", "host" or "host_numa")
+    with pytest.raises(ValueError, match="preferred_location_type must be one of"):
+        ManagedMemoryResource(
+            ManagedMemoryResourceOptions(
+                preferred_location_type="invalid",
+            )
+        )
+
+    # "device" requires a non-negative int: first a missing ordinal, then -1
+    with pytest.raises(ValueError, match="must be a device ordinal"):
+        ManagedMemoryResource(
+            ManagedMemoryResourceOptions(
+                preferred_location_type="device",
+            )
+        )
+    with pytest.raises(ValueError, match="must be a device ordinal"):
+        ManagedMemoryResource(
+            ManagedMemoryResourceOptions(
+                preferred_location=-1,
+                preferred_location_type="device",
+            )
+        )
+
+    # "host" requires preferred_location=None
+    with pytest.raises(ValueError, match="must be None"):
+        ManagedMemoryResource(
+            ManagedMemoryResourceOptions(
+                preferred_location=0,
+                preferred_location_type="host",
+            )
+        )
+
+    # "host_numa" rejects negative IDs
+    with pytest.raises(ValueError, match="must be a NUMA node ID"):
+        ManagedMemoryResource(
+            ManagedMemoryResourceOptions(
+                preferred_location=-1,
+                preferred_location_type="host_numa",
+            )
+        )
+
+    # Legacy mode rejects invalid negative values (-1 itself means host)
+    with pytest.raises(ValueError, match="preferred_location must be"):
+        ManagedMemoryResource(
+            ManagedMemoryResourceOptions(
+                preferred_location=-2,
+            )
+        )
+
+
+def test_managed_memory_resource_host_numa_auto_resolve_failure(init_cuda):
+    """host_numa with None raises RuntimeError when NUMA ID cannot be determined."""
+    from unittest.mock import MagicMock, patch
+
+    device = Device()
+    skip_if_managed_memory_unsupported(device)
+    device.set_current()
+
+    mock_dev = MagicMock()
+    mock_dev.properties.host_numa_id = -1  # stand-in device reporting a negative host NUMA ID
+
+    with (
+        patch("cuda.core._device.Device", return_value=mock_dev),  # Device() now returns the mock
+        pytest.raises(RuntimeError, match="Cannot determine host NUMA ID"),
+    ):
+        ManagedMemoryResource(
+            ManagedMemoryResourceOptions(
+                preferred_location_type="host_numa",
+            )
+        )
+
+
 def test_mempool_ipc_errors(mempool_device):
     """Test error cases when IPC operations are disabled."""
     device = mempool_device
diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py
index edf249eb60..0005777b52 100644
--- a/cuda_core/tests/test_program.py
+++ b/cuda_core/tests/test_program.py
@@ -57,6 +57,22 @@ def _get_nvrtc_version_for_tests():
         return None
 
 
+def _has_nvrtc_pch_apis_for_tests():
+    required = (
+        "nvrtcGetPCHHeapSize",
+        "nvrtcSetPCHHeapSize",
+        "nvrtcGetPCHCreateStatus",
+        "nvrtcGetPCHHeapSizeRequired",
+    )
+    return all(hasattr(nvrtc, name) for name in required)
+
+
+nvrtc_pch_available = pytest.mark.skipif(
+    (_get_nvrtc_version_for_tests() or 0) < 12800 or not _has_nvrtc_pch_apis_for_tests(),
+    reason="PCH runtime APIs require NVRTC >= 12.8 bindings",
+)
+
+
 _libnvvm_version = None
 _libnvvm_version_attempted = False
 
@@ -316,6 +332,25 @@ def test_cpp_program_with_pch_options(init_cuda, tmp_path):
         program.close()
 
 
+@nvrtc_pch_available
+def test_cpp_program_pch_auto_creates(init_cuda, tmp_path):
+    code = 'extern "C" __global__ void my_kernel() {}'
+    pch_path = str(tmp_path / "test.pch")
+    program = Program(code, "c++", ProgramOptions(create_pch=pch_path))
+    assert program.pch_status is None  # not compiled yet
+    program.compile("ptx")
+    assert program.pch_status in ("created", "not_attempted", "failed")
+    program.close()
+
+
+def test_cpp_program_pch_status_none_without_pch(init_cuda):
+    code = 'extern "C" __global__ void my_kernel() {}'
+    program = Program(code, "c++")
+    program.compile("ptx")
+    assert program.pch_status is None
+    program.close()
+
+
 options = [
     ProgramOptions(max_register_count=32),
     ProgramOptions(debug=True),
diff --git a/pytest.ini b/pytest.ini
index 0543760cd7..978e659bf0 100644
--- a/pytest.ini
+++ b/pytest.ini
@@ -19,3 +19,4 @@ markers =
     core: tests for cuda_core
     cython: cython tests
     smoke: meta-level smoke tests
+    flaky: mark test as flaky (provided by pytest-rerunfailures)

From c7f85ff825ec3dd7524a2ac0baebd2d872ddf00d Mon Sep 17 00:00:00 2001
From: Andy Jost <ajost@nvidia.com>
Date: Wed, 4 Mar 2026 09:55:45 -0800
Subject: [PATCH 4/5] Remove redundant Python-side peer access cleanup; fix
 peer access tests

- Remove __dealloc__ and close() override from DeviceMemoryResource
  that cleared peer access before destruction. The C++ RAII deleter
  already handles this for owned pools (nvbug 5698116 workaround).
  For non-owned pools (default device pool), clearing peer access
  on handle disposal was incorrect behavior.

- Update peer access tests to use owned pools (DeviceMemoryResourceOptions())
  instead of default pools. Default pools are shared and may have stale
  peer access state from prior tests, causing test failures.

Made-with: Cursor
---
 .../cuda/core/_memory/_device_memory_resource.pyx  | 14 --------------
 cuda_core/tests/test_memory_peer_access.py         | 12 ++++++++----
 2 files changed, 8 insertions(+), 18 deletions(-)

diff --git a/cuda_core/cuda/core/_memory/_device_memory_resource.pyx b/cuda_core/cuda/core/_memory/_device_memory_resource.pyx
index 09aa482234..1299f1bd57 100644
--- a/cuda_core/cuda/core/_memory/_device_memory_resource.pyx
+++ b/cuda_core/cuda/core/_memory/_device_memory_resource.pyx
@@ -135,20 +135,6 @@ cdef class DeviceMemoryResource(_MemPool):
     def __init__(self, device_id: Device | int, options=None):
         _DMR_init(self, device_id, options)
 
-    def __dealloc__(self):
-        try:
-            self.close()
-        except Exception:
-            pass
-
-    def close(self):
-        """Close the memory resource, revoking peer access before destruction."""
-        # nvbug 5698116: clear peer access before pool destruction; also
-        # needed for non-owned (default) pools to undo modifications.
-        if self._peer_accessible_by:
-            _DMR_set_peer_accessible_by(self, [])
-        super().close()
-
     def __reduce__(self):
         return DeviceMemoryResource.from_registry, (self.uuid,)
 
diff --git a/cuda_core/tests/test_memory_peer_access.py b/cuda_core/tests/test_memory_peer_access.py
index bcae9576da..99426391db 100644
--- a/cuda_core/tests/test_memory_peer_access.py
+++ b/cuda_core/tests/test_memory_peer_access.py
@@ -3,7 +3,7 @@
 
 import cuda.core
 import pytest
-from cuda.core import DeviceMemoryResource
+from cuda.core import DeviceMemoryResource, DeviceMemoryResourceOptions
 from cuda.core._utils.cuda_utils import CUDAError
 from helpers.buffers import PatternGen, compare_buffer_to_constant, make_scratch_buffer
 
@@ -16,7 +16,8 @@ def test_peer_access_basic(mempool_device_x2):
     zero_on_dev0 = make_scratch_buffer(dev0, 0, NBYTES)
     one_on_dev0 = make_scratch_buffer(dev0, 1, NBYTES)
     stream_on_dev0 = dev0.create_stream()
-    dmr_on_dev1 = DeviceMemoryResource(dev1)
+    # Use owned pool to ensure clean initial state (no stale peer access).
+    dmr_on_dev1 = DeviceMemoryResource(dev1, DeviceMemoryResourceOptions())
     buf_on_dev1 = dmr_on_dev1.allocate(NBYTES)
 
     # No access at first.
@@ -51,7 +52,8 @@ def test_peer_access_property_x2(mempool_device_x2):
     # The peer access list is a sorted tuple and always excludes the self
     # device.
     dev0, dev1 = mempool_device_x2
-    dmr = DeviceMemoryResource(dev0)
+    # Use owned pool to ensure clean initial state (no stale peer access).
+    dmr = DeviceMemoryResource(dev0, DeviceMemoryResourceOptions())
 
     def check(expected):
         assert isinstance(dmr.peer_accessible_by, tuple)
@@ -97,7 +99,9 @@ def test_peer_access_transitions(mempool_device_x3):
     # Allocate per-device resources.
     streams = [dev.create_stream() for dev in devs]
     pgens = [PatternGen(devs[i], NBYTES, streams[i]) for i in range(3)]
-    dmrs = [DeviceMemoryResource(dev) for dev in devs]
+    # Use owned pools (with options) to ensure clean initial state.
+    # Default pools are shared and may have stale peer access from prior tests.
+    dmrs = [DeviceMemoryResource(dev, DeviceMemoryResourceOptions()) for dev in devs]
     bufs = [dmr.allocate(NBYTES) for dmr in dmrs]
 
     def verify_state(state, pattern_seed):

From 217dbf1d8484f8887ad6f7c487308e62a537dbef Mon Sep 17 00:00:00 2001
From: Andy Jost <ajost@nvidia.com>
Date: Mon, 9 Mar 2026 12:34:23 -0700
Subject: [PATCH 5/5] Fix DeviceMemoryResource.peer_accessible_by for non-owned
 pools

For non-owned (default/current) pools, always query the CUDA driver
for peer access state instead of caching. This ensures multiple
wrappers around the same shared pool see consistent state.

Closes #1720

Made-with: Cursor
---
 .github/actions/fetch_ctk/action.yml          |   2 +-
 .github/workflows/bandit.yml                  |   2 +-
 .github/workflows/build-wheel.yml             |   2 +-
 .github/workflows/codeql.yml                  |   4 +-
 .github/workflows/coverage.yml                |   3 +-
 .github/workflows/test-wheel-linux.yml        |   2 +-
 .github/workflows/test-wheel-windows.yml      |   2 +-
 AGENTS.md                                     | 340 +++++++++++++----
 CLAUDE.md                                     |   1 +
 cuda_bindings/AGENTS.md                       |  67 ++++
 cuda_bindings/CLAUDE.md                       |   1 +
 cuda_bindings/docs/source/conf.py             |   1 +
 .../0_Introduction/clock_nvrtc_test.py        |  92 ++---
 .../simpleCubemapTexture_test.py              | 180 ++++-----
 .../examples/0_Introduction/simpleP2P_test.py | 168 +++++----
 .../0_Introduction/simpleZeroCopy_test.py     | 122 +++---
 .../0_Introduction/systemWideAtomics_test.py  |  98 ++---
 .../0_Introduction/vectorAddDrv_test.py       | 136 +++----
 .../0_Introduction/vectorAddMMAP_test.py      | 210 +++++------
 .../streamOrderedAllocation_test.py           | 178 ++++-----
 .../globalToShmemAsyncCopy_test.py            | 352 +++++++++---------
 .../3_CUDA_Features/simpleCudaGraphs_test.py  | 328 ++++++++--------
 .../conjugateGradientMultiBlockCG_test.py     | 270 +++++++-------
 cuda_bindings/examples/common/common.py       |  78 ++--
 cuda_bindings/examples/common/helper_cuda.py  |  34 +-
 .../examples/common/helper_string.py          |   8 +-
 .../examples/extra/isoFDModelling_test.py     | 326 ++++++++--------
 .../examples/extra/jit_program_test.py        | 134 ++++---
 cuda_bindings/tests/nvml/test_device.py       |  12 +-
 cuda_core/AGENTS.md                           |  65 ++++
 cuda_core/CLAUDE.md                           |   1 +
 cuda_core/cuda/core/_linker.pyx               |  22 +-
 .../core/_memory/_device_memory_resource.pyx  |  35 +-
 cuda_core/cuda/core/_memory/_ipc.pyx          |   4 +
 cuda_core/cuda/core/_memoryview.pyx           |   4 +
 cuda_core/cuda/core/_program.pyx              |  22 +-
 cuda_core/docs/source/conf.py                 |   1 +
 cuda_core/docs/source/release/0.7.x-notes.rst |   6 +
 cuda_core/examples/cuda_graphs.py             |   6 +-
 cuda_core/examples/gl_interop_plasma.py       |   4 +-
 cuda_core/examples/pytorch_example.py         |  16 +-
 cuda_core/examples/saxpy.py                   |  30 +-
 .../examples/simple_multi_gpu_example.py      |  14 +-
 cuda_core/examples/strided_memory_view_gpu.py |  16 +-
 cuda_core/examples/thread_block_cluster.py    |   4 +-
 cuda_core/examples/vector_add.py              |  14 +-
 cuda_core/pixi.lock                           |  59 ++-
 cuda_core/pixi.toml                           |   1 +
 cuda_core/pyproject.toml                      |   2 +
 cuda_core/tests/memory_ipc/test_serialize.py  |  11 +-
 cuda_core/tests/test_memory_peer_access.py    |  29 ++
 .../tests/test_optional_dependency_imports.py | 123 ++++++
 cuda_core/tests/test_utils.py                 |  49 ++-
 cuda_pathfinder/AGENTS.md                     |  72 ++++
 cuda_pathfinder/CLAUDE.md                     |   1 +
 cuda_pathfinder/cuda/pathfinder/__init__.py   |   1 +
 .../_dynamic_libs/descriptor_catalog.py       |  23 ++
 .../_dynamic_libs/search_platform.py          |  12 +-
 .../_headers/find_nvidia_headers.py           |  36 ++
 .../cuda/pathfinder/_optional_cuda_import.py  |  43 +++
 cuda_pathfinder/docs/nv-versions.json         |   4 +
 cuda_pathfinder/docs/source/api.rst           |   1 +
 cuda_pathfinder/docs/source/conf.py           |   1 +
 .../docs/source/release/1.4.1-notes.rst       |  49 +++
 .../docs/source/release/1.4.2-notes.rst       |  15 +
 cuda_pathfinder/pyproject.toml                |   4 +-
 .../tests/test_find_nvidia_headers.py         | 126 ++++++-
 ...st_load_nvidia_dynamic_lib_using_mocker.py | 173 +++++++++
 .../tests/test_optional_cuda_import.py        |  67 ++++
 cuda_python/AGENTS.md                         |  24 ++
 cuda_python/CLAUDE.md                         |   1 +
 cuda_python/docs/exts/release_date.py         |  94 +++++
 cuda_python/docs/source/conf.py               |   1 +
 ruff.toml                                     |   4 +-
 74 files changed, 2871 insertions(+), 1572 deletions(-)
 create mode 120000 CLAUDE.md
 create mode 100644 cuda_bindings/AGENTS.md
 create mode 120000 cuda_bindings/CLAUDE.md
 create mode 100644 cuda_core/AGENTS.md
 create mode 120000 cuda_core/CLAUDE.md
 create mode 100644 cuda_core/tests/test_optional_dependency_imports.py
 create mode 100644 cuda_pathfinder/AGENTS.md
 create mode 120000 cuda_pathfinder/CLAUDE.md
 create mode 100644 cuda_pathfinder/cuda/pathfinder/_optional_cuda_import.py
 create mode 100644 cuda_pathfinder/docs/source/release/1.4.1-notes.rst
 create mode 100644 cuda_pathfinder/docs/source/release/1.4.2-notes.rst
 create mode 100644 cuda_pathfinder/tests/test_load_nvidia_dynamic_lib_using_mocker.py
 create mode 100644 cuda_pathfinder/tests/test_optional_cuda_import.py
 create mode 100644 cuda_python/AGENTS.md
 create mode 120000 cuda_python/CLAUDE.md
 create mode 100644 cuda_python/docs/exts/release_date.py

diff --git a/.github/actions/fetch_ctk/action.yml b/.github/actions/fetch_ctk/action.yml
index 001e3a84d8..e938fcc5b3 100644
--- a/.github/actions/fetch_ctk/action.yml
+++ b/.github/actions/fetch_ctk/action.yml
@@ -14,7 +14,7 @@ inputs:
   cuda-components:
     description: "A list of the CTK components to install as a comma-separated list. e.g. 'cuda_nvcc,cuda_nvrtc,cuda_cudart'"
     required: false
-    default: "cuda_nvcc,cuda_cudart,cuda_crt,libnvvm,cuda_nvrtc,cuda_profiler_api,cuda_cccl,libnvjitlink,libcufile,libnvfatbin"
+    default: "cuda_nvcc,cuda_cudart,cuda_crt,libnvvm,cuda_nvrtc,cuda_profiler_api,cuda_cccl,cuda_cupti,libnvjitlink,libcufile,libnvfatbin"
   cuda-path:
     description: "where the CTK components will be installed to, relative to $PWD"
     required: false
diff --git a/.github/workflows/bandit.yml b/.github/workflows/bandit.yml
index b7ed18b696..7ecbcdd1a1 100644
--- a/.github/workflows/bandit.yml
+++ b/.github/workflows/bandit.yml
@@ -42,6 +42,6 @@ jobs:
         with:
           args: "check --select S --ignore ${{ steps.ignore-codes.outputs.codes }} --output-format sarif --output-file results.sarif"
       - name: Upload SARIF file
-        uses: github/codeql-action/upload-sarif@v4.32.4
+        uses: github/codeql-action/upload-sarif@v4.32.5
         with:
           sarif_file: results.sarif
diff --git a/.github/workflows/build-wheel.yml b/.github/workflows/build-wheel.yml
index dd2ede5c67..2a227d4ee9 100644
--- a/.github/workflows/build-wheel.yml
+++ b/.github/workflows/build-wheel.yml
@@ -369,7 +369,7 @@ jobs:
 
           OLD_BRANCH=$(yq '.backport_branch' ci/versions.yml)
           OLD_BASENAME="cuda-bindings-python${PYTHON_VERSION_FORMATTED}-cuda*-${{ inputs.host-platform }}*"
-          LATEST_PRIOR_RUN_ID=$(gh run list -b ${OLD_BRANCH} -L 1 -w "ci.yml" -s completed -R NVIDIA/cuda-python --json databaseId | jq '.[]| .databaseId')
+          LATEST_PRIOR_RUN_ID=$(gh run list -b ${OLD_BRANCH} -L 1 -w "ci.yml" -s success -R NVIDIA/cuda-python --json databaseId | jq '.[]| .databaseId')
           if [[ "$LATEST_PRIOR_RUN_ID" == "" ]]; then
             echo "LATEST_PRIOR_RUN_ID not found!"
             exit 1
diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml
index 8f02dcbd6a..eea2466f7d 100644
--- a/.github/workflows/codeql.yml
+++ b/.github/workflows/codeql.yml
@@ -31,13 +31,13 @@ jobs:
       uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
 
     - name: Initialize CodeQL
-      uses: github/codeql-action/init@0ec47d036c68ae0cf94c629009b1029407111281  # v3.31.8
+      uses: github/codeql-action/init@40f0fa95c41fede7b43f035cb47aac899ee0ba0a  # v3.31.8
       with:
         languages: ${{ matrix.language }}
         build-mode: ${{ matrix.build-mode }}
         queries: security-extended
 
     - name: Perform CodeQL Analysis
-      uses: github/codeql-action/analyze@0ec47d036c68ae0cf94c629009b1029407111281  # v3.31.8
+      uses: github/codeql-action/analyze@40f0fa95c41fede7b43f035cb47aac899ee0ba0a  # v3.31.8
       with:
         category: "/language:${{matrix.language}}"
diff --git a/.github/workflows/coverage.yml b/.github/workflows/coverage.yml
index 765b623b3a..e65439a77e 100644
--- a/.github/workflows/coverage.yml
+++ b/.github/workflows/coverage.yml
@@ -98,7 +98,8 @@ jobs:
 
       - name: Build cuda-pathfinder
         run: |
-          .venv/bin/pip install -v ./cuda_pathfinder --group test
+          cd cuda_pathfinder
+          ../.venv/bin/pip install -v . --group test
 
       - name: Build cuda-bindings
         run: |
diff --git a/.github/workflows/test-wheel-linux.yml b/.github/workflows/test-wheel-linux.yml
index 3c80128bb1..c5061a16eb 100644
--- a/.github/workflows/test-wheel-linux.yml
+++ b/.github/workflows/test-wheel-linux.yml
@@ -151,7 +151,7 @@ jobs:
 
           OLD_BRANCH=${{ needs.compute-matrix.outputs.OLD_BRANCH }}
           OLD_BASENAME="cuda-bindings-python${PYTHON_VERSION_FORMATTED}-cuda*-${{ inputs.host-platform }}*"
-          LATEST_PRIOR_RUN_ID=$(gh run list -b ${OLD_BRANCH} -L 1 -w "ci.yml" -s completed -R NVIDIA/cuda-python --json databaseId | jq '.[]| .databaseId')
+          LATEST_PRIOR_RUN_ID=$(gh run list -b ${OLD_BRANCH} -L 1 -w "ci.yml" -s success -R NVIDIA/cuda-python --json databaseId | jq '.[]| .databaseId')
           if [[ "$LATEST_PRIOR_RUN_ID" == "" ]]; then
             echo "LATEST_PRIOR_RUN_ID not found!"
             exit 1
diff --git a/.github/workflows/test-wheel-windows.yml b/.github/workflows/test-wheel-windows.yml
index dc9a31719f..478826c525 100644
--- a/.github/workflows/test-wheel-windows.yml
+++ b/.github/workflows/test-wheel-windows.yml
@@ -137,7 +137,7 @@ jobs:
         run: |
           $OLD_BRANCH = yq '.backport_branch' ci/versions.yml
           $OLD_BASENAME = "cuda-bindings-python${env:PYTHON_VERSION_FORMATTED}-cuda*-${{ inputs.host-platform }}*"
-          $runData = gh run list -b $OLD_BRANCH -L 1 -w "ci.yml" -s completed -R NVIDIA/cuda-python --json databaseId | ConvertFrom-Json
+          $runData = gh run list -b $OLD_BRANCH -L 1 -w "ci.yml" -s success -R NVIDIA/cuda-python --json databaseId | ConvertFrom-Json
           if (-not $runData -or $runData.Length -eq 0 -or -not $runData[0].databaseId -or [string]::IsNullOrEmpty($runData[0].databaseId)) {
               Write-Host "LATEST_PRIOR_RUN_ID not found!"
               exit 1
diff --git a/AGENTS.md b/AGENTS.md
index 06fd7da3ed..525d300801 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -1,77 +1,263 @@
-# cuda_pathfinder agent instructions
-
-You are working on `cuda_pathfinder`, a Python sub-package of the
-[cuda-python](https://github.com/NVIDIA/cuda-python) monorepo. It finds and
-loads NVIDIA dynamic libraries (CTK, third-party, and driver) across Linux and
-Windows.
-
-## Workspace
-
-The workspace root is `cuda_pathfinder/` inside the monorepo. Use the
-`working_directory` parameter on the Shell tool when you need the monorepo root
-(one level up).
-
-## Conventions
-
-- **Python**: all source is pure Python (no Cython in this sub-package).
-- **Testing**: `pytest` with `pytest-mock` (`mocker` fixture). Use
-  `spawned_process_runner` for real-loading tests that need process isolation
-  (dynamic linker state leaks across tests otherwise). Use the
-  `info_summary_append` fixture to emit `INFO` lines visible in CI/QA logs.
-- **STRICTNESS env var**: `CUDA_PATHFINDER_TEST_LOAD_NVIDIA_DYNAMIC_LIB_STRICTNESS`
-  controls whether missing libs are tolerated (`see_what_works`, default) or
-  fatal (`all_must_work`).
-- **Formatting/linting**: rely on pre-commit (runs automatically on commit). Do
-  not run formatters manually.
-- **Imports**: use `from cuda.pathfinder._dynamic_libs...` for internal imports
-  in tests; public API is `from cuda.pathfinder import load_nvidia_dynamic_lib`.
-
-## Testing guidelines
-
-- **Real tests over mocks**: mocks are fine for hard-to-reach branches (e.g.
-  24-bit Python), but every loading path must also have a real-loading test that
-  runs in a spawned child process. Track results with `INFO` lines so CI logs
-  show what actually loaded.
-- **No real lib names in negative tests**: when parametrizing unsupported /
-  invalid libnames, use obviously fake names (`"bogus"`, `"not_a_real_lib"`)
-  to avoid confusion when searching the codebase.
-- **`functools.cache` awareness**: `load_nvidia_dynamic_lib` is cached. Tests
-  that patch internals it depends on must call
-  `load_nvidia_dynamic_lib.cache_clear()` first, or use a child process for
-  isolation.
-
-## Key modules
-
-- `cuda/pathfinder/_dynamic_libs/load_nvidia_dynamic_lib.py` -- main entry
-  point and dispatch logic (CTK vs driver).
-- `cuda/pathfinder/_dynamic_libs/supported_nvidia_libs.py` -- canonical
-  registry of sonames, DLLs, site-packages paths, and dependencies.
-- `cuda/pathfinder/_dynamic_libs/find_nvidia_dynamic_lib.py` -- CTK search
-  cascade (site-packages, conda, CUDA_HOME).
-- `tests/child_load_nvidia_dynamic_lib_helper.py` -- lightweight helper
-  imported by spawned child processes (avoids re-importing the full test
-  module).
-
-### Fix all code review findings from lib-descriptor-refactor review
-
-**Request:** Fix all 8 findings from the external code review.
-
-**Actions (in worktree `cuda_pathfinder_refactor`):**
-1. `search_steps.py`: Restored `os.path.normpath(dirname)` in
-   `_find_lib_dir_using_anchor` (regression from pre-refactor fix). Added
-   `NoReturn` annotation to `raise_not_found`.
-2. `search_platform.py`: Guarded `os.listdir(lib_dir)` in
-   `WindowsSearchPlatform.find_in_lib_dir` with `os.path.isdir` check to
-   prevent crash on missing directory.
-3. `test_descriptor_catalog.py`: Rewrote tautological tests as structural
-   invariant tests (uniqueness, valid names, strategy values, dep graph,
-   soname/dll format, driver lib constraints). 237 new parametrized cases.
-4. `platform_loader.py`: Eliminated `WindowsLoader`/`LinuxLoader` boilerplate
-   classes — assign the platform module directly as `LOADER`. Removed stale
-   `type: ignore`.
-5. `descriptor_catalog.py`: Trimmed default-valued fields from all entries,
-   added `# ---` section comments (CTK / third-party / driver).
-6. `load_nvidia_dynamic_lib.py`: Fixed import layout — `TYPE_CHECKING` block
-   now properly separated after unconditional imports.
-
-All 742 tests pass, all pre-commit hooks green.
+# cuda-python monorepo agent instructions
+
+This file contains repository-wide guidance.
+
+When a subdirectory has its own `AGENTS.md`, treat that file as the primary
+guide for package-specific conventions and workflows.
+
+## Package map
+
+- `cuda_pathfinder/`: Pure-Python library discovery and loading utilities.
+- `cuda_bindings/`: Low-level CUDA host API bindings (Cython-heavy).
+- `cuda_core/`: High-level Pythonic CUDA APIs built on top of bindings.
+- `cuda_python/`: Metapackage and docs aggregation.
+
+# General
+
+- When searching for text or files, prefer using `rg` or `rg --files`
+  respectively because `rg` is much faster than alternatives like `grep`. (If
+  the `rg` command is not found, then use alternatives.)
+- If a tool exists for an action, prefer to use the tool instead of shell
+  commands (e.g. `read_file` over `cat`). Strictly avoid raw `cmd`/terminal when
+  a dedicated tool exists. Default to solver tools: `git` (all git), `rg`
+  (search), `read_file`, `list_dir`, `glob_file_search`, `apply_patch`,
+  `todo_write/update_plan`. Use `cmd`/`run_terminal_cmd` only when no listed
+  tool can perform the action.
+- When multiple tool calls can be parallelized (e.g., todo updates with other
+  actions, file searches, reading files), make these tool calls in parallel
+  instead of sequential. Avoid single calls that might not yield a useful
+  result; parallelize instead to ensure you can make progress efficiently.
+- Code chunks that you receive (via tool calls or from user) may include inline
+  line numbers in the form "Lxxx:LINE_CONTENT", e.g. "L123:LINE_CONTENT". Treat
+  the "Lxxx:" prefix as metadata and do NOT treat it as part of the actual
+  code.
+- Default expectation: deliver working code, not just a plan. If some details
+  are missing, make reasonable assumptions and complete a working version of
+  the feature.
+
+
+# Autonomy and Persistence
+
+- You are an autonomous senior engineer: once the user gives a direction,
+  proactively gather context, plan, implement, test, and refine without waiting
+  for additional prompts at each step.
+- Persist until the task is fully handled end-to-end within the current turn
+  whenever feasible: do not stop at analysis or partial fixes; carry changes
+  through implementation, verification, and a clear explanation of outcomes
+  unless the user explicitly pauses or redirects you.
+- Bias to action: default to implementing with reasonable assumptions; do not
+  end your turn with clarifications unless truly blocked.
+- Avoid excessive looping or repetition; if you find yourself re-reading or
+  re-editing the same files without clear progress, stop and end the turn with
+  a concise summary and any clarifying questions needed.
+
+
+# Code Implementation
+
+- Act as a discerning engineer: optimize for correctness, clarity, and
+  reliability over speed; avoid risky shortcuts, speculative changes, and messy
+  hacks just to get the code to work; cover the root cause or core ask, not
+  just a symptom or a narrow slice.
+- Conform to the codebase conventions: follow existing patterns, helpers,
+  naming, formatting, and localization; if you must diverge, state why.
+- Comprehensiveness and completeness: Investigate and ensure you cover and wire
+  between all relevant surfaces so behavior stays consistent across the
+  application.
+- Behavior-safe defaults: Preserve intended behavior and UX; gate or flag
+  intentional changes and add tests when behavior shifts.
+- Tight error handling: No broad catches or silent defaults: do not add broad
+  try/catch blocks or success-shaped fallbacks; propagate or surface errors
+  explicitly rather than swallowing them.
+  - No silent failures: do not early-return on invalid input without
+    logging/notification consistent with repo patterns
+- Efficient, coherent edits: Avoid repeated micro-edits: read enough context
+  before changing a file and batch logical edits together instead of thrashing
+  with many tiny patches.
+- Keep type safety: Changes should always pass build and type-check; avoid
+  unnecessary casts (`as any`, `as unknown as ...`); prefer proper types and
+  guards, and reuse existing helpers (e.g., normalizing identifiers) instead of
+  type-asserting.
+- Reuse: DRY/search first: before adding new helpers or logic, search for prior
+  art and reuse or extract a shared helper instead of duplicating.
+- Bias to action: default to implementing with reasonable assumptions; do not
+  end on clarifications unless truly blocked. Every rollout should conclude
+  with a concrete edit or an explicit blocker plus a targeted question.
+
+
+# Editing constraints
+
+- Default to ASCII when editing or creating files. Only introduce non-ASCII or
+  other Unicode characters when there is a clear justification and the file
+  already uses them.
+- Add succinct code comments that explain what is going on if code is not
+  self-explanatory. You should not add comments like "Assigns the value to the
+  variable", but a brief comment might be useful ahead of a complex code block
+  that the user would otherwise have to spend time parsing out. Usage of these
+  comments should be rare.
+- Try to use apply_patch for single file edits, but it is fine to explore other
+  options to make the edit if it does not work well. Do not use apply_patch for
+  changes that are auto-generated (i.e. generating package.json or running
+  a lint or format command like gofmt) or when scripting is more efficient
+  (such as search and replacing a string across a codebase).
+- You may be in a dirty git worktree.
+    * NEVER revert existing changes you did not make unless explicitly
+      requested, since these changes were made by the user.
+    * If asked to make a commit or code edits and there are unrelated changes
+      to your work or changes that you didn't make in those files, don't revert
+      those changes.
+    * If the changes are in files you've touched recently, you should read
+      carefully and understand how you can work with the changes rather than
+      reverting them.
+    * If the changes are in unrelated files, just ignore them and don't revert
+      them.
+- Do not amend a commit unless explicitly requested to do so.
+- While you are working, you might notice unexpected changes that you didn't
+  make. If this happens, STOP IMMEDIATELY and ask the user how they would like
+  to proceed.
+- **NEVER** use destructive commands like `git reset --hard` or `git checkout
+  --` unless specifically requested or approved by the user.
+
+
+# Exploration and reading files
+
+- **Think first.** Before any tool call, decide ALL files/resources you will
+  need.
+- **Batch everything.** If you need multiple files (even from different
+  places), read them together.
+- **multi_tool_use.parallel** Use `multi_tool_use.parallel` to parallelize tool
+  calls and only this.
+- **Only make sequential calls if you truly cannot know the next file without
+  seeing a result first.**
+- **Workflow:** (a) plan all needed reads → (b) issue one parallel batch → (c)
+  analyze results → (d) repeat if new, unpredictable reads arise.
+- Additional notes:
+    - Always maximize parallelism. Never read files one-by-one unless logically unavoidable.
+    - This concerns all read/list/search operations, including but not limited to
+      `cat`, `rg`, `sed`, `ls`, `git show`, `nl`, `wc`, ...
+    - Do not try to parallelize using scripting or anything other than
+      `multi_tool_use.parallel`.
+
+
+# Plan tool
+
+When using the planning tool:
+- Skip using the planning tool for straightforward tasks (roughly the easiest
+  25%).
+- Do not make single-step plans.
+- When you have made a plan, update it after having performed one of the sub-tasks
+  that you shared on the plan.
+- Unless asked for a plan, never end the interaction with only a plan. Plans
+  guide your edits; the deliverable is working code.
+- Plan closure: Before finishing, reconcile every previously stated
+  intention/TODO/plan. Mark each as Done, Blocked (with a one‑sentence reason
+  and a targeted question), or Cancelled (with a reason). Do not end with
+  in_progress/pending items. If you created todos via a tool, update their
+  statuses accordingly.
+- Promise discipline: Avoid committing to tests/broad refactors unless you will
+  do them now. Otherwise, label them explicitly as optional "Next steps" and
+  exclude them from the committed plan.
+- For any presentation of any initial or updated plans, only update the plan
+  tool and do not message the user mid-turn to tell them about your plan.
+
+
+# Special user requests
+
+- If the user makes a simple request (such as asking for the time) which you
+  can fulfill by running a terminal command (such as `date`), you should do so.
+- If the user asks for a "review", default to a code review mindset: prioritise
+  identifying bugs, risks, behavioural regressions, and missing tests. Findings
+  must be the primary focus of the response - keep summaries or overviews brief
+  and only after enumerating the issues. Present findings first (ordered by
+  severity with file/line references), follow with open questions or
+  assumptions, and offer a change-summary only as a secondary detail. If no
+  findings are discovered, state that explicitly and mention any residual risks
+  or testing gaps.
+
+
+# Frontend tasks
+
+When doing frontend design tasks, avoid collapsing into "AI slop" or safe,
+average-looking layouts. Aim for interfaces that feel intentional, bold, and
+a bit surprising.
+- Typography: Use expressive, purposeful fonts and avoid default stacks (Inter,
+  Roboto, Arial, system).
+- Color & Look: Choose a clear visual direction; define CSS variables; avoid
+  purple-on-white defaults. No purple bias or dark mode bias.
+- Motion: Use a few meaningful animations (page-load, staggered reveals)
+  instead of generic micro-motions.
+- Background: Don't rely on flat, single-color backgrounds; use gradients,
+  shapes, or subtle patterns to build atmosphere.
+- Overall: Avoid boilerplate layouts and interchangeable UI patterns. Vary
+  themes, type families, and visual languages across outputs.
+- Ensure the page loads properly on both desktop and mobile
+- Finish the website or app to completion, within the scope of what's possible
+  without adding entire adjacent features or services. It should be in
+  a working state for a user to run and test.
+
+Exception: If working within an existing website or design system, preserve the
+established patterns, structure, and visual language.
+
+
+# Presenting your work and final message
+
+You are producing plain text that will later be styled by the CLI. Follow these
+rules exactly. Formatting should make results easy to scan, but not feel
+mechanical. Use judgment to decide how much structure adds value.
+
+- Default: be very concise; friendly coding teammate tone.
+- Format: Use natural language with high-level headings.
+- Ask only when needed; suggest ideas; mirror the user's style.
+- For substantial work, summarize clearly; follow final‑answer formatting.
+- Skip heavy formatting for simple confirmations.
+- Don't dump large files you've written; reference paths only.
+- No "save/copy this file" - User is on the same machine.
+- Offer logical next steps (tests, commits, build) briefly; add verify steps if
+  you couldn't do something.
+- For code changes:
+  * Lead with a quick explanation of the change, and then give more details on
+    the context covering where and why a change was made. Do not start this
+    explanation with "summary", just jump right in.
+  * If there are natural next steps the user may want to take, suggest them at
+    the end of your response. Do not make suggestions if there are no natural
+    next steps.
+  * When suggesting multiple options, use numeric lists for the suggestions so
+    the user can quickly respond with a single number.
+- The user does not see command execution outputs. When asked to show the output of
+  a command (e.g. `git show`), relay the important details in your answer or
+  summarize the key lines so the user understands the result.
+
+## Final answer structure and style guidelines
+
+- Plain text; CLI handles styling. Use structure only when it helps
+  scanability.
+- Headers: optional; short Title Case (1-3 words) wrapped in **…**; no blank
+  line before the first bullet; add only if they truly help.
+- Bullets: use - ; merge related points; keep to one line when possible; 4–6
+  per list ordered by importance; keep phrasing consistent.
+- Monospace: backticks for commands/paths/env vars/code ids and inline
+  examples; use for literal keyword bullets; never combine with double asterisk.
+- Code samples or multi-line snippets should be wrapped in fenced code blocks;
+  include an info string as often as possible.
+- Structure: group related bullets; order sections general → specific
+  → supporting; for subsections, start with a bolded keyword bullet, then
+  items; match complexity to the task.
+- Tone: collaborative, concise, factual; present tense, active voice;
+  self‑contained; no "above/below"; parallel wording.
+- Don'ts: no nested bullets/hierarchies; no ANSI codes; don't cram unrelated
+  keywords; keep keyword lists short—wrap/reformat if long; avoid naming
+  formatting styles in answers.
+- Adaptation: code explanations → precise, structured with code refs; simple
+  tasks → lead with outcome; big changes → logical walkthrough + rationale
+  + next actions; casual one-offs → plain sentences, no headers/bullets.
+- File References: When referencing files in your response follow the below
+  rules:
+  * Use inline code to make file paths clickable.
+  * Each reference should have a stand alone path. Even if it's the same file.
+  * Accepted: absolute, workspace‑relative, a/ or b/ diff prefixes, or bare
+    filename/suffix.
+  * Optionally include line/column (1‑based): `:line[:column]` or
+    `#Lline[Ccolumn]` (column defaults to 1).
+  * Do not use URIs like `file://`, `vscode://`, or `https://`.
+  * Do not provide a range of lines.
+  * Examples: `src/app.ts`, src/app.ts:42, b/server/index.js#L10,
+    C:\repo\project\main.rs:12:5
diff --git a/CLAUDE.md b/CLAUDE.md
new file mode 120000
index 0000000000..47dc3e3d86
--- /dev/null
+++ b/CLAUDE.md
@@ -0,0 +1 @@
+AGENTS.md
\ No newline at end of file
diff --git a/cuda_bindings/AGENTS.md b/cuda_bindings/AGENTS.md
new file mode 100644
index 0000000000..9688c9f94c
--- /dev/null
+++ b/cuda_bindings/AGENTS.md
@@ -0,0 +1,67 @@
+This file describes `cuda_bindings`, the low-level CUDA host API bindings
+subpackage in the `cuda-python` monorepo.
+
+## Scope and principles
+
+- **Role**: provide low-level, close-to-CUDA interfaces under
+  `cuda.bindings.*` with broad API coverage.
+- **Style**: prioritize correctness and API compatibility over convenience
+  wrappers. High-level ergonomics belong in `cuda_core`, not here.
+- **Cross-platform**: preserve Linux and Windows behavior unless a change is
+  intentionally platform-specific.
+
+## Package architecture
+
+- **Public module layer**: Cython modules under `cuda/bindings/` expose user
+  APIs (`driver`, `runtime`, `nvrtc`, `nvjitlink`, `nvvm`, `cufile`, etc.).
+- **Internal binding layer**: `cuda/bindings/_bindings/` provides lower-level
+  glue and loader helpers used by public modules.
+- **Platform internals**: `cuda/bindings/_internal/` contains
+  platform-specific implementation files and support code.
+- **Build/codegen backend**: `build_hooks.py` drives header parsing, template
+  expansion, extension configuration, and Cythonization.
+
+## Generated-source workflow
+
+- **Do not hand-edit generated binding files**: many files under
+  `cuda/bindings/` (including `*.pyx`, `*.pxd`, `*.pyx.in`, and `*.pxd.in`)
+  are generated artifacts.
+- **Generated files are synchronized from another repository**: changes to these
+  files in this repo are expected to be overwritten by the next sync.
+- **If generated output must change**: make the change at the generation source
+  and sync the updated artifacts back here, rather than patching generated files
+  directly in this repo.
+- **Header-driven generation**: parser behavior and required CUDA headers are
+  defined in `build_hooks.py`; update those rules when introducing new symbols.
+- **Platform split files**: keep `_linux.pyx` and `_windows.pyx` variants
+  aligned when behavior should be equivalent.
+
+## Testing expectations
+
+- **Primary tests**: `pytest tests/`
+- **Cython tests**:
+  - build: `tests/cython/build_tests.sh` (or platform equivalent)
+  - run: `pytest tests/cython/`
+- **Examples**: example coverage is pytest-based under `examples/`.
+- **Benchmarks**: run with `pytest --benchmark-only benchmarks/` when needed.
+- **Orchestrated run**: from repo root, `scripts/run_tests.sh bindings`.
+
+## Build and environment notes
+
+- `CUDA_HOME` or `CUDA_PATH` must point to a valid CUDA Toolkit for source
+  builds that parse headers.
+- `CUDA_PYTHON_PARALLEL_LEVEL` controls build parallelism.
+- `CUDA_PYTHON_PARSER_CACHING` controls parser-cache behavior during generation.
+- Runtime behavior is affected by
+  `CUDA_PYTHON_CUDA_PER_THREAD_DEFAULT_STREAM` and
+  `CUDA_PYTHON_DISABLE_MAJOR_VERSION_WARNING`.
+
+## Editing guidance
+
+- Keep CUDA return/error semantics explicit and avoid broad fallback behavior.
+- Reuse existing helper layers (`_bindings`, `_internal`, `_lib`) before adding
+  new one-off utilities.
+- If you add or change exported APIs, update relevant docs under
+  `docs/source/module/` and tests in `tests/`.
+- Prefer changes that are easy to regenerate/rebuild rather than patching
+  generated output directly.
diff --git a/cuda_bindings/CLAUDE.md b/cuda_bindings/CLAUDE.md
new file mode 120000
index 0000000000..47dc3e3d86
--- /dev/null
+++ b/cuda_bindings/CLAUDE.md
@@ -0,0 +1 @@
+AGENTS.md
\ No newline at end of file
diff --git a/cuda_bindings/docs/source/conf.py b/cuda_bindings/docs/source/conf.py
index b55396376f..062c49db9a 100644
--- a/cuda_bindings/docs/source/conf.py
+++ b/cuda_bindings/docs/source/conf.py
@@ -40,6 +40,7 @@
     "enum_tools.autoenum",
     "sphinx_copybutton",
     "release_toc",
+    "release_date",
 ]
 
 nb_execution_mode = "off"
diff --git a/cuda_bindings/examples/0_Introduction/clock_nvrtc_test.py b/cuda_bindings/examples/0_Introduction/clock_nvrtc_test.py
index dc1084bea8..d67f180fe0 100644
--- a/cuda_bindings/examples/0_Introduction/clock_nvrtc_test.py
+++ b/cuda_bindings/examples/0_Introduction/clock_nvrtc_test.py
@@ -5,7 +5,7 @@
 
 import numpy as np
 from common import common
-from common.helper_cuda import checkCudaErrors, findCudaDevice
+from common.helper_cuda import check_cuda_errors, find_cuda_device
 
 from cuda.bindings import driver as cuda
 
@@ -50,8 +50,8 @@
 }
 """
 
-NUM_BLOCKS = 64
-NUM_THREADS = 256
+num_blocks = 64
+num_threads = 256
 
 
 def elems_to_bytes(nelems, dt):
@@ -64,52 +64,52 @@ def main():
     if platform.machine() == "armv7l":
         pytest.skip("clock_nvrtc is not supported on ARMv7")
 
-    timer = np.empty(NUM_BLOCKS * 2, dtype="int64")
-    hinput = np.empty(NUM_THREADS * 2, dtype="float32")
+    timer = np.empty(num_blocks * 2, dtype="int64")
+    hinput = np.empty(num_threads * 2, dtype="float32")
 
-    for i in range(NUM_THREADS * 2):
+    for i in range(num_threads * 2):
         hinput[i] = i
 
-    devID = findCudaDevice()
-    with common.KernelHelper(clock_nvrtc, devID) as kernelHelper:
-        kernel_addr = kernelHelper.getFunction(b"timedReduction")
-
-        dinput = checkCudaErrors(cuda.cuMemAlloc(hinput.nbytes))
-        doutput = checkCudaErrors(cuda.cuMemAlloc(elems_to_bytes(NUM_BLOCKS, np.float32)))
-        dtimer = checkCudaErrors(cuda.cuMemAlloc(timer.nbytes))
-        checkCudaErrors(cuda.cuMemcpyHtoD(dinput, hinput, hinput.nbytes))
-
-        args = ((dinput, doutput, dtimer), (None, None, None))
-        shared_memory_nbytes = elems_to_bytes(2 * NUM_THREADS, np.float32)
-
-        grid_dims = (NUM_BLOCKS, 1, 1)
-        block_dims = (NUM_THREADS, 1, 1)
-
-        checkCudaErrors(
-            cuda.cuLaunchKernel(
-                kernel_addr,
-                *grid_dims,  # grid dim
-                *block_dims,  # block dim
-                shared_memory_nbytes,
-                0,  # shared mem, stream
-                args,
-                0,
-            )
-        )  # arguments
-
-        checkCudaErrors(cuda.cuCtxSynchronize())
-        checkCudaErrors(cuda.cuMemcpyDtoH(timer, dtimer, timer.nbytes))
-        checkCudaErrors(cuda.cuMemFree(dinput))
-        checkCudaErrors(cuda.cuMemFree(doutput))
-        checkCudaErrors(cuda.cuMemFree(dtimer))
-
-    avgElapsedClocks = 0.0
-
-    for i in range(NUM_BLOCKS):
-        avgElapsedClocks += timer[i + NUM_BLOCKS] - timer[i]
-
-    avgElapsedClocks = avgElapsedClocks / NUM_BLOCKS
-    print(f"Average clocks/block = {avgElapsedClocks}")
+    dev_id = find_cuda_device()
+    kernel_helper = common.KernelHelper(clock_nvrtc, dev_id)
+    kernel_addr = kernel_helper.get_function(b"timedReduction")
+
+    dinput = check_cuda_errors(cuda.cuMemAlloc(hinput.nbytes))
+    doutput = check_cuda_errors(cuda.cuMemAlloc(elems_to_bytes(num_blocks, np.float32)))
+    dtimer = check_cuda_errors(cuda.cuMemAlloc(timer.nbytes))
+    check_cuda_errors(cuda.cuMemcpyHtoD(dinput, hinput, hinput.nbytes))
+
+    args = ((dinput, doutput, dtimer), (None, None, None))
+    shared_memory_nbytes = elems_to_bytes(2 * num_threads, np.float32)
+
+    grid_dims = (num_blocks, 1, 1)
+    block_dims = (num_threads, 1, 1)
+
+    check_cuda_errors(
+        cuda.cuLaunchKernel(
+            kernel_addr,
+            *grid_dims,  # grid dim
+            *block_dims,  # block dim
+            shared_memory_nbytes,
+            0,  # shared mem, stream
+            args,
+            0,
+        )
+    )  # arguments
+
+    check_cuda_errors(cuda.cuCtxSynchronize())
+    check_cuda_errors(cuda.cuMemcpyDtoH(timer, dtimer, timer.nbytes))
+    check_cuda_errors(cuda.cuMemFree(dinput))
+    check_cuda_errors(cuda.cuMemFree(doutput))
+    check_cuda_errors(cuda.cuMemFree(dtimer))
+
+    avg_elapsed_clocks = 0.0
+
+    for i in range(num_blocks):
+        avg_elapsed_clocks += timer[i + num_blocks] - timer[i]
+
+    avg_elapsed_clocks = avg_elapsed_clocks / num_blocks
+    print(f"Average clocks/block = {avg_elapsed_clocks}")
 
 
 if __name__ == "__main__":
diff --git a/cuda_bindings/examples/0_Introduction/simpleCubemapTexture_test.py b/cuda_bindings/examples/0_Introduction/simpleCubemapTexture_test.py
index 75f1b0800d..5d764509ce 100644
--- a/cuda_bindings/examples/0_Introduction/simpleCubemapTexture_test.py
+++ b/cuda_bindings/examples/0_Introduction/simpleCubemapTexture_test.py
@@ -7,12 +7,12 @@
 
 import numpy as np
 from common import common
-from common.helper_cuda import checkCudaErrors, findCudaDevice
+from common.helper_cuda import check_cuda_errors, find_cuda_device
 
 from cuda.bindings import driver as cuda
 from cuda.bindings import runtime as cudart
 
-simpleCubemapTexture = """\
+simple_cubemap_texture = """\
 extern "C"
 __global__ void transformKernel(float *g_odata, int width, cudaTextureObject_t tex)
 {
@@ -83,14 +83,14 @@
 
 def main():
     # Use command-line specified CUDA device, otherwise use device with highest Gflops/s
-    devID = findCudaDevice()
+    dev_id = find_cuda_device()
 
     # Get number of SMs on this GPU
-    deviceProps = checkCudaErrors(cudart.cudaGetDeviceProperties(devID))
+    device_props = check_cuda_errors(cudart.cudaGetDeviceProperties(dev_id))
     print(
-        f"CUDA device [{deviceProps.name}] has {deviceProps.multiProcessorCount} Multi-Processors SM {deviceProps.major}.{deviceProps.minor}"
+        f"CUDA device [{device_props.name}] has {device_props.multiProcessorCount} Multi-Processors SM {device_props.major}.{device_props.minor}"
     )
-    if deviceProps.major < 2:
+    if device_props.major < 2:
         import pytest
 
         pytest.skip("Test requires SM 2.0 or higher for support of Texture Arrays.")
@@ -107,15 +107,15 @@ def main():
     h_data_ref = np.repeat(np.arange(num_layers, dtype=h_data.dtype), cubemap_size) - h_data
 
     # Allocate device memory for result
-    d_data = checkCudaErrors(cudart.cudaMalloc(size))
+    d_data = check_cuda_errors(cudart.cudaMalloc(size))
 
     # Allocate array and copy image data
-    channelDesc = checkCudaErrors(
+    channel_desc = check_cuda_errors(
         cudart.cudaCreateChannelDesc(32, 0, 0, 0, cudart.cudaChannelFormatKind.cudaChannelFormatKindFloat)
     )
-    cu_3darray = checkCudaErrors(
+    cu_3darray = check_cuda_errors(
         cudart.cudaMalloc3DArray(
-            channelDesc,
+            channel_desc,
             cudart.make_cudaExtent(width, width, num_faces),
             cudart.cudaArrayCubemap,
         )
@@ -128,90 +128,90 @@ def main():
     myparms.dstArray = cu_3darray
     myparms.extent = cudart.make_cudaExtent(width, width, num_faces)
     myparms.kind = cudart.cudaMemcpyKind.cudaMemcpyHostToDevice
-    checkCudaErrors(cudart.cudaMemcpy3D(myparms))
-
-    texRes = cudart.cudaResourceDesc()
-    texRes.resType = cudart.cudaResourceType.cudaResourceTypeArray
-    texRes.res.array.array = cu_3darray
-
-    texDescr = cudart.cudaTextureDesc()
-    texDescr.normalizedCoords = True
-    texDescr.filterMode = cudart.cudaTextureFilterMode.cudaFilterModeLinear
-    texDescr.addressMode[0] = cudart.cudaTextureAddressMode.cudaAddressModeWrap
-    texDescr.addressMode[1] = cudart.cudaTextureAddressMode.cudaAddressModeWrap
-    texDescr.addressMode[2] = cudart.cudaTextureAddressMode.cudaAddressModeWrap
-    texDescr.readMode = cudart.cudaTextureReadMode.cudaReadModeElementType
-
-    tex = checkCudaErrors(cudart.cudaCreateTextureObject(texRes, texDescr, None))
-    dimBlock = cudart.dim3()
-    dimBlock.x = 8
-    dimBlock.y = 8
-    dimBlock.z = 1
-    dimGrid = cudart.dim3()
-    dimGrid.x = width / dimBlock.x
-    dimGrid.y = width / dimBlock.y
-    dimGrid.z = 1
+    check_cuda_errors(cudart.cudaMemcpy3D(myparms))
+
+    tex_res = cudart.cudaResourceDesc()
+    tex_res.resType = cudart.cudaResourceType.cudaResourceTypeArray
+    tex_res.res.array.array = cu_3darray
+
+    tex_descr = cudart.cudaTextureDesc()
+    tex_descr.normalizedCoords = True
+    tex_descr.filterMode = cudart.cudaTextureFilterMode.cudaFilterModeLinear
+    tex_descr.addressMode[0] = cudart.cudaTextureAddressMode.cudaAddressModeWrap
+    tex_descr.addressMode[1] = cudart.cudaTextureAddressMode.cudaAddressModeWrap
+    tex_descr.addressMode[2] = cudart.cudaTextureAddressMode.cudaAddressModeWrap
+    tex_descr.readMode = cudart.cudaTextureReadMode.cudaReadModeElementType
+
+    tex = check_cuda_errors(cudart.cudaCreateTextureObject(tex_res, tex_descr, None))
+    dim_block = cudart.dim3()
+    dim_block.x = 8
+    dim_block.y = 8
+    dim_block.z = 1
+    dim_grid = cudart.dim3()
+    dim_grid.x = width / dim_block.x
+    dim_grid.y = width / dim_block.y
+    dim_grid.z = 1
 
     print(
-        f"Covering Cubemap data array of {width}~3 x {num_layers}: Grid size is {dimGrid.x} x {dimGrid.y}, each block has 8 x 8 threads"
+        f"Covering Cubemap data array of {width}~3 x {num_layers}: Grid size is {dim_grid.x} x {dim_grid.y}, each block has 8 x 8 threads"
     )
 
-    with common.KernelHelper(simpleCubemapTexture, devID) as kernelHelper:
-        _transformKernel = kernelHelper.getFunction(b"transformKernel")
-        kernelArgs = ((d_data, width, tex), (ctypes.c_void_p, ctypes.c_int, None))
-        checkCudaErrors(
-            cuda.cuLaunchKernel(
-                _transformKernel,
-                dimGrid.x,
-                dimGrid.y,
-                dimGrid.z,  # grid dim
-                dimBlock.x,
-                dimBlock.y,
-                dimBlock.z,  # block dim
-                0,
-                0,  # shared mem and stream
-                kernelArgs,
-                0,
-            )
-        )  # arguments
-
-        checkCudaErrors(cudart.cudaDeviceSynchronize())
-
-        start = time.time()
-
-        # Execute the kernel
-        checkCudaErrors(
-            cuda.cuLaunchKernel(
-                _transformKernel,
-                dimGrid.x,
-                dimGrid.y,
-                dimGrid.z,  # grid dim
-                dimBlock.x,
-                dimBlock.y,
-                dimBlock.z,  # block dim
-                0,
-                0,  # shared mem and stream
-                kernelArgs,
-                0,
-            )
-        )  # arguments
-
-        checkCudaErrors(cudart.cudaDeviceSynchronize())
-        stop = time.time()
-        print(f"Processing time: {stop - start:.3f} msec")
-        print(f"{cubemap_size / ((stop - start + 1) / 1000.0) / 1e6:.2f} Mtexlookups/sec")
-
-        # Allocate mem for the result on host side
-        h_odata = np.empty_like(h_data)
-        # Copy result from device to host
-        checkCudaErrors(cudart.cudaMemcpy(h_odata, d_data, size, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost))
-
-    checkCudaErrors(cudart.cudaDestroyTextureObject(tex))
-    checkCudaErrors(cudart.cudaFree(d_data))
-    checkCudaErrors(cudart.cudaFreeArray(cu_3darray))
-
-    MIN_EPSILON_ERROR = 5.0e-3
-    if np.max(np.abs(h_odata - h_data_ref)) > MIN_EPSILON_ERROR:
+    kernel_helper = common.KernelHelper(simple_cubemap_texture, dev_id)
+    _transform_kernel = kernel_helper.get_function(b"transformKernel")
+    kernel_args = ((d_data, width, tex), (ctypes.c_void_p, ctypes.c_int, None))
+    check_cuda_errors(
+        cuda.cuLaunchKernel(
+            _transform_kernel,
+            dim_grid.x,
+            dim_grid.y,
+            dim_grid.z,  # grid dim
+            dim_block.x,
+            dim_block.y,
+            dim_block.z,  # block dim
+            0,
+            0,  # shared mem and stream
+            kernel_args,
+            0,
+        )
+    )  # arguments
+
+    check_cuda_errors(cudart.cudaDeviceSynchronize())
+
+    start = time.time()
+
+    # Execute the kernel
+    check_cuda_errors(
+        cuda.cuLaunchKernel(
+            _transform_kernel,
+            dim_grid.x,
+            dim_grid.y,
+            dim_grid.z,  # grid dim
+            dim_block.x,
+            dim_block.y,
+            dim_block.z,  # block dim
+            0,
+            0,  # shared mem and stream
+            kernel_args,
+            0,
+        )
+    )  # arguments
+
+    check_cuda_errors(cudart.cudaDeviceSynchronize())
+    stop = time.time()
+    print(f"Processing time: {stop - start:.3f} msec")
+    print(f"{cubemap_size / ((stop - start + 1) / 1000.0) / 1e6:.2f} Mtexlookups/sec")
+
+    # Allocate mem for the result on host side
+    h_odata = np.empty_like(h_data)
+    # Copy result from device to host
+    check_cuda_errors(cudart.cudaMemcpy(h_odata, d_data, size, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost))
+
+    check_cuda_errors(cudart.cudaDestroyTextureObject(tex))
+    check_cuda_errors(cudart.cudaFree(d_data))
+    check_cuda_errors(cudart.cudaFreeArray(cu_3darray))
+
+    min_epsilon_error = 5.0e-3
+    if np.max(np.abs(h_odata - h_data_ref)) > min_epsilon_error:
         print("Failed", file=sys.stderr)
         sys.exit(1)
 
diff --git a/cuda_bindings/examples/0_Introduction/simpleP2P_test.py b/cuda_bindings/examples/0_Introduction/simpleP2P_test.py
index a60dbac5bc..09dafa1be1 100644
--- a/cuda_bindings/examples/0_Introduction/simpleP2P_test.py
+++ b/cuda_bindings/examples/0_Introduction/simpleP2P_test.py
@@ -7,7 +7,7 @@
 
 import numpy as np
 from common import common
-from common.helper_cuda import checkCudaErrors
+from common.helper_cuda import check_cuda_errors
 
 from cuda.bindings import driver as cuda
 from cuda.bindings import runtime as cudart
@@ -41,24 +41,24 @@ def main():
 
     # Number of GPUs
     print("Checking for multiple GPUs...")
-    gpu_n = checkCudaErrors(cudart.cudaGetDeviceCount())
+    gpu_n = check_cuda_errors(cudart.cudaGetDeviceCount())
     print(f"CUDA-capable device count: {gpu_n}")
 
     if gpu_n < 2:
         pytest.skip("Two or more GPUs with Peer-to-Peer access capability are required")
 
-    prop = [checkCudaErrors(cudart.cudaGetDeviceProperties(i)) for i in range(gpu_n)]
+    prop = [check_cuda_errors(cudart.cudaGetDeviceProperties(i)) for i in range(gpu_n)]
     # Check possibility for peer access
     print("\nChecking GPU(s) for support of peer to peer memory access...")
 
-    p2pCapableGPUs = [-1, -1]
+    p2p_capable_gp_us = [-1, -1]
     for i in range(gpu_n):
-        p2pCapableGPUs[0] = i
+        p2p_capable_gp_us[0] = i
         for j in range(gpu_n):
             if i == j:
                 continue
-            i_access_j = checkCudaErrors(cudart.cudaDeviceCanAccessPeer(i, j))
-            j_access_i = checkCudaErrors(cudart.cudaDeviceCanAccessPeer(j, i))
+            i_access_j = check_cuda_errors(cudart.cudaDeviceCanAccessPeer(i, j))
+            j_access_i = check_cuda_errors(cudart.cudaDeviceCanAccessPeer(j, i))
             print(
                 "> Peer access from {} (GPU{}) -> {} (GPU{}) : {}\n".format(
                     prop[i].name, i, prop[j].name, j, "Yes" if i_access_j else "No"
@@ -70,54 +70,54 @@ def main():
                 )
             )
             if i_access_j and j_access_i:
-                p2pCapableGPUs[1] = j
+                p2p_capable_gp_us[1] = j
                 break
-        if p2pCapableGPUs[1] != -1:
+        if p2p_capable_gp_us[1] != -1:
             break
 
-    if p2pCapableGPUs[0] == -1 or p2pCapableGPUs[1] == -1:
+    if p2p_capable_gp_us[0] == -1 or p2p_capable_gp_us[1] == -1:
         pytest.skip("Peer to Peer access is not available amongst GPUs in the system")
 
     # Use first pair of p2p capable GPUs detected
-    gpuid = [p2pCapableGPUs[0], p2pCapableGPUs[1]]
+    gpuid = [p2p_capable_gp_us[0], p2p_capable_gp_us[1]]
 
     # Enable peer access
     print(f"Enabling peer access between GPU{gpuid[0]} and GPU{gpuid[1]}...")
-    checkCudaErrors(cudart.cudaSetDevice(gpuid[0]))
-    checkCudaErrors(cudart.cudaDeviceEnablePeerAccess(gpuid[1], 0))
-    checkCudaErrors(cudart.cudaSetDevice(gpuid[1]))
-    checkCudaErrors(cudart.cudaDeviceEnablePeerAccess(gpuid[0], 0))
+    check_cuda_errors(cudart.cudaSetDevice(gpuid[0]))
+    check_cuda_errors(cudart.cudaDeviceEnablePeerAccess(gpuid[1], 0))
+    check_cuda_errors(cudart.cudaSetDevice(gpuid[1]))
+    check_cuda_errors(cudart.cudaDeviceEnablePeerAccess(gpuid[0], 0))
 
     # Allocate buffers
     buf_size = 1024 * 1024 * 16 * np.dtype(np.float32).itemsize
     print(f"Allocating buffers ({int(buf_size / 1024 / 1024)}MB on GPU{gpuid[0]}, GPU{gpuid[1]} and CPU Host)...")
-    checkCudaErrors(cudart.cudaSetDevice(gpuid[0]))
-    g0 = checkCudaErrors(cudart.cudaMalloc(buf_size))
-    checkCudaErrors(cudart.cudaSetDevice(gpuid[1]))
-    g1 = checkCudaErrors(cudart.cudaMalloc(buf_size))
-    h0 = checkCudaErrors(cudart.cudaMallocHost(buf_size))  # Automatically portable with UVA
+    check_cuda_errors(cudart.cudaSetDevice(gpuid[0]))
+    g0 = check_cuda_errors(cudart.cudaMalloc(buf_size))
+    check_cuda_errors(cudart.cudaSetDevice(gpuid[1]))
+    g1 = check_cuda_errors(cudart.cudaMalloc(buf_size))
+    h0 = check_cuda_errors(cudart.cudaMallocHost(buf_size))  # Automatically portable with UVA
 
     # Create CUDA event handles
     print("Creating event handles...")
     eventflags = cudart.cudaEventBlockingSync
-    start_event = checkCudaErrors(cudart.cudaEventCreateWithFlags(eventflags))
-    stop_event = checkCudaErrors(cudart.cudaEventCreateWithFlags(eventflags))
+    start_event = check_cuda_errors(cudart.cudaEventCreateWithFlags(eventflags))
+    stop_event = check_cuda_errors(cudart.cudaEventCreateWithFlags(eventflags))
 
     # P2P memcopy() benchmark
-    checkCudaErrors(cudart.cudaEventRecord(start_event, cudart.cudaStream_t(0)))
+    check_cuda_errors(cudart.cudaEventRecord(start_event, cudart.cudaStream_t(0)))
 
     for i in range(100):
         # With UVA we don't need to specify source and target devices, the
         # runtime figures this out by itself from the pointers
         # Ping-pong copy between GPUs
         if i % 2 == 0:
-            checkCudaErrors(cudart.cudaMemcpy(g1, g0, buf_size, cudart.cudaMemcpyKind.cudaMemcpyDefault))
+            check_cuda_errors(cudart.cudaMemcpy(g1, g0, buf_size, cudart.cudaMemcpyKind.cudaMemcpyDefault))
         else:
-            checkCudaErrors(cudart.cudaMemcpy(g0, g1, buf_size, cudart.cudaMemcpyKind.cudaMemcpyDefault))
+            check_cuda_errors(cudart.cudaMemcpy(g0, g1, buf_size, cudart.cudaMemcpyKind.cudaMemcpyDefault))
 
-    checkCudaErrors(cudart.cudaEventRecord(stop_event, cudart.cudaStream_t(0)))
-    checkCudaErrors(cudart.cudaEventSynchronize(stop_event))
-    time_memcpy = checkCudaErrors(cudart.cudaEventElapsedTime(start_event, stop_event))
+    check_cuda_errors(cudart.cudaEventRecord(stop_event, cudart.cudaStream_t(0)))
+    check_cuda_errors(cudart.cudaEventSynchronize(stop_event))
+    time_memcpy = check_cuda_errors(cudart.cudaEventElapsedTime(start_event, stop_event))
     print(
         f"cudaMemcpyPeer / cudaMemcpy between GPU{gpuid[0]} and GPU{gpuid[1]}: {(1.0 / (time_memcpy / 1000.0)) * (100.0 * buf_size) / 1024.0 / 1024.0 / 1024.0:.2f}GB/s"
     )
@@ -129,8 +129,8 @@ def main():
     for i in range(int(buf_size / np.dtype(np.float32).itemsize)):
         h0_local[i] = i % 4096
 
-    checkCudaErrors(cudart.cudaSetDevice(gpuid[0]))
-    checkCudaErrors(cudart.cudaMemcpy(g0, h0, buf_size, cudart.cudaMemcpyKind.cudaMemcpyDefault))
+    check_cuda_errors(cudart.cudaSetDevice(gpuid[0]))
+    check_cuda_errors(cudart.cudaMemcpy(g0, h0, buf_size, cudart.cudaMemcpyKind.cudaMemcpyDefault))
 
     # Kernel launch configuration
     threads = cudart.dim3()
@@ -145,57 +145,61 @@ def main():
     # Run kernel on GPU 1, reading input from the GPU 0 buffer, writing
     # output to the GPU 1 buffer
     print(f"Run kernel on GPU{gpuid[1]}, taking source data from GPU{gpuid[0]} and writing to GPU{gpuid[1]}...")
-    checkCudaErrors(cudart.cudaSetDevice(gpuid[1]))
-
-    with common.KernelHelper(simplep2p, gpuid[1]) as kernelHelper:
-        simple_kernel_1 = kernelHelper.getFunction(b"SimpleKernel")
-        kernel_args_1 = ((g0, g1), (ctypes.c_void_p, ctypes.c_void_p))
-        checkCudaErrors(
-            cuda.cuLaunchKernel(
-                simple_kernel_1,
-                blocks.x,
-                blocks.y,
-                blocks.z,
-                threads.x,
-                threads.y,
-                threads.z,
-                0,
-                0,
-                kernel_args_1,
-                0,
-            )
+    check_cuda_errors(cudart.cudaSetDevice(gpuid[1]))
+
+    kernel_helper = [None] * 2
+    _simple_kernel = [None] * 2
+    kernel_args = [None] * 2
+
+    kernel_helper[1] = common.KernelHelper(simplep2p, gpuid[1])
+    _simple_kernel[1] = kernel_helper[1].get_function(b"SimpleKernel")
+    kernel_args[1] = ((g0, g1), (ctypes.c_void_p, ctypes.c_void_p))
+    check_cuda_errors(
+        cuda.cuLaunchKernel(
+            _simple_kernel[1],
+            blocks.x,
+            blocks.y,
+            blocks.z,
+            threads.x,
+            threads.y,
+            threads.z,
+            0,
+            0,
+            kernel_args[1],
+            0,
         )
+    )
 
-    checkCudaErrors(cudart.cudaDeviceSynchronize())
+    check_cuda_errors(cudart.cudaDeviceSynchronize())
 
     # Run kernel on GPU 0, reading input from the GPU 1 buffer, writing
     # output to the GPU 0 buffer
     print(f"Run kernel on GPU{gpuid[0]}, taking source data from GPU{gpuid[1]} and writing to GPU{gpuid[0]}...")
-    checkCudaErrors(cudart.cudaSetDevice(gpuid[0]))
-    with common.KernelHelper(simplep2p, gpuid[0]) as kernelHelper:
-        simple_kernel_0 = kernelHelper.getFunction(b"SimpleKernel")
-        kernel_args_0 = ((g1, g0), (ctypes.c_void_p, ctypes.c_void_p))
-        checkCudaErrors(
-            cuda.cuLaunchKernel(
-                simple_kernel_0,
-                blocks.x,
-                blocks.y,
-                blocks.z,
-                threads.x,
-                threads.y,
-                threads.z,
-                0,
-                0,
-                kernel_args_0,
-                0,
-            )
+    check_cuda_errors(cudart.cudaSetDevice(gpuid[0]))
+    kernel_helper[0] = common.KernelHelper(simplep2p, gpuid[0])
+    _simple_kernel[0] = kernel_helper[0].get_function(b"SimpleKernel")
+    kernel_args[0] = ((g1, g0), (ctypes.c_void_p, ctypes.c_void_p))
+    check_cuda_errors(
+        cuda.cuLaunchKernel(
+            _simple_kernel[0],
+            blocks.x,
+            blocks.y,
+            blocks.z,
+            threads.x,
+            threads.y,
+            threads.z,
+            0,
+            0,
+            kernel_args[0],
+            0,
         )
+    )
 
-    checkCudaErrors(cudart.cudaDeviceSynchronize())
+    check_cuda_errors(cudart.cudaDeviceSynchronize())
 
     # Copy data back to host and verify
     print(f"Copy data back to host from GPU{gpuid[0]} and verify results...")
-    checkCudaErrors(cudart.cudaMemcpy(h0, g0, buf_size, cudart.cudaMemcpyKind.cudaMemcpyDefault))
+    check_cuda_errors(cudart.cudaMemcpy(h0, g0, buf_size, cudart.cudaMemcpyKind.cudaMemcpyDefault))
 
     error_count = 0
 
@@ -210,23 +214,23 @@ def main():
 
     # Disable peer access (also unregisters memory for non-UVA cases)
     print("Disabling peer access...")
-    checkCudaErrors(cudart.cudaSetDevice(gpuid[0]))
-    checkCudaErrors(cudart.cudaDeviceDisablePeerAccess(gpuid[1]))
-    checkCudaErrors(cudart.cudaSetDevice(gpuid[1]))
-    checkCudaErrors(cudart.cudaDeviceDisablePeerAccess(gpuid[0]))
+    check_cuda_errors(cudart.cudaSetDevice(gpuid[0]))
+    check_cuda_errors(cudart.cudaDeviceDisablePeerAccess(gpuid[1]))
+    check_cuda_errors(cudart.cudaSetDevice(gpuid[1]))
+    check_cuda_errors(cudart.cudaDeviceDisablePeerAccess(gpuid[0]))
 
     # Cleanup and shutdown
     print("Shutting down...")
-    checkCudaErrors(cudart.cudaEventDestroy(start_event))
-    checkCudaErrors(cudart.cudaEventDestroy(stop_event))
-    checkCudaErrors(cudart.cudaSetDevice(gpuid[0]))
-    checkCudaErrors(cudart.cudaFree(g0))
-    checkCudaErrors(cudart.cudaSetDevice(gpuid[1]))
-    checkCudaErrors(cudart.cudaFree(g1))
-    checkCudaErrors(cudart.cudaFreeHost(h0))
+    check_cuda_errors(cudart.cudaEventDestroy(start_event))
+    check_cuda_errors(cudart.cudaEventDestroy(stop_event))
+    check_cuda_errors(cudart.cudaSetDevice(gpuid[0]))
+    check_cuda_errors(cudart.cudaFree(g0))
+    check_cuda_errors(cudart.cudaSetDevice(gpuid[1]))
+    check_cuda_errors(cudart.cudaFree(g1))
+    check_cuda_errors(cudart.cudaFreeHost(h0))
 
     for i in range(gpu_n):
-        checkCudaErrors(cudart.cudaSetDevice(i))
+        check_cuda_errors(cudart.cudaSetDevice(i))
 
     if error_count != 0:
         print("Test failed!", file=sys.stderr)
diff --git a/cuda_bindings/examples/0_Introduction/simpleZeroCopy_test.py b/cuda_bindings/examples/0_Introduction/simpleZeroCopy_test.py
index ea64017b95..d4bf44e19a 100644
--- a/cuda_bindings/examples/0_Introduction/simpleZeroCopy_test.py
+++ b/cuda_bindings/examples/0_Introduction/simpleZeroCopy_test.py
@@ -9,13 +9,13 @@
 
 import numpy as np
 from common import common
-from common.helper_cuda import checkCudaErrors
-from common.helper_string import checkCmdLineFlag, getCmdLineArgumentInt
+from common.helper_cuda import check_cuda_errors
+from common.helper_string import check_cmd_line_flag, get_cmd_line_argument_int
 
 from cuda.bindings import driver as cuda
 from cuda.bindings import runtime as cudart
 
-simpleZeroCopy = """\
+simple_zero_copy = """\
 extern "C"
 __global__ void vectorAddGPU(float *a, float *b, float *c, int N)
 {
@@ -31,7 +31,7 @@
 
 def main():
     idev = 0
-    bPinGenericMemory = False
+    b_pin_generic_memory = False
 
     import pytest
 
@@ -47,7 +47,7 @@ def main():
     if platform.machine() == "sbsa":
         pytest.skip("simpleZeroCopy is not supported on sbsa")
 
-    if checkCmdLineFlag("help"):
+    if check_cmd_line_flag("help"):
         print("Usage:  simpleZeroCopy [OPTION]\n", file=sys.stderr)
         print("Options:", file=sys.stderr)
         print("  device=[device #]  Specify the device to be used", file=sys.stderr)
@@ -55,50 +55,50 @@ def main():
         sys.exit(1)
 
     # Get the device selected by the user or default to 0, and then set it.
-    if checkCmdLineFlag("device="):
-        deviceCount = cudart.cudaGetDeviceCount()
-        idev = int(getCmdLineArgumentInt("device="))
+    if check_cmd_line_flag("device="):
+        device_count = cudart.cudaGetDeviceCount()
+        idev = int(get_cmd_line_argument_int("device="))
 
-        if idev >= deviceCount or idev < 0:
+        if idev >= device_count or idev < 0:
             print(f"Device number {idev} is invalid, will use default CUDA device 0.")
             idev = 0
 
-    if checkCmdLineFlag("use_generic_memory"):
-        bPinGenericMemory = True
+    if check_cmd_line_flag("use_generic_memory"):
+        b_pin_generic_memory = True
 
-    if bPinGenericMemory:
+    if b_pin_generic_memory:
         print("> Using Generic System Paged Memory (malloc)")
     else:
         print("> Using CUDA Host Allocated (cudaHostAlloc)")
 
-    checkCudaErrors(cudart.cudaSetDevice(idev))
+    check_cuda_errors(cudart.cudaSetDevice(idev))
 
     # Verify the selected device supports mapped memory and set the device flags for mapping host memory.
-    deviceProp = checkCudaErrors(cudart.cudaGetDeviceProperties(idev))
+    device_prop = check_cuda_errors(cudart.cudaGetDeviceProperties(idev))
 
-    if not deviceProp.canMapHostMemory:
+    if not device_prop.canMapHostMemory:
         pytest.skip(f"Device {idev} does not support mapping CPU host memory!")
 
-    checkCudaErrors(cudart.cudaSetDeviceFlags(cudart.cudaDeviceMapHost))
+    check_cuda_errors(cudart.cudaSetDeviceFlags(cudart.cudaDeviceMapHost))
 
     # Allocate mapped CPU memory
 
     nelem = 1048576
     num_bytes = nelem * np.dtype(np.float32).itemsize
 
-    if bPinGenericMemory:
+    if b_pin_generic_memory:
         a = np.empty(nelem, dtype=np.float32)
         b = np.empty(nelem, dtype=np.float32)
         c = np.empty(nelem, dtype=np.float32)
 
-        checkCudaErrors(cudart.cudaHostRegister(a, num_bytes, cudart.cudaHostRegisterMapped))
-        checkCudaErrors(cudart.cudaHostRegister(b, num_bytes, cudart.cudaHostRegisterMapped))
-        checkCudaErrors(cudart.cudaHostRegister(c, num_bytes, cudart.cudaHostRegisterMapped))
+        check_cuda_errors(cudart.cudaHostRegister(a, num_bytes, cudart.cudaHostRegisterMapped))
+        check_cuda_errors(cudart.cudaHostRegister(b, num_bytes, cudart.cudaHostRegisterMapped))
+        check_cuda_errors(cudart.cudaHostRegister(c, num_bytes, cudart.cudaHostRegisterMapped))
     else:
         flags = cudart.cudaHostAllocMapped
-        a_ptr = checkCudaErrors(cudart.cudaHostAlloc(num_bytes, flags))
-        b_ptr = checkCudaErrors(cudart.cudaHostAlloc(num_bytes, flags))
-        c_ptr = checkCudaErrors(cudart.cudaHostAlloc(num_bytes, flags))
+        a_ptr = check_cuda_errors(cudart.cudaHostAlloc(num_bytes, flags))
+        b_ptr = check_cuda_errors(cudart.cudaHostAlloc(num_bytes, flags))
+        c_ptr = check_cuda_errors(cudart.cudaHostAlloc(num_bytes, flags))
 
         a = (ctypes.c_float * nelem).from_address(a_ptr)
         b = (ctypes.c_float * nelem).from_address(b_ptr)
@@ -110,9 +110,9 @@ def main():
         b[n] = rnd.random()
 
     # Get the device pointers for the pinned CPU memory mapped into the GPU memory space
-    d_a = checkCudaErrors(cudart.cudaHostGetDevicePointer(a, 0))
-    d_b = checkCudaErrors(cudart.cudaHostGetDevicePointer(b, 0))
-    d_c = checkCudaErrors(cudart.cudaHostGetDevicePointer(c, 0))
+    d_a = check_cuda_errors(cudart.cudaHostGetDevicePointer(a, 0))
+    d_b = check_cuda_errors(cudart.cudaHostGetDevicePointer(b, 0))
+    d_c = check_cuda_errors(cudart.cudaHostGetDevicePointer(c, 0))
 
     # Call the GPU kernel using the CPU pointers residing in CPU mapped memory
     print("> vectorAddGPU kernel will add vectors using mapped CPU memory...")
@@ -124,57 +124,57 @@ def main():
     grid.x = math.ceil(nelem / float(block.x))
     grid.y = 1
     grid.z = 1
-    with common.KernelHelper(simpleZeroCopy, idev) as kernelHelper:
-        _vectorAddGPU = kernelHelper.getFunction(b"vectorAddGPU")
-        kernelArgs = (
-            (d_a, d_b, d_c, nelem),
-            (ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int),
+    kernel_helper = common.KernelHelper(simple_zero_copy, idev)
+    _vector_add_gpu = kernel_helper.get_function(b"vectorAddGPU")
+    kernel_args = (
+        (d_a, d_b, d_c, nelem),
+        (ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int),
+    )
+    check_cuda_errors(
+        cuda.cuLaunchKernel(
+            _vector_add_gpu,
+            grid.x,
+            grid.y,
+            grid.z,
+            block.x,
+            block.y,
+            block.z,
+            0,
+            cuda.CU_STREAM_LEGACY,
+            kernel_args,
+            0,
         )
-        checkCudaErrors(
-            cuda.cuLaunchKernel(
-                _vectorAddGPU,
-                grid.x,
-                grid.y,
-                grid.z,
-                block.x,
-                block.y,
-                block.z,
-                0,
-                cuda.CU_STREAM_LEGACY,
-                kernelArgs,
-                0,
-            )
-        )
-    checkCudaErrors(cudart.cudaDeviceSynchronize())
+    )
+    check_cuda_errors(cudart.cudaDeviceSynchronize())
 
     print("> Checking the results from vectorAddGPU() ...")
     # Compare the results
-    errorNorm = 0.0
-    refNorm = 0.0
+    error_norm = 0.0
+    ref_norm = 0.0
 
     for n in range(nelem):
         ref = a[n] + b[n]
         diff = c[n] - ref
-        errorNorm += diff * diff
-        refNorm += ref * ref
+        error_norm += diff * diff
+        ref_norm += ref * ref
 
-    errorNorm = math.sqrt(errorNorm)
-    refNorm = math.sqrt(refNorm)
+    error_norm = math.sqrt(error_norm)
+    ref_norm = math.sqrt(ref_norm)
 
     # Memory clean up
 
     print("Releasing CPU memory...")
 
-    if bPinGenericMemory:
-        checkCudaErrors(cudart.cudaHostUnregister(a))
-        checkCudaErrors(cudart.cudaHostUnregister(b))
-        checkCudaErrors(cudart.cudaHostUnregister(c))
+    if b_pin_generic_memory:
+        check_cuda_errors(cudart.cudaHostUnregister(a))
+        check_cuda_errors(cudart.cudaHostUnregister(b))
+        check_cuda_errors(cudart.cudaHostUnregister(c))
     else:
-        checkCudaErrors(cudart.cudaFreeHost(a))
-        checkCudaErrors(cudart.cudaFreeHost(b))
-        checkCudaErrors(cudart.cudaFreeHost(c))
+        check_cuda_errors(cudart.cudaFreeHost(a))
+        check_cuda_errors(cudart.cudaFreeHost(b))
+        check_cuda_errors(cudart.cudaFreeHost(c))
 
-    if errorNorm / refNorm >= 1.0e-7:
+    if error_norm / ref_norm >= 1.0e-7:
         print("FAILED", file=sys.stderr)
         sys.exit(1)
 
diff --git a/cuda_bindings/examples/0_Introduction/systemWideAtomics_test.py b/cuda_bindings/examples/0_Introduction/systemWideAtomics_test.py
index df52462854..94a356101f 100644
--- a/cuda_bindings/examples/0_Introduction/systemWideAtomics_test.py
+++ b/cuda_bindings/examples/0_Introduction/systemWideAtomics_test.py
@@ -7,12 +7,12 @@
 
 import numpy as np
 from common import common
-from common.helper_cuda import checkCudaErrors, findCudaDevice
+from common.helper_cuda import check_cuda_errors, find_cuda_device
 
 from cuda.bindings import driver as cuda
 from cuda.bindings import runtime as cudart
 
-systemWideAtomics = """\
+system_wide_atomics = """\
 #define LOOP_NUM 50
 
 extern "C"
@@ -63,21 +63,21 @@
 #! @param reference  reference data, computed but preallocated
 #! @param idata      input data as provided to device
 #! @param len        number of elements in reference / idata
-def verify(testData, length):
+def verify(test_data, length):
     val = 0
 
     for i in range(length * LOOP_NUM):
         val += 10
 
-    if val != testData[0]:
-        print(f"atomicAdd failed val = {val} testData = {testData[0]}")
+    if val != test_data[0]:
+        print(f"atomicAdd failed val = {val} test_data = {test_data[0]}")
         return False
 
     val = 0
     found = False
     for i in range(length):
         # second element should be a member of [0, len)
-        if i == testData[1]:
+        if i == test_data[1]:
             found = True
             break
 
@@ -91,7 +91,7 @@ def verify(testData, length):
         # third element should be len-1
         val = max(val, i)
 
-    if val != testData[2]:
+    if val != test_data[2]:
         print("atomicMax failed")
         return False
 
@@ -100,7 +100,7 @@ def verify(testData, length):
     for i in range(length):
         val = min(val, i)
 
-    if val != testData[3]:
+    if val != test_data[3]:
         print("atomicMin failed")
         return False
 
@@ -110,7 +110,7 @@ def verify(testData, length):
     for i in range(length * LOOP_NUM):
         val = 0 if val >= limit else val + 1
 
-    if val != testData[4]:
+    if val != test_data[4]:
         print("atomicInc failed")
         return False
 
@@ -120,7 +120,7 @@ def verify(testData, length):
     for i in range(length * LOOP_NUM):
         val = limit if (val == 0) or (val > limit) else val - 1
 
-    if val != testData[5]:
+    if val != test_data[5]:
         print("atomicDec failed")
         return False
 
@@ -128,7 +128,7 @@ def verify(testData, length):
 
     for i in range(length):
         # seventh element should be a member of [0, len)
-        if i == testData[6]:
+        if i == test_data[6]:
             found = True
             break
 
@@ -142,13 +142,13 @@ def verify(testData, length):
         # 8th element should be 1
         val &= 2 * i + 7
 
-    if val != testData[7]:
+    if val != test_data[7]:
         print("atomicAnd failed")
         return False
 
     # 9th element should be 0xff
     val = -1
-    if val != testData[8]:
+    if val != test_data[8]:
         print("atomicOr failed")
         return False
 
@@ -158,7 +158,7 @@ def verify(testData, length):
         # 11th element should be 0xff
         val ^= i
 
-    if val != testData[9]:
+    if val != test_data[9]:
         print("atomicXor failed")
         return False
 
@@ -172,72 +172,74 @@ def main():
         pytest.skip("Atomics not supported on Windows")
 
     # set device
-    dev_id = findCudaDevice()
-    device_prop = checkCudaErrors(cudart.cudaGetDeviceProperties(dev_id))
+    dev_id = find_cuda_device()
+    device_prop = check_cuda_errors(cudart.cudaGetDeviceProperties(dev_id))
 
     if not device_prop.managedMemory:
         pytest.skip("Unified Memory not supported on this device")
 
-    computeMode = checkCudaErrors(cudart.cudaDeviceGetAttribute(cudart.cudaDeviceAttr.cudaDevAttrComputeMode, dev_id))
-    if computeMode == cudart.cudaComputeMode.cudaComputeModeProhibited:
+    compute_mode = check_cuda_errors(
+        cudart.cudaDeviceGetAttribute(cudart.cudaDeviceAttr.cudaDevAttrComputeMode, dev_id)
+    )
+    if compute_mode == cudart.cudaComputeMode.cudaComputeModeProhibited:
         pytest.skip("This sample requires a device in either default or process exclusive mode")
 
     if device_prop.major < 6:
         pytest.skip("Requires a minimum CUDA compute 6.0 capability")
 
-    numThreads = 256
-    numBlocks = 64
-    numData = 10
+    num_threads = 256
+    num_blocks = 64
+    num_data = 10
 
     if device_prop.pageableMemoryAccess:
         print("CAN access pageable memory")
-        atom_arr_h = (ctypes.c_int * numData)(0)
+        atom_arr_h = (ctypes.c_int * num_data)(0)
         atom_arr = ctypes.addressof(atom_arr_h)
     else:
         print("CANNOT access pageable memory")
-        atom_arr = checkCudaErrors(
-            cudart.cudaMallocManaged(np.dtype(np.int32).itemsize * numData, cudart.cudaMemAttachGlobal)
+        atom_arr = check_cuda_errors(
+            cudart.cudaMallocManaged(np.dtype(np.int32).itemsize * num_data, cudart.cudaMemAttachGlobal)
         )
-        atom_arr_h = (ctypes.c_int * numData).from_address(atom_arr)
+        atom_arr_h = (ctypes.c_int * num_data).from_address(atom_arr)
 
-    for i in range(numData):
+    for i in range(num_data):
         atom_arr_h[i] = 0
 
     # To make the AND and XOR tests generate something other than 0...
     atom_arr_h[7] = atom_arr_h[9] = 0xFF
 
-    with common.KernelHelper(systemWideAtomics, dev_id) as kernelHelper:
-        _atomicKernel = kernelHelper.getFunction(b"atomicKernel")
-        kernelArgs = ((atom_arr,), (ctypes.c_void_p,))
-        checkCudaErrors(
-            cuda.cuLaunchKernel(
-                _atomicKernel,
-                numBlocks,
-                1,
-                1,  # grid dim
-                numThreads,
-                1,
-                1,  # block dim
-                0,
-                cuda.CU_STREAM_LEGACY,  # shared mem and stream
-                kernelArgs,
-                0,
-            )
-        )  # arguments
+    kernel_helper = common.KernelHelper(system_wide_atomics, dev_id)
+    _atomic_kernel = kernel_helper.get_function(b"atomicKernel")
+    kernel_args = ((atom_arr,), (ctypes.c_void_p,))
+    check_cuda_errors(
+        cuda.cuLaunchKernel(
+            _atomic_kernel,
+            num_blocks,
+            1,
+            1,  # grid dim
+            num_threads,
+            1,
+            1,  # block dim
+            0,
+            cuda.CU_STREAM_LEGACY,  # shared mem and stream
+            kernel_args,
+            0,
+        )
+    )  # arguments
     # NOTE: Python doesn't have an equivalent system atomic operations
     # atomicKernel_CPU(atom_arr_h, numBlocks * numThreads)
 
-    checkCudaErrors(cudart.cudaDeviceSynchronize())
+    check_cuda_errors(cudart.cudaDeviceSynchronize())
 
     # Compute & verify reference solution
-    testResult = verify(atom_arr_h, numThreads * numBlocks)
+    test_result = verify(atom_arr_h, num_threads * num_blocks)
 
     if device_prop.pageableMemoryAccess:
         pass
     else:
-        checkCudaErrors(cudart.cudaFree(atom_arr))
+        check_cuda_errors(cudart.cudaFree(atom_arr))
 
-    if not testResult:
+    if not test_result:
         print("systemWideAtomics completed with errors", file=sys.stderr)
         sys.exit(1)
 
diff --git a/cuda_bindings/examples/0_Introduction/vectorAddDrv_test.py b/cuda_bindings/examples/0_Introduction/vectorAddDrv_test.py
index 8ee238e36b..8c70aadd3a 100644
--- a/cuda_bindings/examples/0_Introduction/vectorAddDrv_test.py
+++ b/cuda_bindings/examples/0_Introduction/vectorAddDrv_test.py
@@ -7,11 +7,11 @@
 
 import numpy as np
 from common import common
-from common.helper_cuda import checkCudaErrors, findCudaDeviceDRV
+from common.helper_cuda import check_cuda_errors, find_cuda_device_drv
 
 from cuda.bindings import driver as cuda
 
-vectorAddDrv = """\
+vector_add_drv = """\
 /* Vector addition: C = A + B.
  *
  * This sample is a very basic sample that implements element by element
@@ -32,82 +32,82 @@
 
 
 def main():
-    N = 50000
-    nbytes = N * np.dtype(np.float32).itemsize
+    n = 50000
+    nbytes = n * np.dtype(np.float32).itemsize
 
     # Initialize
-    checkCudaErrors(cuda.cuInit(0))
-    cuDevice = findCudaDeviceDRV()
+    check_cuda_errors(cuda.cuInit(0))
+    cu_device = find_cuda_device_drv()
     # Create context
-    cuContext = checkCudaErrors(cuda.cuCtxCreate(None, 0, cuDevice))
+    cu_context = check_cuda_errors(cuda.cuCtxCreate(None, 0, cu_device))
 
-    uvaSupported = checkCudaErrors(
-        cuda.cuDeviceGetAttribute(cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, cuDevice)
+    uva_supported = check_cuda_errors(
+        cuda.cuDeviceGetAttribute(cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, cu_device)
     )
-    if not uvaSupported:
+    if not uva_supported:
         import pytest
 
         pytest.skip("Accessing pageable memory directly requires UVA")
 
-    with common.KernelHelper(vectorAddDrv, int(cuDevice)) as kernelHelper:
-        _VecAdd_kernel = kernelHelper.getFunction(b"VecAdd_kernel")
-
-        # Allocate input vectors h_A and h_B in host memory
-        h_A = np.random.rand(N).astype(dtype=np.float32)
-        h_B = np.random.rand(N).astype(dtype=np.float32)
-        h_C = np.random.rand(N).astype(dtype=np.float32)
-
-        # Allocate vectors in device memory
-        d_A = checkCudaErrors(cuda.cuMemAlloc(nbytes))
-        d_B = checkCudaErrors(cuda.cuMemAlloc(nbytes))
-        d_C = checkCudaErrors(cuda.cuMemAlloc(nbytes))
-
-        # Copy vectors from host memory to device memory
-        checkCudaErrors(cuda.cuMemcpyHtoD(d_A, h_A, nbytes))
-        checkCudaErrors(cuda.cuMemcpyHtoD(d_B, h_B, nbytes))
-
-        if True:
-            # Grid/Block configuration
-            threadsPerBlock = 256
-            blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock
-
-            kernelArgs = ((d_A, d_B, d_C, N), (None, None, None, ctypes.c_int))
-
-            # Launch the CUDA kernel
-            checkCudaErrors(
-                cuda.cuLaunchKernel(
-                    _VecAdd_kernel,
-                    blocksPerGrid,
-                    1,
-                    1,
-                    threadsPerBlock,
-                    1,
-                    1,
-                    0,
-                    0,
-                    kernelArgs,
-                    0,
-                )
+    kernel_helper = common.KernelHelper(vector_add_drv, int(cu_device))
+    _vec_add_kernel = kernel_helper.get_function(b"VecAdd_kernel")
+
+    # Allocate input vectors h_a and h_b in host memory
+    h_a = np.random.rand(n).astype(dtype=np.float32)
+    h_b = np.random.rand(n).astype(dtype=np.float32)
+    h_c = np.random.rand(n).astype(dtype=np.float32)
+
+    # Allocate vectors in device memory
+    d_a = check_cuda_errors(cuda.cuMemAlloc(nbytes))
+    d_b = check_cuda_errors(cuda.cuMemAlloc(nbytes))
+    d_c = check_cuda_errors(cuda.cuMemAlloc(nbytes))
+
+    # Copy vectors from host memory to device memory
+    check_cuda_errors(cuda.cuMemcpyHtoD(d_a, h_a, nbytes))
+    check_cuda_errors(cuda.cuMemcpyHtoD(d_b, h_b, nbytes))
+
+    if True:
+        # Grid/Block configuration
+        threads_per_block = 256
+        blocks_per_grid = (n + threads_per_block - 1) / threads_per_block
+
+        kernel_args = ((d_a, d_b, d_c, n), (None, None, None, ctypes.c_int))
+
+        # Launch the CUDA kernel
+        check_cuda_errors(
+            cuda.cuLaunchKernel(
+                _vec_add_kernel,
+                blocks_per_grid,
+                1,
+                1,
+                threads_per_block,
+                1,
+                1,
+                0,
+                0,
+                kernel_args,
+                0,
             )
-        else:
-            pass
-
-        # Copy result from device memory to host memory
-        # h_C contains the result in host memory
-        checkCudaErrors(cuda.cuMemcpyDtoH(h_C, d_C, nbytes))
-
-        for i in range(N):
-            sum_all = h_A[i] + h_B[i]
-            if math.fabs(h_C[i] - sum_all) > 1e-7:
-                break
-
-        # Free device memory
-        checkCudaErrors(cuda.cuMemFree(d_A))
-        checkCudaErrors(cuda.cuMemFree(d_B))
-        checkCudaErrors(cuda.cuMemFree(d_C))
-
-    checkCudaErrors(cuda.cuCtxDestroy(cuContext))
-    if i + 1 != N:
+        )
+    else:
+        pass
+
+    # Copy result from device memory to host memory
+    # h_c contains the result in host memory
+    check_cuda_errors(cuda.cuMemcpyDtoH(h_c, d_c, nbytes))
+
+    for i in range(n):
+        sum_all = h_a[i] + h_b[i]
+        if math.fabs(h_c[i] - sum_all) > 1e-7:
+            break
+
+    # Free device memory
+    check_cuda_errors(cuda.cuMemFree(d_a))
+    check_cuda_errors(cuda.cuMemFree(d_b))
+    check_cuda_errors(cuda.cuMemFree(d_c))
+
+    check_cuda_errors(cuda.cuCtxDestroy(cu_context))
+    if i + 1 != n:
         print("Result = FAIL", file=sys.stderr)
         sys.exit(1)
 
diff --git a/cuda_bindings/examples/0_Introduction/vectorAddMMAP_test.py b/cuda_bindings/examples/0_Introduction/vectorAddMMAP_test.py
index c7f9e6275b..d5e2e3d26f 100644
--- a/cuda_bindings/examples/0_Introduction/vectorAddMMAP_test.py
+++ b/cuda_bindings/examples/0_Introduction/vectorAddMMAP_test.py
@@ -8,11 +8,11 @@
 
 import numpy as np
 from common import common
-from common.helper_cuda import checkCudaErrors, findCudaDeviceDRV
+from common.helper_cuda import check_cuda_errors, find_cuda_device_drv
 
 from cuda.bindings import driver as cuda
 
-vectorAddMMAP = """\
+vector_add_mmap = """\
 /* Vector addition: C = A + B.
  *
  * This sample is a very basic sample that implements element by element
@@ -36,35 +36,35 @@ def round_up(x, y):
     return int((x - 1) / y + 1) * y
 
 
-def getBackingDevices(cuDevice):
-    num_devices = checkCudaErrors(cuda.cuDeviceGetCount())
+def get_backing_devices(cu_device):
+    num_devices = check_cuda_errors(cuda.cuDeviceGetCount())
 
-    backingDevices = [cuDevice]
+    backing_devices = [cu_device]
     for dev in range(num_devices):
         # The mapping device is already in the backingDevices vector
-        if int(dev) == int(cuDevice):
+        if int(dev) == int(cu_device):
             continue
 
         # Only peer capable devices can map each others memory
-        capable = checkCudaErrors(cuda.cuDeviceCanAccessPeer(cuDevice, dev))
+        capable = check_cuda_errors(cuda.cuDeviceCanAccessPeer(cu_device, dev))
         if not capable:
             continue
 
         # The device needs to support virtual address management for the required apis to work
-        attributeVal = checkCudaErrors(
+        attribute_val = check_cuda_errors(
             cuda.cuDeviceGetAttribute(
                 cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED,
-                cuDevice,
+                cu_device,
             )
         )
-        if attributeVal == 0:
+        if attribute_val == 0:
             continue
 
-        backingDevices.append(cuda.CUdevice(dev))
-    return backingDevices
+        backing_devices.append(cuda.CUdevice(dev))
+    return backing_devices
 
 
-def simpleMallocMultiDeviceMmap(size, residentDevices, mappingDevices, align=0):
+def simple_malloc_multi_device_mmap(size, resident_devices, mapping_devices, align=0):
     min_granularity = 0
 
     # Setup the properties common for all the chunks
@@ -77,7 +77,7 @@ def simpleMallocMultiDeviceMmap(size, residentDevices, mappingDevices, align=0):
 
     # Get the minimum granularity needed for the resident devices
     # (the max of the minimum granularity of each participating device)
-    for device in residentDevices:
+    for device in resident_devices:
         prop.location.id = device
         status, granularity = cuda.cuMemGetAllocationGranularity(
             prop, cuda.CUmemAllocationGranularity_flags.CU_MEM_ALLOC_GRANULARITY_MINIMUM
@@ -89,7 +89,7 @@ def simpleMallocMultiDeviceMmap(size, residentDevices, mappingDevices, align=0):
 
     # Get the minimum granularity needed for the accessing devices
     # (the max of the minimum granularity of each participating device)
-    for device in mappingDevices:
+    for device in mapping_devices:
         prop.location.id = device
         status, granularity = cuda.cuMemGetAllocationGranularity(
             prop, cuda.CUmemAllocationGranularity_flags.CU_MEM_ALLOC_GRANULARITY_MINIMUM
@@ -103,28 +103,28 @@ def simpleMallocMultiDeviceMmap(size, residentDevices, mappingDevices, align=0):
     # Essentially size = N * residentDevices.size() * min_granularity is the requirement,
     # since each piece of the allocation will be stripeSize = N * min_granularity
     # and the min_granularity requirement applies to each stripeSize piece of the allocation.
-    size = round_up(size, len(residentDevices) * min_granularity)
-    stripeSize = size / len(residentDevices)
+    size = round_up(size, len(resident_devices) * min_granularity)
+    stripe_size = size / len(resident_devices)
 
     # Return the rounded up size to the caller for use in the free
-    allocationSize = size
+    allocation_size = size
 
     # Reserve the required contiguous VA space for the allocations
     status, dptr = cuda.cuMemAddressReserve(size, align, cuda.CUdeviceptr(0), 0)
     if status != cuda.CUresult.CUDA_SUCCESS:
-        simpleFreeMultiDeviceMmap(dptr, size)
+        simple_free_multi_device_mmap(dptr, size)
         return status, None, None
 
     # Create and map the backings on each gpu
     # note: reusing CUmemAllocationProp prop from earlier with prop.type & prop.location.type already specified.
-    for idx in range(len(residentDevices)):
+    for idx in range(len(resident_devices)):
         # Set the location for this chunk to this device
-        prop.location.id = residentDevices[idx]
+        prop.location.id = resident_devices[idx]
 
         # Create the allocation as a pinned allocation on this device
-        status, allocationHandle = cuda.cuMemCreate(stripeSize, prop, 0)
+        status, allocation_handle = cuda.cuMemCreate(stripe_size, prop, 0)
         if status != cuda.CUresult.CUDA_SUCCESS:
-            simpleFreeMultiDeviceMmap(dptr, size)
+            simple_free_multi_device_mmap(dptr, size)
             return status, None, None
 
         # Assign the chunk to the appropriate VA range and release the handle.
@@ -132,10 +132,10 @@ def simpleMallocMultiDeviceMmap(size, residentDevices, mappingDevices, align=0):
         # Since we do not need to make any other mappings of this memory or export it,
         # we no longer need and can release the allocationHandle.
         # The allocation will be kept live until it is unmapped.
-        (status,) = cuda.cuMemMap(int(dptr) + (stripeSize * idx), stripeSize, 0, allocationHandle, 0)
+        (status,) = cuda.cuMemMap(int(dptr) + (stripe_size * idx), stripe_size, 0, allocation_handle, 0)
 
         # the handle needs to be released even if the mapping failed.
-        (status2,) = cuda.cuMemRelease(allocationHandle)
+        (status2,) = cuda.cuMemRelease(allocation_handle)
         if status != cuda.CUresult.CUDA_SUCCESS:
             # cuMemRelease should not have failed here
             # as the handle was just allocated successfully
@@ -144,31 +144,31 @@ def simpleMallocMultiDeviceMmap(size, residentDevices, mappingDevices, align=0):
 
         # Cleanup in case of any mapping failures.
         if status != cuda.CUresult.CUDA_SUCCESS:
-            simpleFreeMultiDeviceMmap(dptr, size)
+            simple_free_multi_device_mmap(dptr, size)
             return status, None, None
 
     # Each accessDescriptor will describe the mapping requirement for a single device
-    accessDescriptors = [cuda.CUmemAccessDesc()] * len(mappingDevices)
+    access_descriptors = [cuda.CUmemAccessDesc()] * len(mapping_devices)
 
     # Prepare the access descriptor array indicating where and how the backings should be visible.
-    for idx in range(len(mappingDevices)):
+    for idx in range(len(mapping_devices)):
         # Specify which device we are adding mappings for.
-        accessDescriptors[idx].location.type = cuda.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE
-        accessDescriptors[idx].location.id = mappingDevices[idx]
+        access_descriptors[idx].location.type = cuda.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE
+        access_descriptors[idx].location.id = mapping_devices[idx]
 
         # Specify both read and write access.
-        accessDescriptors[idx].flags = cuda.CUmemAccess_flags.CU_MEM_ACCESS_FLAGS_PROT_READWRITE
+        access_descriptors[idx].flags = cuda.CUmemAccess_flags.CU_MEM_ACCESS_FLAGS_PROT_READWRITE
 
     # Apply the access descriptors to the whole VA range.
-    (status,) = cuda.cuMemSetAccess(dptr, size, accessDescriptors, len(accessDescriptors))
+    (status,) = cuda.cuMemSetAccess(dptr, size, access_descriptors, len(access_descriptors))
     if status != cuda.CUresult.CUDA_SUCCESS:
-        simpleFreeMultiDeviceMmap(dptr, size)
+        simple_free_multi_device_mmap(dptr, size)
         return status, None, None
 
-    return (status, dptr, allocationSize)
+    return (status, dptr, allocation_size)
 
 
-def simpleFreeMultiDeviceMmap(dptr, size):
+def simple_free_multi_device_mmap(dptr, size):
     # Unmap the mapped virtual memory region
     # Since the handles to the mapped backing stores have already been released
     # by cuMemRelease, and these are the only/last mappings referencing them,
@@ -204,97 +204,97 @@ def main():
     if platform.machine() == "sbsa":
         pytest.skip("vectorAddMMAP is not supported on sbsa")
 
-    N = 50000
-    size = N * np.dtype(np.float32).itemsize
+    n = 50000
+    size = n * np.dtype(np.float32).itemsize
 
     # Initialize
-    checkCudaErrors(cuda.cuInit(0))
+    check_cuda_errors(cuda.cuInit(0))
 
-    cuDevice = findCudaDeviceDRV()
+    cu_device = find_cuda_device_drv()
 
     # Check that the selected device supports virtual address management
-    attributeVal = checkCudaErrors(
+    attribute_val = check_cuda_errors(
         cuda.cuDeviceGetAttribute(
             cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED,
-            cuDevice,
+            cu_device,
         )
     )
-    print(f"Device {cuDevice} VIRTUAL ADDRESS MANAGEMENT SUPPORTED = {attributeVal}.")
-    if not attributeVal:
-        pytest.skip(f"Device {cuDevice} doesn't support VIRTUAL ADDRESS MANAGEMENT.")
+    print(f"Device {cu_device} VIRTUAL ADDRESS MANAGEMENT SUPPORTED = {attribute_val}.")
+    if not attribute_val:
+        pytest.skip(f"Device {cu_device} doesn't support VIRTUAL ADDRESS MANAGEMENT.")
 
     # The vector addition happens on cuDevice, so the allocations need to be mapped there.
-    mappingDevices = [cuDevice]
+    mapping_devices = [cu_device]
 
     # Collect devices accessible by the mapping device (cuDevice) into the backingDevices vector.
-    backingDevices = getBackingDevices(cuDevice)
+    backing_devices = get_backing_devices(cu_device)
 
     # Create context
-    cuContext = checkCudaErrors(cuda.cuCtxCreate(None, 0, cuDevice))
-
-    with common.KernelHelper(vectorAddMMAP, int(cuDevice)) as kernelHelper:
-        _VecAdd_kernel = kernelHelper.getFunction(b"VecAdd_kernel")
-
-        # Allocate input vectors h_A and h_B in host memory
-        h_A = np.random.rand(size).astype(dtype=np.float32)
-        h_B = np.random.rand(size).astype(dtype=np.float32)
-        h_C = np.random.rand(size).astype(dtype=np.float32)
-
-        # Allocate vectors in device memory
-        # note that a call to cuCtxEnablePeerAccess is not needed even though
-        # the backing devices and mapping device are not the same.
-        # This is because the cuMemSetAccess call explicitly specifies
-        # the cross device mapping.
-        # cuMemSetAccess is still subject to the constraints of cuDeviceCanAccessPeer
-        # for cross device mappings (hence why we checked cuDeviceCanAccessPeer earlier).
-        d_A, allocationSize = checkCudaErrors(simpleMallocMultiDeviceMmap(size, backingDevices, mappingDevices))
-        d_B, _ = checkCudaErrors(simpleMallocMultiDeviceMmap(size, backingDevices, mappingDevices))
-        d_C, _ = checkCudaErrors(simpleMallocMultiDeviceMmap(size, backingDevices, mappingDevices))
-
-        # Copy vectors from host memory to device memory
-        checkCudaErrors(cuda.cuMemcpyHtoD(d_A, h_A, size))
-        checkCudaErrors(cuda.cuMemcpyHtoD(d_B, h_B, size))
-
-        # Grid/Block configuration
-        threadsPerBlock = 256
-        blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock
-
-        kernelArgs = ((d_A, d_B, d_C, N), (None, None, None, ctypes.c_int))
-
-        # Launch the CUDA kernel
-        checkCudaErrors(
-            cuda.cuLaunchKernel(
-                _VecAdd_kernel,
-                blocksPerGrid,
-                1,
-                1,
-                threadsPerBlock,
-                1,
-                1,
-                0,
-                0,
-                kernelArgs,
-                0,
-            )
+    cu_context = check_cuda_errors(cuda.cuCtxCreate(None, 0, cu_device))
+
+    kernel_helper = common.KernelHelper(vector_add_mmap, int(cu_device))
+    _vec_add_kernel = kernel_helper.get_function(b"VecAdd_kernel")
+
+    # Allocate input vectors h_a and h_b in host memory
+    h_a = np.random.rand(size).astype(dtype=np.float32)
+    h_b = np.random.rand(size).astype(dtype=np.float32)
+    h_c = np.random.rand(size).astype(dtype=np.float32)
+
+    # Allocate vectors in device memory
+    # note that a call to cuCtxEnablePeerAccess is not needed even though
+    # the backing devices and mapping device are not the same.
+    # This is because the cuMemSetAccess call explicitly specifies
+    # the cross device mapping.
+    # cuMemSetAccess is still subject to the constraints of cuDeviceCanAccessPeer
+    # for cross device mappings (hence why we checked cuDeviceCanAccessPeer earlier).
+    d_a, allocation_size = check_cuda_errors(simple_malloc_multi_device_mmap(size, backing_devices, mapping_devices))
+    d_b, _ = check_cuda_errors(simple_malloc_multi_device_mmap(size, backing_devices, mapping_devices))
+    d_c, _ = check_cuda_errors(simple_malloc_multi_device_mmap(size, backing_devices, mapping_devices))
+
+    # Copy vectors from host memory to device memory
+    check_cuda_errors(cuda.cuMemcpyHtoD(d_a, h_a, size))
+    check_cuda_errors(cuda.cuMemcpyHtoD(d_b, h_b, size))
+
+    # Grid/Block configuration
+    threads_per_block = 256
+    blocks_per_grid = (n + threads_per_block - 1) / threads_per_block
+
+    kernel_args = ((d_a, d_b, d_c, n), (None, None, None, ctypes.c_int))
+
+    # Launch the CUDA kernel
+    check_cuda_errors(
+        cuda.cuLaunchKernel(
+            _vec_add_kernel,
+            blocks_per_grid,
+            1,
+            1,
+            threads_per_block,
+            1,
+            1,
+            0,
+            0,
+            kernel_args,
+            0,
         )
+    )
 
-        # Copy result from device memory to host memory
-        # h_C contains the result in host memory
-        checkCudaErrors(cuda.cuMemcpyDtoH(h_C, d_C, size))
+    # Copy result from device memory to host memory
+    # h_c contains the result in host memory
+    check_cuda_errors(cuda.cuMemcpyDtoH(h_c, d_c, size))
 
-        # Verify result
-        for i in range(N):
-            sum_all = h_A[i] + h_B[i]
-            if math.fabs(h_C[i] - sum_all) > 1e-7:
-                break
+    # Verify result
+    for i in range(n):
+        sum_all = h_a[i] + h_b[i]
+        if math.fabs(h_c[i] - sum_all) > 1e-7:
+            break
 
-        checkCudaErrors(simpleFreeMultiDeviceMmap(d_A, allocationSize))
-        checkCudaErrors(simpleFreeMultiDeviceMmap(d_B, allocationSize))
-        checkCudaErrors(simpleFreeMultiDeviceMmap(d_C, allocationSize))
+    check_cuda_errors(simple_free_multi_device_mmap(d_a, allocation_size))
+    check_cuda_errors(simple_free_multi_device_mmap(d_b, allocation_size))
+    check_cuda_errors(simple_free_multi_device_mmap(d_c, allocation_size))
 
-    checkCudaErrors(cuda.cuCtxDestroy(cuContext))
+    check_cuda_errors(cuda.cuCtxDestroy(cu_context))
 
-    if i + 1 != N:
+    if i + 1 != n:
         print("Result = FAIL", file=sys.stderr)
         sys.exit(1)
 
diff --git a/cuda_bindings/examples/2_Concepts_and_Techniques/streamOrderedAllocation_test.py b/cuda_bindings/examples/2_Concepts_and_Techniques/streamOrderedAllocation_test.py
index afe769ca15..f26dd2dabe 100644
--- a/cuda_bindings/examples/2_Concepts_and_Techniques/streamOrderedAllocation_test.py
+++ b/cuda_bindings/examples/2_Concepts_and_Techniques/streamOrderedAllocation_test.py
@@ -9,13 +9,13 @@
 
 import numpy as np
 from common import common
-from common.helper_cuda import checkCudaErrors, findCudaDevice
-from common.helper_string import checkCmdLineFlag
+from common.helper_cuda import check_cuda_errors, find_cuda_device
+from common.helper_string import check_cmd_line_flag
 
 from cuda.bindings import driver as cuda
 from cuda.bindings import runtime as cudart
 
-streamOrderedAllocation = """\
+stream_ordered_allocation = """\
 /* Add two vectors on the GPU */
 extern "C"
 __global__ void vectorAddGPU(const float *a, const float *b, float *c, int N)
@@ -31,18 +31,18 @@
 MAX_ITER = 20
 
 
-def basicStreamOrderedAllocation(dev, nelem, a, b, c):
+def basic_stream_ordered_allocation(dev, nelem, a, b, c):
     num_bytes = nelem * np.dtype(np.float32).itemsize
 
     print("Starting basicStreamOrderedAllocation()")
-    checkCudaErrors(cudart.cudaSetDevice(dev))
-    stream = checkCudaErrors(cudart.cudaStreamCreateWithFlags(cudart.cudaStreamNonBlocking))
+    check_cuda_errors(cudart.cudaSetDevice(dev))
+    stream = check_cuda_errors(cudart.cudaStreamCreateWithFlags(cudart.cudaStreamNonBlocking))
 
-    d_a = checkCudaErrors(cudart.cudaMallocAsync(num_bytes, stream))
-    d_b = checkCudaErrors(cudart.cudaMallocAsync(num_bytes, stream))
-    d_c = checkCudaErrors(cudart.cudaMallocAsync(num_bytes, stream))
-    checkCudaErrors(cudart.cudaMemcpyAsync(d_a, a, num_bytes, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice, stream))
-    checkCudaErrors(cudart.cudaMemcpyAsync(d_b, b, num_bytes, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice, stream))
+    d_a = check_cuda_errors(cudart.cudaMallocAsync(num_bytes, stream))
+    d_b = check_cuda_errors(cudart.cudaMallocAsync(num_bytes, stream))
+    d_c = check_cuda_errors(cudart.cudaMallocAsync(num_bytes, stream))
+    check_cuda_errors(cudart.cudaMemcpyAsync(d_a, a, num_bytes, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice, stream))
+    check_cuda_errors(cudart.cudaMemcpyAsync(d_b, b, num_bytes, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice, stream))
 
     block = cudart.dim3()
     block.x = 256
@@ -53,13 +53,13 @@ def basicStreamOrderedAllocation(dev, nelem, a, b, c):
     grid.y = 1
     grid.z = 1
 
-    kernelArgs = (
+    kernel_args = (
         (d_a, d_b, d_c, nelem),
         (ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int),
     )
-    checkCudaErrors(
+    check_cuda_errors(
         cuda.cuLaunchKernel(
-            _vectorAddGPU,
+            _vector_add_gpu,
             grid.x,
             grid.y,
             grid.z,  # grid dim
@@ -68,68 +68,72 @@ def basicStreamOrderedAllocation(dev, nelem, a, b, c):
             block.z,  # block dim
             0,
             stream,  # shared mem and stream
-            kernelArgs,
+            kernel_args,
             0,
         )
     )  # arguments
 
-    checkCudaErrors(cudart.cudaFreeAsync(d_a, stream))
-    checkCudaErrors(cudart.cudaFreeAsync(d_b, stream))
-    checkCudaErrors(cudart.cudaMemcpyAsync(c, d_c, num_bytes, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost, stream))
-    checkCudaErrors(cudart.cudaFreeAsync(d_c, stream))
-    checkCudaErrors(cudart.cudaStreamSynchronize(stream))
+    check_cuda_errors(cudart.cudaFreeAsync(d_a, stream))
+    check_cuda_errors(cudart.cudaFreeAsync(d_b, stream))
+    check_cuda_errors(cudart.cudaMemcpyAsync(c, d_c, num_bytes, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost, stream))
+    check_cuda_errors(cudart.cudaFreeAsync(d_c, stream))
+    check_cuda_errors(cudart.cudaStreamSynchronize(stream))
 
     # Compare the results
     print("> Checking the results from vectorAddGPU() ...")
-    errorNorm = 0.0
-    refNorm = 0.0
+    error_norm = 0.0
+    ref_norm = 0.0
 
     for n in range(nelem):
         ref = a[n] + b[n]
         diff = c[n] - ref
-        errorNorm += diff * diff
-        refNorm += ref * ref
+        error_norm += diff * diff
+        ref_norm += ref * ref
 
-    errorNorm = math.sqrt(errorNorm)
-    refNorm = math.sqrt(refNorm)
+    error_norm = math.sqrt(error_norm)
+    ref_norm = math.sqrt(ref_norm)
 
-    checkCudaErrors(cudart.cudaStreamDestroy(stream))
+    check_cuda_errors(cudart.cudaStreamDestroy(stream))
 
-    return errorNorm / refNorm < 1.0e-6
+    return error_norm / ref_norm < 1.0e-6
 
 
 # streamOrderedAllocationPostSync(): demonstrates If the application wants the memory to persist in the pool beyond
 # synchronization, then it sets the release threshold on the pool. This way, when the application reaches the "steady state",
 # it is no longer allocating/freeing memory from the OS.
-def streamOrderedAllocationPostSync(dev, nelem, a, b, c):
+def stream_ordered_allocation_post_sync(dev, nelem, a, b, c):
     num_bytes = nelem * np.dtype(np.float32).itemsize
 
     print("Starting streamOrderedAllocationPostSync()")
-    checkCudaErrors(cudart.cudaSetDevice(dev))
-    stream = checkCudaErrors(cudart.cudaStreamCreateWithFlags(cudart.cudaStreamNonBlocking))
-    start = checkCudaErrors(cudart.cudaEventCreate())
-    end = checkCudaErrors(cudart.cudaEventCreate())
+    check_cuda_errors(cudart.cudaSetDevice(dev))
+    stream = check_cuda_errors(cudart.cudaStreamCreateWithFlags(cudart.cudaStreamNonBlocking))
+    start = check_cuda_errors(cudart.cudaEventCreate())
+    end = check_cuda_errors(cudart.cudaEventCreate())
 
-    memPool = checkCudaErrors(cudart.cudaDeviceGetDefaultMemPool(dev))
-    thresholdVal = cuda.cuuint64_t(ctypes.c_uint64(-1).value)
+    mem_pool = check_cuda_errors(cudart.cudaDeviceGetDefaultMemPool(dev))
+    threshold_val = cuda.cuuint64_t(ctypes.c_uint64(-1).value)
     # Set high release threshold on the default pool so that cudaFreeAsync will not actually release memory to the system.
     # By default, the release threshold for a memory pool is set to zero. This implies that the CUDA driver is
     # allowed to release a memory chunk back to the system as long as it does not contain any active suballocations.
-    checkCudaErrors(
+    check_cuda_errors(
         cudart.cudaMemPoolSetAttribute(
-            memPool,
+            mem_pool,
             cudart.cudaMemPoolAttr.cudaMemPoolAttrReleaseThreshold,
-            thresholdVal,
+            threshold_val,
         )
     )
     # Record teh start event
-    checkCudaErrors(cudart.cudaEventRecord(start, stream))
+    check_cuda_errors(cudart.cudaEventRecord(start, stream))
     for _i in range(MAX_ITER):
-        d_a = checkCudaErrors(cudart.cudaMallocAsync(num_bytes, stream))
-        d_b = checkCudaErrors(cudart.cudaMallocAsync(num_bytes, stream))
-        d_c = checkCudaErrors(cudart.cudaMallocAsync(num_bytes, stream))
-        checkCudaErrors(cudart.cudaMemcpyAsync(d_a, a, num_bytes, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice, stream))
-        checkCudaErrors(cudart.cudaMemcpyAsync(d_b, b, num_bytes, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice, stream))
+        d_a = check_cuda_errors(cudart.cudaMallocAsync(num_bytes, stream))
+        d_b = check_cuda_errors(cudart.cudaMallocAsync(num_bytes, stream))
+        d_c = check_cuda_errors(cudart.cudaMallocAsync(num_bytes, stream))
+        check_cuda_errors(
+            cudart.cudaMemcpyAsync(d_a, a, num_bytes, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice, stream)
+        )
+        check_cuda_errors(
+            cudart.cudaMemcpyAsync(d_b, b, num_bytes, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice, stream)
+        )
 
         block = cudart.dim3()
         block.x = 256
@@ -140,13 +144,13 @@ def streamOrderedAllocationPostSync(dev, nelem, a, b, c):
         grid.y = 1
         grid.z = 1
 
-        kernelArgs = (
+        kernel_args = (
             (d_a, d_b, d_c, nelem),
             (ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int),
         )
-        checkCudaErrors(
+        check_cuda_errors(
             cuda.cuLaunchKernel(
-                _vectorAddGPU,
+                _vector_add_gpu,
                 grid.x,
                 grid.y,
                 grid.z,  # grid dim
@@ -155,40 +159,42 @@ def streamOrderedAllocationPostSync(dev, nelem, a, b, c):
                 block.z,  # block dim
                 0,
                 stream,  # shared mem and stream
-                kernelArgs,
+                kernel_args,
                 0,
             )
         )  # arguments
 
-        checkCudaErrors(cudart.cudaFreeAsync(d_a, stream))
-        checkCudaErrors(cudart.cudaFreeAsync(d_b, stream))
-        checkCudaErrors(cudart.cudaMemcpyAsync(c, d_c, num_bytes, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost, stream))
-        checkCudaErrors(cudart.cudaFreeAsync(d_c, stream))
-        checkCudaErrors(cudart.cudaStreamSynchronize(stream))
-    checkCudaErrors(cudart.cudaEventRecord(end, stream))
+        check_cuda_errors(cudart.cudaFreeAsync(d_a, stream))
+        check_cuda_errors(cudart.cudaFreeAsync(d_b, stream))
+        check_cuda_errors(
+            cudart.cudaMemcpyAsync(c, d_c, num_bytes, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost, stream)
+        )
+        check_cuda_errors(cudart.cudaFreeAsync(d_c, stream))
+        check_cuda_errors(cudart.cudaStreamSynchronize(stream))
+    check_cuda_errors(cudart.cudaEventRecord(end, stream))
     # Wait for the end event to complete
-    checkCudaErrors(cudart.cudaEventSynchronize(end))
+    check_cuda_errors(cudart.cudaEventSynchronize(end))
 
-    msecTotal = checkCudaErrors(cudart.cudaEventElapsedTime(start, end))
-    print(f"Total elapsed time = {msecTotal} ms over {MAX_ITER} iterations")
+    msec_total = check_cuda_errors(cudart.cudaEventElapsedTime(start, end))
+    print(f"Total elapsed time = {msec_total} ms over {MAX_ITER} iterations")
 
     # Compare the results
     print("> Checking the results from vectorAddGPU() ...")
-    errorNorm = 0.0
-    refNorm = 0.0
+    error_norm = 0.0
+    ref_norm = 0.0
 
     for n in range(nelem):
         ref = a[n] + b[n]
         diff = c[n] - ref
-        errorNorm += diff * diff
-        refNorm += ref * ref
+        error_norm += diff * diff
+        ref_norm += ref * ref
 
-    errorNorm = math.sqrt(errorNorm)
-    refNorm = math.sqrt(refNorm)
+    error_norm = math.sqrt(error_norm)
+    ref_norm = math.sqrt(ref_norm)
 
-    checkCudaErrors(cudart.cudaStreamDestroy(stream))
+    check_cuda_errors(cudart.cudaStreamDestroy(stream))
 
-    return errorNorm / refNorm < 1.0e-6
+    return error_norm / ref_norm < 1.0e-6
 
 
 def main():
@@ -198,42 +204,42 @@ def main():
         pytest.skip("streamOrderedAllocation is not supported on Mac OSX")
 
     cuda.cuInit(0)
-    if checkCmdLineFlag("help"):
+    if check_cmd_line_flag("help"):
         print("Usage:  streamOrderedAllocation [OPTION]\n", file=sys.stderr)
         print("Options:", file=sys.stderr)
         print("  device=[device #]  Specify the device to be used", file=sys.stderr)
         sys.exit(1)
 
-    dev = findCudaDevice()
+    dev = find_cuda_device()
 
-    version = checkCudaErrors(cudart.cudaDriverGetVersion())
+    version = check_cuda_errors(cudart.cudaDriverGetVersion())
     if version < 11030:
-        isMemPoolSupported = False
+        is_mem_pool_supported = False
     else:
-        isMemPoolSupported = checkCudaErrors(
+        is_mem_pool_supported = check_cuda_errors(
             cudart.cudaDeviceGetAttribute(cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED, dev)
         )
-    if not isMemPoolSupported:
+    if not is_mem_pool_supported:
         pytest.skip("Waiving execution as device does not support Memory Pools")
 
-    global _vectorAddGPU
-    with common.KernelHelper(streamOrderedAllocation, dev) as kernelHelper:
-        _vectorAddGPU = kernelHelper.getFunction(b"vectorAddGPU")
+    global _vector_add_gpu
+    kernel_helper = common.KernelHelper(stream_ordered_allocation, dev)
+    _vector_add_gpu = kernel_helper.get_function(b"vectorAddGPU")
 
-        # Allocate CPU memory
-        nelem = 1048576
-        nelem * np.dtype(np.float32).itemsize
+    # Allocate CPU memory
+    nelem = 1048576
+    nelem * np.dtype(np.float32).itemsize
 
-        a = np.zeros(nelem, dtype="float32")
-        b = np.zeros(nelem, dtype="float32")
-        c = np.zeros(nelem, dtype="float32")
-        # Initialize the vectors
-        for i in range(nelem):
-            a[i] = rnd.random()
-            b[i] = rnd.random()
+    a = np.zeros(nelem, dtype="float32")
+    b = np.zeros(nelem, dtype="float32")
+    c = np.zeros(nelem, dtype="float32")
+    # Initialize the vectors
+    for i in range(nelem):
+        a[i] = rnd.random()
+        b[i] = rnd.random()
 
-        ret1 = basicStreamOrderedAllocation(dev, nelem, a, b, c)
-        ret2 = streamOrderedAllocationPostSync(dev, nelem, a, b, c)
+    ret1 = basic_stream_ordered_allocation(dev, nelem, a, b, c)
+    ret2 = stream_ordered_allocation_post_sync(dev, nelem, a, b, c)
 
     if not ret1 or not ret2:
         sys.exit(1)
diff --git a/cuda_bindings/examples/3_CUDA_Features/globalToShmemAsyncCopy_test.py b/cuda_bindings/examples/3_CUDA_Features/globalToShmemAsyncCopy_test.py
index aaa03e446a..722d19dcb5 100644
--- a/cuda_bindings/examples/3_CUDA_Features/globalToShmemAsyncCopy_test.py
+++ b/cuda_bindings/examples/3_CUDA_Features/globalToShmemAsyncCopy_test.py
@@ -9,16 +9,16 @@
 
 import numpy as np
 from common import common
-from common.helper_cuda import checkCudaErrors, findCudaDevice
-from common.helper_string import checkCmdLineFlag, getCmdLineArgumentInt
+from common.helper_cuda import check_cuda_errors, find_cuda_device
+from common.helper_string import check_cmd_line_flag, get_cmd_line_argument_int
 
 from cuda.bindings import driver as cuda
 from cuda.bindings import runtime as cudart
 
-blockSize = 16
+block_size = 16
 
 
-class kernels(Enum):
+class Kernels(Enum):
     AsyncCopyMultiStageLargeChunk = 0
     AsyncCopyLargeChunk = 1
     AsyncCopyLargeChunkAWBarrier = 2
@@ -29,7 +29,7 @@ class kernels(Enum):
     NaiveLargeChunk = 7
 
 
-kernelNames = [
+kernel_names = [
     "AsyncCopyMultiStageLargeChunk",
     "AsyncCopyLargeChunk",
     "AsyncCopyLargeChunkAWBarrier",
@@ -40,7 +40,7 @@ class kernels(Enum):
     "NaiveLargeChunk",
 ]
 
-globalToShmemAsyncCopy = """\
+global_to_shmem_async_copy = """\
 #line __LINE__
 #if __CUDA_ARCH__ >= 700
 #include <cuda/barrier>
@@ -709,7 +709,7 @@ class kernels(Enum):
 """
 
 
-def ConstantInit(data, size, val):
+def constant_init(data, size, val):
     p_data = (ctypes.c_float * size).from_address(data)
     for i in range(size):
         p_data[i] = val
@@ -718,78 +718,82 @@ def ConstantInit(data, size, val):
 #
 # Run matrix multiplication using CUDA
 #
-def MatrixMultiply(dimsA, dimsB, kernel_number):
+def matrix_multiply(dims_a, dims_b, kernel_number):
     # Allocate host memory for matricies A and B
-    size_A = dimsA.x * dimsA.y
-    mem_size_A = np.dtype(np.float32).itemsize * size_A
-    h_A = checkCudaErrors(cudart.cudaMallocHost(mem_size_A))
-    size_B = dimsB.x * dimsB.y
-    mem_size_B = np.dtype(np.float32).itemsize * size_B
-    h_B = checkCudaErrors(cudart.cudaMallocHost(mem_size_B))
+    size_a = dims_a.x * dims_a.y
+    mem_size_a = np.dtype(np.float32).itemsize * size_a
+    h_a = check_cuda_errors(cudart.cudaMallocHost(mem_size_a))
+    size_b = dims_b.x * dims_b.y
+    mem_size_b = np.dtype(np.float32).itemsize * size_b
+    h_b = check_cuda_errors(cudart.cudaMallocHost(mem_size_b))
 
     # Initialize host memory
-    valB = 2.10
-    ConstantInit(h_A, size_A, 1.0)
-    ConstantInit(h_B, size_B, valB)
+    val_b = 2.10
+    constant_init(h_a, size_a, 1.0)
+    constant_init(h_b, size_b, val_b)
 
     # Allocate Device Memory
 
     # Allocate host matrix C
-    dimsC = cudart.dim3()
-    dimsC.x = dimsB.x
-    dimsC.y = dimsA.y
-    dimsC.z = 1
-    mem_size_C = dimsC.x * dimsC.y * np.dtype(np.float32).itemsize
-    h_C = checkCudaErrors(cudart.cudaMallocHost(mem_size_C))
-
-    if h_C == 0:
+    dims_c = cudart.dim3()
+    dims_c.x = dims_b.x
+    dims_c.y = dims_a.y
+    dims_c.z = 1
+    mem_size_c = dims_c.x * dims_c.y * np.dtype(np.float32).itemsize
+    h_c = check_cuda_errors(cudart.cudaMallocHost(mem_size_c))
+
+    if h_c == 0:
         print("Failed to allocate host matrix C!", file=sys.stderr)
         sys.exit(1)
 
-    d_A = checkCudaErrors(cudart.cudaMalloc(mem_size_A))
-    d_B = checkCudaErrors(cudart.cudaMalloc(mem_size_B))
-    d_C = checkCudaErrors(cudart.cudaMalloc(mem_size_C))
+    d_a = check_cuda_errors(cudart.cudaMalloc(mem_size_a))
+    d_b = check_cuda_errors(cudart.cudaMalloc(mem_size_b))
+    d_c = check_cuda_errors(cudart.cudaMalloc(mem_size_c))
     # Allocate CUDA events that we'll use for timing
-    start = checkCudaErrors(cudart.cudaEventCreate())
-    stop = checkCudaErrors(cudart.cudaEventCreate())
+    start = check_cuda_errors(cudart.cudaEventCreate())
+    stop = check_cuda_errors(cudart.cudaEventCreate())
 
-    stream = checkCudaErrors(cudart.cudaStreamCreateWithFlags(cudart.cudaStreamNonBlocking))
+    stream = check_cuda_errors(cudart.cudaStreamCreateWithFlags(cudart.cudaStreamNonBlocking))
 
     # Copy host memory to device
-    checkCudaErrors(cudart.cudaMemcpyAsync(d_A, h_A, mem_size_A, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice, stream))
-    checkCudaErrors(cudart.cudaMemcpyAsync(d_B, h_B, mem_size_B, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice, stream))
-    checkCudaErrors(cudart.cudaMemsetAsync(d_C, 0, mem_size_C, stream))
+    check_cuda_errors(
+        cudart.cudaMemcpyAsync(d_a, h_a, mem_size_a, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice, stream)
+    )
+    check_cuda_errors(
+        cudart.cudaMemcpyAsync(d_b, h_b, mem_size_b, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice, stream)
+    )
+    check_cuda_errors(cudart.cudaMemsetAsync(d_c, 0, mem_size_c, stream))
 
     # Setup execution parameters
     threads = cudart.dim3()
-    threads.x = threads.y = blockSize
+    threads.x = threads.y = block_size
     threads.z = 1
     grid = cudart.dim3()
-    grid.x = dimsB.x / threads.x
-    grid.y = dimsA.y / threads.y
+    grid.x = dims_b.x / threads.x
+    grid.y = dims_a.y / threads.y
     grid.z = 1
 
     # Here the block size is 16x18, where first 16 rows are consumer thread group
     # and last 2 rows (1 warp) is producer thread group
-    threadsSharedStateKernel = cudart.dim3()
-    threadsSharedStateKernel.x = blockSize
-    threadsSharedStateKernel.y = blockSize + 2
-    threadsSharedStateKernel.z = 1
-    gridSharedStateKernel = cudart.dim3()
-    gridSharedStateKernel.x = dimsB.x / threadsSharedStateKernel.x
-    gridSharedStateKernel.y = dimsA.y / threadsSharedStateKernel.x
-
-    print(f"Running kernel = {kernel_number} - {kernelNames[kernel_number.value]}")
+    threads_shared_state_kernel = cudart.dim3()
+    threads_shared_state_kernel.x = block_size
+    threads_shared_state_kernel.y = block_size + 2
+    threads_shared_state_kernel.z = 1
+    grid_shared_state_kernel = cudart.dim3()
+    grid_shared_state_kernel.x = dims_b.x / threads_shared_state_kernel.x
+    grid_shared_state_kernel.y = dims_a.y / threads_shared_state_kernel.x
+
+    print(f"Running kernel = {kernel_number} - {kernel_names[kernel_number.value]}")
     # Create and start timer
     print("Computing result using CUDA Kernel...")
 
     # Performs warmup operation using matrixMul CUDA kernel
-    kernelArguments = (
-        (d_C, d_A, d_B, dimsA.x, dimsB.x),
+    kernel_arguments = (
+        (d_c, d_a, d_b, dims_a.x, dims_b.x),
         (ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_int),
     )
-    if kernel_number == kernels.AsyncCopyMultiStageLargeChunk:
-        checkCudaErrors(
+    if kernel_number == Kernels.AsyncCopyMultiStageLargeChunk:
+        check_cuda_errors(
             cuda.cuLaunchKernel(
                 _MatrixMulAsyncCopyMultiStageLargeChunk,
                 grid.x,
@@ -800,12 +804,12 @@ def MatrixMultiply(dimsA, dimsB, kernel_number):
                 threads.z,  # block dim
                 0,  # shared mem
                 stream,  # stream
-                kernelArguments,
+                kernel_arguments,
                 0,
             )
         )  # arguments
-    elif kernel_number == kernels.AsyncCopyLargeChunk:
-        checkCudaErrors(
+    elif kernel_number == Kernels.AsyncCopyLargeChunk:
+        check_cuda_errors(
             cuda.cuLaunchKernel(
                 _MatrixMulAsyncCopyLargeChunk,
                 grid.x,
@@ -816,12 +820,12 @@ def MatrixMultiply(dimsA, dimsB, kernel_number):
                 threads.z,  # block dim
                 0,  # shared mem
                 stream,  # stream
-                kernelArguments,
+                kernel_arguments,
                 0,
             )
         )  # arguments
-    elif kernel_number == kernels.AsyncCopyLargeChunkAWBarrier:
-        checkCudaErrors(
+    elif kernel_number == Kernels.AsyncCopyLargeChunkAWBarrier:
+        check_cuda_errors(
             cuda.cuLaunchKernel(
                 _MatrixMulAsyncCopyLargeChunkAWBarrier,
                 grid.x,
@@ -832,28 +836,28 @@ def MatrixMultiply(dimsA, dimsB, kernel_number):
                 threads.z,  # block dim
                 0,  # shared mem
                 stream,  # stream
-                kernelArguments,
+                kernel_arguments,
                 0,
             )
         )  # arguments
-    elif kernel_number == kernels.AsyncCopyMultiStageSharedState:
-        checkCudaErrors(
+    elif kernel_number == Kernels.AsyncCopyMultiStageSharedState:
+        check_cuda_errors(
             cuda.cuLaunchKernel(
                 _MatrixMulAsyncCopyMultiStageSharedState,
-                gridSharedStateKernel.x,
-                gridSharedStateKernel.y,
-                gridSharedStateKernel.z,  # grid dim
-                threadsSharedStateKernel.x,
-                threadsSharedStateKernel.y,
-                threadsSharedStateKernel.z,  # block dim
+                grid_shared_state_kernel.x,
+                grid_shared_state_kernel.y,
+                grid_shared_state_kernel.z,  # grid dim
+                threads_shared_state_kernel.x,
+                threads_shared_state_kernel.y,
+                threads_shared_state_kernel.z,  # block dim
                 0,  # shared mem
                 stream,  # stream
-                kernelArguments,
+                kernel_arguments,
                 0,
             )
         )  # arguments
-    elif kernel_number == kernels.AsyncCopyMultiStage:
-        checkCudaErrors(
+    elif kernel_number == Kernels.AsyncCopyMultiStage:
+        check_cuda_errors(
             cuda.cuLaunchKernel(
                 _MatrixMulAsyncCopyMultiStage,
                 grid.x,
@@ -864,12 +868,12 @@ def MatrixMultiply(dimsA, dimsB, kernel_number):
                 threads.z,  # block dim
                 0,  # shared mem
                 stream,  # stream
-                kernelArguments,
+                kernel_arguments,
                 0,
             )
         )  # arguments
-    elif kernel_number == kernels.AsyncCopySingleStage:
-        checkCudaErrors(
+    elif kernel_number == Kernels.AsyncCopySingleStage:
+        check_cuda_errors(
             cuda.cuLaunchKernel(
                 _MatrixMulAsyncCopySingleStage,
                 grid.x,
@@ -880,12 +884,12 @@ def MatrixMultiply(dimsA, dimsB, kernel_number):
                 threads.z,  # block dim
                 0,  # shared mem
                 stream,  # stream
-                kernelArguments,
+                kernel_arguments,
                 0,
             )
         )  # arguments
-    elif kernel_number == kernels.Naive:
-        checkCudaErrors(
+    elif kernel_number == Kernels.Naive:
+        check_cuda_errors(
             cuda.cuLaunchKernel(
                 _MatrixMulNaive,
                 grid.x,
@@ -896,12 +900,12 @@ def MatrixMultiply(dimsA, dimsB, kernel_number):
                 threads.z,  # block dim
                 0,  # shared mem
                 stream,  # stream
-                kernelArguments,
+                kernel_arguments,
                 0,
             )
         )  # arguments
-    elif kernel_number == kernels.NaiveLargeChunk:
-        checkCudaErrors(
+    elif kernel_number == Kernels.NaiveLargeChunk:
+        check_cuda_errors(
             cuda.cuLaunchKernel(
                 _MatrixMulNaiveLargeChunk,
                 grid.x,
@@ -912,21 +916,21 @@ def MatrixMultiply(dimsA, dimsB, kernel_number):
                 threads.z,  # block dim
                 0,  # shared mem
                 stream,  # stream
-                kernelArguments,
+                kernel_arguments,
                 0,
             )
         )  # arguments
 
-    checkCudaErrors(cudart.cudaStreamSynchronize(stream))
+    check_cuda_errors(cudart.cudaStreamSynchronize(stream))
 
     # Execute the kernel
-    nIter = 100
+    n_iter = 100
 
     # Record the start event
-    checkCudaErrors(cudart.cudaEventRecord(start, stream))
+    check_cuda_errors(cudart.cudaEventRecord(start, stream))
 
-    if kernel_number == kernels.AsyncCopyMultiStageLargeChunk:
-        checkCudaErrors(
+    if kernel_number == Kernels.AsyncCopyMultiStageLargeChunk:
+        check_cuda_errors(
             cuda.cuLaunchKernel(
                 _MatrixMulAsyncCopyMultiStageLargeChunk,
                 grid.x,
@@ -937,12 +941,12 @@ def MatrixMultiply(dimsA, dimsB, kernel_number):
                 threads.z,  # block dim
                 0,  # shared mem
                 stream,  # stream
-                kernelArguments,
+                kernel_arguments,
                 0,
             )
         )  # arguments
-    elif kernel_number == kernels.AsyncCopyLargeChunk:
-        checkCudaErrors(
+    elif kernel_number == Kernels.AsyncCopyLargeChunk:
+        check_cuda_errors(
             cuda.cuLaunchKernel(
                 _MatrixMulAsyncCopyLargeChunk,
                 grid.x,
@@ -953,12 +957,12 @@ def MatrixMultiply(dimsA, dimsB, kernel_number):
                 threads.z,  # block dim
                 0,  # shared mem
                 stream,  # stream
-                kernelArguments,
+                kernel_arguments,
                 0,
             )
         )  # arguments
-    elif kernel_number == kernels.AsyncCopyLargeChunkAWBarrier:
-        checkCudaErrors(
+    elif kernel_number == Kernels.AsyncCopyLargeChunkAWBarrier:
+        check_cuda_errors(
             cuda.cuLaunchKernel(
                 _MatrixMulAsyncCopyLargeChunkAWBarrier,
                 grid.x,
@@ -969,28 +973,28 @@ def MatrixMultiply(dimsA, dimsB, kernel_number):
                 threads.z,  # block dim
                 0,  # shared mem
                 stream,  # stream
-                kernelArguments,
+                kernel_arguments,
                 0,
             )
         )  # arguments
-    elif kernel_number == kernels.AsyncCopyMultiStageSharedState:
-        checkCudaErrors(
+    elif kernel_number == Kernels.AsyncCopyMultiStageSharedState:
+        check_cuda_errors(
             cuda.cuLaunchKernel(
                 _MatrixMulAsyncCopyMultiStageSharedState,
-                gridSharedStateKernel.x,
-                gridSharedStateKernel.y,
-                gridSharedStateKernel.z,  # grid dim
-                threadsSharedStateKernel.x,
-                threadsSharedStateKernel.y,
-                threadsSharedStateKernel.z,  # block dim
+                grid_shared_state_kernel.x,
+                grid_shared_state_kernel.y,
+                grid_shared_state_kernel.z,  # grid dim
+                threads_shared_state_kernel.x,
+                threads_shared_state_kernel.y,
+                threads_shared_state_kernel.z,  # block dim
                 0,  # shared mem
                 stream,  # stream
-                kernelArguments,
+                kernel_arguments,
                 0,
             )
         )  # arguments
-    elif kernel_number == kernels.AsyncCopyMultiStage:
-        checkCudaErrors(
+    elif kernel_number == Kernels.AsyncCopyMultiStage:
+        check_cuda_errors(
             cuda.cuLaunchKernel(
                 _MatrixMulAsyncCopyMultiStage,
                 grid.x,
@@ -1001,12 +1005,12 @@ def MatrixMultiply(dimsA, dimsB, kernel_number):
                 threads.z,  # block dim
                 0,  # shared mem
                 stream,  # stream
-                kernelArguments,
+                kernel_arguments,
                 0,
             )
         )  # arguments
-    elif kernel_number == kernels.AsyncCopySingleStage:
-        checkCudaErrors(
+    elif kernel_number == Kernels.AsyncCopySingleStage:
+        check_cuda_errors(
             cuda.cuLaunchKernel(
                 _MatrixMulAsyncCopySingleStage,
                 grid.x,
@@ -1017,12 +1021,12 @@ def MatrixMultiply(dimsA, dimsB, kernel_number):
                 threads.z,  # block dim
                 0,  # shared mem
                 stream,  # stream
-                kernelArguments,
+                kernel_arguments,
                 0,
             )
         )  # arguments
-    elif kernel_number == kernels.Naive:
-        checkCudaErrors(
+    elif kernel_number == Kernels.Naive:
+        check_cuda_errors(
             cuda.cuLaunchKernel(
                 _MatrixMulNaive,
                 grid.x,
@@ -1033,12 +1037,12 @@ def MatrixMultiply(dimsA, dimsB, kernel_number):
                 threads.z,  # block dim
                 0,  # shared mem
                 stream,  # stream
-                kernelArguments,
+                kernel_arguments,
                 0,
             )
         )  # arguments
-    elif kernel_number == kernels.NaiveLargeChunk:
-        checkCudaErrors(
+    elif kernel_number == Kernels.NaiveLargeChunk:
+        check_cuda_errors(
             cuda.cuLaunchKernel(
                 _MatrixMulNaiveLargeChunk,
                 grid.x,
@@ -1049,31 +1053,33 @@ def MatrixMultiply(dimsA, dimsB, kernel_number):
                 threads.z,  # block dim
                 0,  # shared mem
                 stream,  # stream
-                kernelArguments,
+                kernel_arguments,
                 0,
             )
         )  # arguments
 
     # Record the stop event
-    checkCudaErrors(cudart.cudaEventRecord(stop, stream))
+    check_cuda_errors(cudart.cudaEventRecord(stop, stream))
 
     # Wait for the stop event to complete
-    checkCudaErrors(cudart.cudaEventSynchronize(stop))
+    check_cuda_errors(cudart.cudaEventSynchronize(stop))
 
-    msecTotal = checkCudaErrors(cudart.cudaEventElapsedTime(start, stop))
+    msec_total = check_cuda_errors(cudart.cudaEventElapsedTime(start, stop))
 
     # Compute and print the performance
-    msecPerMatrixMul = msecTotal / nIter
-    flopsPerMatrixMul = 2.0 * dimsA.x * dimsA.y * dimsB.x
-    gigaFlops = (flopsPerMatrixMul * 1.0e-9) / (msecPerMatrixMul / 1000.0)
+    msec_per_matrix_mul = msec_total / n_iter
+    flops_per_matrix_mul = 2.0 * dims_a.x * dims_a.y * dims_b.x
+    giga_flops = (flops_per_matrix_mul * 1.0e-9) / (msec_per_matrix_mul / 1000.0)
 
     print(
-        f"Performance= {gigaFlops:.2f} GFlop/s, Time= {msecPerMatrixMul:.2f} msec, Size= {flopsPerMatrixMul:.0f} Ops, WorkgroupSize= {threads.x * threads.y} threads/block"
+        f"Performance= {giga_flops:.2f} GFlop/s, Time= {msec_per_matrix_mul:.2f} msec, Size= {flops_per_matrix_mul:.0f} Ops, WorkgroupSize= {threads.x * threads.y} threads/block"
     )
 
     # Copy result from device to host
-    checkCudaErrors(cudart.cudaMemcpyAsync(h_C, d_C, mem_size_C, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost, stream))
-    checkCudaErrors(cudart.cudaStreamSynchronize(stream))
+    check_cuda_errors(
+        cudart.cudaMemcpyAsync(h_c, d_c, mem_size_c, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost, stream)
+    )
+    check_cuda_errors(cudart.cudaStreamSynchronize(stream))
 
     correct = True
 
@@ -1081,16 +1087,16 @@ def MatrixMultiply(dimsA, dimsB, kernel_number):
     # |<x, y>_cpu - <x,y>_gpu|/<|x|, |y|>  < eps
     eps = 1.0e-6
 
-    h_C_local = (ctypes.c_float * (dimsC.x * dimsC.y)).from_address(h_C)
-    for i in range(dimsC.x * dimsC.y):
-        abs_err = math.fabs(h_C_local[i] - (dimsA.x * valB))
-        dot_length = dimsA.x
-        abs_val = math.fabs(h_C_local[i])
+    h_c_local = (ctypes.c_float * (dims_c.x * dims_c.y)).from_address(h_c)
+    for i in range(dims_c.x * dims_c.y):
+        abs_err = math.fabs(h_c_local[i] - (dims_a.x * val_b))
+        dot_length = dims_a.x
+        abs_val = math.fabs(h_c_local[i])
         rel_err = abs_err / abs_val / dot_length
 
         if rel_err > eps:
             print(
-                f"Error! Matrix[{i:.5f}]={h_C_local[i]:.8f} ref={dimsA.x * valB:.8f} err term is > {rel_err}",
+                f"Error! Matrix[{i:.5f}]={h_c_local[i]:.8f} ref={dims_a.x * val_b:.8f} err term is > {rel_err}",
                 file=sys.stderr,
             )
             correct = False
@@ -1099,14 +1105,14 @@ def MatrixMultiply(dimsA, dimsB, kernel_number):
         print("Result = FAIL", file=sys.stderr)
 
     # Clean up memory
-    checkCudaErrors(cudart.cudaFreeHost(h_A))
-    checkCudaErrors(cudart.cudaFreeHost(h_B))
-    checkCudaErrors(cudart.cudaFreeHost(h_C))
-    checkCudaErrors(cudart.cudaFree(d_A))
-    checkCudaErrors(cudart.cudaFree(d_B))
-    checkCudaErrors(cudart.cudaFree(d_C))
-    checkCudaErrors(cudart.cudaEventDestroy(start))
-    checkCudaErrors(cudart.cudaEventDestroy(stop))
+    check_cuda_errors(cudart.cudaFreeHost(h_a))
+    check_cuda_errors(cudart.cudaFreeHost(h_b))
+    check_cuda_errors(cudart.cudaFreeHost(h_c))
+    check_cuda_errors(cudart.cudaFree(d_a))
+    check_cuda_errors(cudart.cudaFree(d_b))
+    check_cuda_errors(cudart.cudaFree(d_c))
+    check_cuda_errors(cudart.cudaEventDestroy(start))
+    check_cuda_errors(cudart.cudaEventDestroy(stop))
     print(
         "\nNOTE: The CUDA Samples are not meant for performance "
         "measurements. Results may vary when GPU Boost is enabled."
@@ -1119,16 +1125,16 @@ def MatrixMultiply(dimsA, dimsB, kernel_number):
 def main():
     import pytest
 
-    common.pytest_skipif_compute_capability_too_low(findCudaDevice(), (7, 0))
+    common.pytest_skipif_compute_capability_too_low(find_cuda_device(), (7, 0))
 
     if platform.machine() == "qnx":
         pytest.skip("globalToShmemAsyncCopy is not supported on QNX")
 
-    version = checkCudaErrors(cuda.cuDriverGetVersion())
+    version = check_cuda_errors(cuda.cuDriverGetVersion())
     if version < 11010:
         pytest.skip("CUDA Toolkit 11.1 or greater is required")
 
-    if checkCmdLineFlag("help") or checkCmdLineFlag("?"):
+    if check_cmd_line_flag("help") or check_cmd_line_flag("?"):
         print("Usage device=n (n >= 0 for deviceID)", file=sys.stderr)
         print("      wA=WidthA hA=HeightA (Width x Height of Matrix A)", file=sys.stderr)
         print("      wB=WidthB hB=HeightB (Width x Height of Matrix B)", file=sys.stderr)
@@ -1149,54 +1155,54 @@ def main():
 
     # This will pick the best possible CUDA capable device, otherwise
     # override the device ID based on input provided at the command line
-    devID = findCudaDevice()
+    dev_id = find_cuda_device()
 
-    matrixBlock = 32
-    dimsA = cudart.dim3()
-    dimsA.x = dimsA.y = 10 * 4 * matrixBlock
-    dimsA.z = 1
-    dimsB = cudart.dim3()
-    dimsB.x = dimsB.y = 10 * 4 * matrixBlock
-    dimsB.z = 1
+    matrix_block = 32
+    dims_a = cudart.dim3()
+    dims_a.x = dims_a.y = 10 * 4 * matrix_block
+    dims_a.z = 1
+    dims_b = cudart.dim3()
+    dims_b.x = dims_b.y = 10 * 4 * matrix_block
+    dims_b.z = 1
 
     # width of Matrix A
-    if checkCmdLineFlag("wA="):
-        dimsA.x = int(getCmdLineArgumentInt("wA="))
+    if check_cmd_line_flag("wA="):
+        dims_a.x = int(get_cmd_line_argument_int("wA="))
 
     # height of Matrix A
-    if checkCmdLineFlag("hA="):
-        dimsA.y = int(getCmdLineArgumentInt("hA="))
+    if check_cmd_line_flag("hA="):
+        dims_a.y = int(get_cmd_line_argument_int("hA="))
 
     # width of Matrix B
-    if checkCmdLineFlag("wB="):
-        dimsB.x = int(getCmdLineArgumentInt("wB="))
+    if check_cmd_line_flag("wB="):
+        dims_b.x = int(get_cmd_line_argument_int("wB="))
 
     # height of Matrix B
-    if checkCmdLineFlag("hB="):
-        dimsB.y = int(getCmdLineArgumentInt("hB="))
+    if check_cmd_line_flag("hB="):
+        dims_b.y = int(get_cmd_line_argument_int("hB="))
 
-    if dimsA.x != dimsB.y:
-        print(f"Error: outer matrix dimensions must be equal. ({dimsA.x} != {dimsB.y})", file=sys.stderr)
+    if dims_a.x != dims_b.y:
+        print(f"Error: outer matrix dimensions must be equal. ({dims_a.x} != {dims_b.y})", file=sys.stderr)
         sys.exit(1)
 
-    selected_kernel = kernels.AsyncCopyMultiStageLargeChunk
+    selected_kernel = Kernels.AsyncCopyMultiStageLargeChunk
 
     # kernel to run - default (AsyncCopyMultiStageLargeChunk == 0)
-    if checkCmdLineFlag("kernel="):
-        kernel_number = int(getCmdLineArgumentInt("kernel="))
+    if check_cmd_line_flag("kernel="):
+        kernel_number = int(get_cmd_line_argument_int("kernel="))
         if kernel_number < 8:
-            selected_kernel = kernels(kernel_number)
+            selected_kernel = Kernels(kernel_number)
         else:
             print("Error: kernel number should be between 0 to 7", file=sys.stderr)
             sys.exit(1)
 
-    major = checkCudaErrors(
-        cudart.cudaDeviceGetAttribute(cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMajor, devID)
+    major = check_cuda_errors(
+        cudart.cudaDeviceGetAttribute(cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMajor, dev_id)
     )
     if major < 7:
         pytest.skip("globalToShmemAsyncCopy requires SM 7.0 or higher.")
 
-    print(f"MatrixA({dimsA.x},{dimsA.y}), MatrixB({dimsB.x},{dimsB.y})")
+    print(f"MatrixA({dims_a.x},{dims_a.y}), MatrixB({dims_b.x},{dims_b.y})")
 
     global _MatrixMulAsyncCopyMultiStageLargeChunk
     global _MatrixMulAsyncCopyLargeChunk
@@ -1206,17 +1212,17 @@ def main():
     global _MatrixMulAsyncCopySingleStage
     global _MatrixMulNaive
     global _MatrixMulNaiveLargeChunk
-    with common.KernelHelper(globalToShmemAsyncCopy, devID) as kernelHelper:
-        _MatrixMulAsyncCopyMultiStageLargeChunk = kernelHelper.getFunction(b"MatrixMulAsyncCopyMultiStageLargeChunk")
-        _MatrixMulAsyncCopyLargeChunk = kernelHelper.getFunction(b"MatrixMulAsyncCopyLargeChunk")
-        _MatrixMulAsyncCopyLargeChunkAWBarrier = kernelHelper.getFunction(b"MatrixMulAsyncCopyLargeChunkAWBarrier")
-        _MatrixMulAsyncCopyMultiStageSharedState = kernelHelper.getFunction(b"MatrixMulAsyncCopyMultiStageSharedState")
-        _MatrixMulAsyncCopyMultiStage = kernelHelper.getFunction(b"MatrixMulAsyncCopyMultiStage")
-        _MatrixMulAsyncCopySingleStage = kernelHelper.getFunction(b"MatrixMulAsyncCopySingleStage")
-        _MatrixMulNaive = kernelHelper.getFunction(b"MatrixMulNaive")
-        _MatrixMulNaiveLargeChunk = kernelHelper.getFunction(b"MatrixMulNaiveLargeChunk")
-
-        matrix_result = MatrixMultiply(dimsA, dimsB, selected_kernel)
+    kernel_helper = common.KernelHelper(global_to_shmem_async_copy, dev_id)
+    _MatrixMulAsyncCopyMultiStageLargeChunk = kernel_helper.get_function(b"MatrixMulAsyncCopyMultiStageLargeChunk")
+    _MatrixMulAsyncCopyLargeChunk = kernel_helper.get_function(b"MatrixMulAsyncCopyLargeChunk")
+    _MatrixMulAsyncCopyLargeChunkAWBarrier = kernel_helper.get_function(b"MatrixMulAsyncCopyLargeChunkAWBarrier")
+    _MatrixMulAsyncCopyMultiStageSharedState = kernel_helper.get_function(b"MatrixMulAsyncCopyMultiStageSharedState")
+    _MatrixMulAsyncCopyMultiStage = kernel_helper.get_function(b"MatrixMulAsyncCopyMultiStage")
+    _MatrixMulAsyncCopySingleStage = kernel_helper.get_function(b"MatrixMulAsyncCopySingleStage")
+    _MatrixMulNaive = kernel_helper.get_function(b"MatrixMulNaive")
+    _MatrixMulNaiveLargeChunk = kernel_helper.get_function(b"MatrixMulNaiveLargeChunk")
+
+    matrix_result = matrix_multiply(dims_a, dims_b, selected_kernel)
 
     if matrix_result != 0:
         sys.exit(1)
diff --git a/cuda_bindings/examples/3_CUDA_Features/simpleCudaGraphs_test.py b/cuda_bindings/examples/3_CUDA_Features/simpleCudaGraphs_test.py
index 7746bd08e3..b08da3edc0 100644
--- a/cuda_bindings/examples/3_CUDA_Features/simpleCudaGraphs_test.py
+++ b/cuda_bindings/examples/3_CUDA_Features/simpleCudaGraphs_test.py
@@ -6,7 +6,7 @@
 
 import numpy as np
 from common import common
-from common.helper_cuda import checkCudaErrors, findCudaDevice
+from common.helper_cuda import check_cuda_errors, find_cuda_device
 
 from cuda.bindings import driver as cuda
 from cuda.bindings import runtime as cudart
@@ -14,7 +14,7 @@
 THREADS_PER_BLOCK = 512
 GRAPH_LAUNCH_ITERATIONS = 3
 
-simpleCudaGraphs = """\
+simple_cuda_graphs = """\
 #include <cooperative_groups.h>
 #include <cuda_runtime.h>
 
@@ -121,185 +121,185 @@ def init_input(a, size):
         a_list[i] = rnd.random()
 
 
-def cudaGraphsManual(inputVec_h, inputVec_d, outputVec_d, result_d, inputSize, numOfBlocks):
+def cuda_graphs_manual(input_vec_h, input_vec_d, output_vec_d, result_d, input_size, num_of_blocks):
     result_h = ctypes.c_double(0.0)
-    nodeDependencies = []
+    node_dependencies = []
 
-    streamForGraph = checkCudaErrors(cudart.cudaStreamCreate())
+    stream_for_graph = check_cuda_errors(cudart.cudaStreamCreate())
 
-    kernelNodeParams = cuda.CUDA_KERNEL_NODE_PARAMS()
-    memcpyParams = cudart.cudaMemcpy3DParms()
-    memsetParams = cudart.cudaMemsetParams()
+    kernel_node_params = cuda.CUDA_KERNEL_NODE_PARAMS()
+    memcpy_params = cudart.cudaMemcpy3DParms()
+    memset_params = cudart.cudaMemsetParams()
 
-    memcpyParams.srcArray = None
-    memcpyParams.srcPos = cudart.make_cudaPos(0, 0, 0)
-    memcpyParams.srcPtr = cudart.make_cudaPitchedPtr(
-        inputVec_h, np.dtype(np.float32).itemsize * inputSize, inputSize, 1
+    memcpy_params.srcArray = None
+    memcpy_params.srcPos = cudart.make_cudaPos(0, 0, 0)
+    memcpy_params.srcPtr = cudart.make_cudaPitchedPtr(
+        input_vec_h, np.dtype(np.float32).itemsize * input_size, input_size, 1
     )
-    memcpyParams.dstArray = None
-    memcpyParams.dstPos = cudart.make_cudaPos(0, 0, 0)
-    memcpyParams.dstPtr = cudart.make_cudaPitchedPtr(
-        inputVec_d, np.dtype(np.float32).itemsize * inputSize, inputSize, 1
+    memcpy_params.dstArray = None
+    memcpy_params.dstPos = cudart.make_cudaPos(0, 0, 0)
+    memcpy_params.dstPtr = cudart.make_cudaPitchedPtr(
+        input_vec_d, np.dtype(np.float32).itemsize * input_size, input_size, 1
     )
-    memcpyParams.extent = cudart.make_cudaExtent(np.dtype(np.float32).itemsize * inputSize, 1, 1)
-    memcpyParams.kind = cudart.cudaMemcpyKind.cudaMemcpyHostToDevice
+    memcpy_params.extent = cudart.make_cudaExtent(np.dtype(np.float32).itemsize * input_size, 1, 1)
+    memcpy_params.kind = cudart.cudaMemcpyKind.cudaMemcpyHostToDevice
 
-    memsetParams.dst = outputVec_d
-    memsetParams.value = 0
-    memsetParams.pitch = 0
-    memsetParams.elementSize = np.dtype(np.float32).itemsize  # elementSize can be max 4 bytes
-    memsetParams.width = numOfBlocks * 2
-    memsetParams.height = 1
+    memset_params.dst = output_vec_d
+    memset_params.value = 0
+    memset_params.pitch = 0
+    memset_params.elementSize = np.dtype(np.float32).itemsize  # elementSize can be max 4 bytes
+    memset_params.width = num_of_blocks * 2
+    memset_params.height = 1
 
-    graph = checkCudaErrors(cudart.cudaGraphCreate(0))
+    graph = check_cuda_errors(cudart.cudaGraphCreate(0))
 
-    memcpyNode = checkCudaErrors(cudart.cudaGraphAddMemcpyNode(graph, None, 0, memcpyParams))
-    memsetNode = checkCudaErrors(cudart.cudaGraphAddMemsetNode(graph, None, 0, memsetParams))
+    memcpy_node = check_cuda_errors(cudart.cudaGraphAddMemcpyNode(graph, None, 0, memcpy_params))
+    memset_node = check_cuda_errors(cudart.cudaGraphAddMemsetNode(graph, None, 0, memset_params))
 
-    nodeDependencies.append(memsetNode)
-    nodeDependencies.append(memcpyNode)
+    node_dependencies.append(memset_node)
+    node_dependencies.append(memcpy_node)
 
-    kernelArgs = (
-        (inputVec_d, outputVec_d, inputSize, numOfBlocks),
+    kernel_args = (
+        (input_vec_d, output_vec_d, input_size, num_of_blocks),
         (ctypes.c_void_p, ctypes.c_void_p, ctypes.c_size_t, ctypes.c_uint),
     )
 
-    kernelNodeParams.func = _reduce
-    kernelNodeParams.gridDimX = numOfBlocks
-    kernelNodeParams.gridDimY = kernelNodeParams.gridDimZ = 1
-    kernelNodeParams.blockDimX = THREADS_PER_BLOCK
-    kernelNodeParams.blockDimY = kernelNodeParams.blockDimZ = 1
-    kernelNodeParams.sharedMemBytes = 0
-    kernelNodeParams.kernelParams = kernelArgs
+    kernel_node_params.func = _reduce
+    kernel_node_params.gridDimX = num_of_blocks
+    kernel_node_params.gridDimY = kernel_node_params.gridDimZ = 1
+    kernel_node_params.blockDimX = THREADS_PER_BLOCK
+    kernel_node_params.blockDimY = kernel_node_params.blockDimZ = 1
+    kernel_node_params.sharedMemBytes = 0
+    kernel_node_params.kernelParams = kernel_args
     # kernelNodeParams.extra = None
 
-    kernelNode = checkCudaErrors(
-        cuda.cuGraphAddKernelNode(graph, nodeDependencies, len(nodeDependencies), kernelNodeParams)
+    kernel_node = check_cuda_errors(
+        cuda.cuGraphAddKernelNode(graph, node_dependencies, len(node_dependencies), kernel_node_params)
     )
 
-    nodeDependencies.clear()
-    nodeDependencies.append(kernelNode)
-
-    memsetParams = cudart.cudaMemsetParams()
-    memsetParams.dst = result_d
-    memsetParams.value = 0
-    memsetParams.elementSize = np.dtype(np.float32).itemsize
-    memsetParams.width = 2
-    memsetParams.height = 1
-    memsetNode = checkCudaErrors(cudart.cudaGraphAddMemsetNode(graph, None, 0, memsetParams))
-
-    nodeDependencies.append(memsetNode)
-
-    kernelNodeParams = cuda.CUDA_KERNEL_NODE_PARAMS()
-    kernelNodeParams.func = _reduceFinal
-    kernelNodeParams.gridDimX = kernelNodeParams.gridDimY = kernelNodeParams.gridDimZ = 1
-    kernelNodeParams.blockDimX = THREADS_PER_BLOCK
-    kernelNodeParams.blockDimY = kernelNodeParams.blockDimZ = 1
-    kernelNodeParams.sharedMemBytes = 0
-    kernelArgs2 = (
-        (outputVec_d, result_d, numOfBlocks),
+    node_dependencies.clear()
+    node_dependencies.append(kernel_node)
+
+    memset_params = cudart.cudaMemsetParams()
+    memset_params.dst = result_d
+    memset_params.value = 0
+    memset_params.elementSize = np.dtype(np.float32).itemsize
+    memset_params.width = 2
+    memset_params.height = 1
+    memset_node = check_cuda_errors(cudart.cudaGraphAddMemsetNode(graph, None, 0, memset_params))
+
+    node_dependencies.append(memset_node)
+
+    kernel_node_params = cuda.CUDA_KERNEL_NODE_PARAMS()
+    kernel_node_params.func = _reduceFinal
+    kernel_node_params.gridDimX = kernel_node_params.gridDimY = kernel_node_params.gridDimZ = 1
+    kernel_node_params.blockDimX = THREADS_PER_BLOCK
+    kernel_node_params.blockDimY = kernel_node_params.blockDimZ = 1
+    kernel_node_params.sharedMemBytes = 0
+    kernel_args2 = (
+        (output_vec_d, result_d, num_of_blocks),
         (ctypes.c_void_p, ctypes.c_void_p, ctypes.c_uint),
     )
-    kernelNodeParams.kernelParams = kernelArgs2
+    kernel_node_params.kernelParams = kernel_args2
     # kernelNodeParams.extra = None
 
-    kernelNode = checkCudaErrors(
-        cuda.cuGraphAddKernelNode(graph, nodeDependencies, len(nodeDependencies), kernelNodeParams)
+    kernel_node = check_cuda_errors(
+        cuda.cuGraphAddKernelNode(graph, node_dependencies, len(node_dependencies), kernel_node_params)
     )
 
-    nodeDependencies.clear()
-    nodeDependencies.append(kernelNode)
-
-    memcpyParams = cudart.cudaMemcpy3DParms()
-
-    memcpyParams.srcArray = None
-    memcpyParams.srcPos = cudart.make_cudaPos(0, 0, 0)
-    memcpyParams.srcPtr = cudart.make_cudaPitchedPtr(result_d, np.dtype(np.float64).itemsize, 1, 1)
-    memcpyParams.dstArray = None
-    memcpyParams.dstPos = cudart.make_cudaPos(0, 0, 0)
-    memcpyParams.dstPtr = cudart.make_cudaPitchedPtr(result_h, np.dtype(np.float64).itemsize, 1, 1)
-    memcpyParams.extent = cudart.make_cudaExtent(np.dtype(np.float64).itemsize, 1, 1)
-    memcpyParams.kind = cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost
-    memcpyNode = checkCudaErrors(
-        cudart.cudaGraphAddMemcpyNode(graph, nodeDependencies, len(nodeDependencies), memcpyParams)
+    node_dependencies.clear()
+    node_dependencies.append(kernel_node)
+
+    memcpy_params = cudart.cudaMemcpy3DParms()
+
+    memcpy_params.srcArray = None
+    memcpy_params.srcPos = cudart.make_cudaPos(0, 0, 0)
+    memcpy_params.srcPtr = cudart.make_cudaPitchedPtr(result_d, np.dtype(np.float64).itemsize, 1, 1)
+    memcpy_params.dstArray = None
+    memcpy_params.dstPos = cudart.make_cudaPos(0, 0, 0)
+    memcpy_params.dstPtr = cudart.make_cudaPitchedPtr(result_h, np.dtype(np.float64).itemsize, 1, 1)
+    memcpy_params.extent = cudart.make_cudaExtent(np.dtype(np.float64).itemsize, 1, 1)
+    memcpy_params.kind = cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost
+    memcpy_node = check_cuda_errors(
+        cudart.cudaGraphAddMemcpyNode(graph, node_dependencies, len(node_dependencies), memcpy_params)
     )
 
-    nodeDependencies.clear()
-    nodeDependencies.append(memcpyNode)
+    node_dependencies.clear()
+    node_dependencies.append(memcpy_node)
 
     # WIP: Host nodes
 
-    nodes, numNodes = checkCudaErrors(cudart.cudaGraphGetNodes(graph))
-    print(f"\nNum of nodes in the graph created manually = {numNodes}")
+    nodes, num_nodes = check_cuda_errors(cudart.cudaGraphGetNodes(graph))
+    print(f"\nNum of nodes in the graph created manually = {num_nodes}")
 
-    graphExec = checkCudaErrors(cudart.cudaGraphInstantiate(graph, 0))
+    graph_exec = check_cuda_errors(cudart.cudaGraphInstantiate(graph, 0))
 
-    clonedGraph = checkCudaErrors(cudart.cudaGraphClone(graph))
-    clonedGraphExec = checkCudaErrors(cudart.cudaGraphInstantiate(clonedGraph, 0))
+    cloned_graph = check_cuda_errors(cudart.cudaGraphClone(graph))
+    cloned_graph_exec = check_cuda_errors(cudart.cudaGraphInstantiate(cloned_graph, 0))
 
     for _i in range(GRAPH_LAUNCH_ITERATIONS):
-        checkCudaErrors(cudart.cudaGraphLaunch(graphExec, streamForGraph))
+        check_cuda_errors(cudart.cudaGraphLaunch(graph_exec, stream_for_graph))
 
-    checkCudaErrors(cudart.cudaStreamSynchronize(streamForGraph))
+    check_cuda_errors(cudart.cudaStreamSynchronize(stream_for_graph))
 
     print("Cloned Graph Output..")
     for _i in range(GRAPH_LAUNCH_ITERATIONS):
-        checkCudaErrors(cudart.cudaGraphLaunch(clonedGraphExec, streamForGraph))
+        check_cuda_errors(cudart.cudaGraphLaunch(cloned_graph_exec, stream_for_graph))
 
-    checkCudaErrors(cudart.cudaStreamSynchronize(streamForGraph))
+    check_cuda_errors(cudart.cudaStreamSynchronize(stream_for_graph))
 
-    checkCudaErrors(cudart.cudaGraphExecDestroy(graphExec))
-    checkCudaErrors(cudart.cudaGraphExecDestroy(clonedGraphExec))
-    checkCudaErrors(cudart.cudaGraphDestroy(graph))
-    checkCudaErrors(cudart.cudaGraphDestroy(clonedGraph))
-    checkCudaErrors(cudart.cudaStreamDestroy(streamForGraph))
+    check_cuda_errors(cudart.cudaGraphExecDestroy(graph_exec))
+    check_cuda_errors(cudart.cudaGraphExecDestroy(cloned_graph_exec))
+    check_cuda_errors(cudart.cudaGraphDestroy(graph))
+    check_cuda_errors(cudart.cudaGraphDestroy(cloned_graph))
+    check_cuda_errors(cudart.cudaStreamDestroy(stream_for_graph))
 
 
-def cudaGraphsUsingStreamCapture(inputVec_h, inputVec_d, outputVec_d, result_d, inputSize, numOfBlocks):
+def cuda_graphs_using_stream_capture(input_vec_h, input_vec_d, output_vec_d, result_d, input_size, num_of_blocks):
     result_h = ctypes.c_double(0.0)
 
-    stream1 = checkCudaErrors(cudart.cudaStreamCreate())
-    stream2 = checkCudaErrors(cudart.cudaStreamCreate())
-    stream3 = checkCudaErrors(cudart.cudaStreamCreate())
-    streamForGraph = checkCudaErrors(cudart.cudaStreamCreate())
+    stream1 = check_cuda_errors(cudart.cudaStreamCreate())
+    stream2 = check_cuda_errors(cudart.cudaStreamCreate())
+    stream3 = check_cuda_errors(cudart.cudaStreamCreate())
+    stream_for_graph = check_cuda_errors(cudart.cudaStreamCreate())
 
-    forkStreamEvent = checkCudaErrors(cudart.cudaEventCreate())
-    memsetEvent1 = checkCudaErrors(cudart.cudaEventCreate())
-    memsetEvent2 = checkCudaErrors(cudart.cudaEventCreate())
+    fork_stream_event = check_cuda_errors(cudart.cudaEventCreate())
+    memset_event1 = check_cuda_errors(cudart.cudaEventCreate())
+    memset_event2 = check_cuda_errors(cudart.cudaEventCreate())
 
-    checkCudaErrors(cudart.cudaStreamBeginCapture(stream1, cudart.cudaStreamCaptureMode.cudaStreamCaptureModeGlobal))
+    check_cuda_errors(cudart.cudaStreamBeginCapture(stream1, cudart.cudaStreamCaptureMode.cudaStreamCaptureModeGlobal))
 
-    checkCudaErrors(cudart.cudaEventRecord(forkStreamEvent, stream1))
-    checkCudaErrors(cudart.cudaStreamWaitEvent(stream2, forkStreamEvent, 0))
-    checkCudaErrors(cudart.cudaStreamWaitEvent(stream3, forkStreamEvent, 0))
+    check_cuda_errors(cudart.cudaEventRecord(fork_stream_event, stream1))
+    check_cuda_errors(cudart.cudaStreamWaitEvent(stream2, fork_stream_event, 0))
+    check_cuda_errors(cudart.cudaStreamWaitEvent(stream3, fork_stream_event, 0))
 
-    checkCudaErrors(
+    check_cuda_errors(
         cudart.cudaMemcpyAsync(
-            inputVec_d,
-            inputVec_h,
-            np.dtype(np.float32).itemsize * inputSize,
+            input_vec_d,
+            input_vec_h,
+            np.dtype(np.float32).itemsize * input_size,
             cudart.cudaMemcpyKind.cudaMemcpyDefault,
             stream1,
         )
     )
 
-    checkCudaErrors(cudart.cudaMemsetAsync(outputVec_d, 0, np.dtype(np.float64).itemsize * numOfBlocks, stream2))
+    check_cuda_errors(cudart.cudaMemsetAsync(output_vec_d, 0, np.dtype(np.float64).itemsize * num_of_blocks, stream2))
 
-    checkCudaErrors(cudart.cudaEventRecord(memsetEvent1, stream2))
+    check_cuda_errors(cudart.cudaEventRecord(memset_event1, stream2))
 
-    checkCudaErrors(cudart.cudaMemsetAsync(result_d, 0, np.dtype(np.float64).itemsize, stream3))
-    checkCudaErrors(cudart.cudaEventRecord(memsetEvent2, stream3))
+    check_cuda_errors(cudart.cudaMemsetAsync(result_d, 0, np.dtype(np.float64).itemsize, stream3))
+    check_cuda_errors(cudart.cudaEventRecord(memset_event2, stream3))
 
-    checkCudaErrors(cudart.cudaStreamWaitEvent(stream1, memsetEvent1, 0))
+    check_cuda_errors(cudart.cudaStreamWaitEvent(stream1, memset_event1, 0))
 
-    kernelArgs = (
-        (inputVec_d, outputVec_d, inputSize, numOfBlocks),
+    kernel_args = (
+        (input_vec_d, output_vec_d, input_size, num_of_blocks),
         (ctypes.c_void_p, ctypes.c_void_p, ctypes.c_size_t, ctypes.c_uint),
     )
-    checkCudaErrors(
+    check_cuda_errors(
         cuda.cuLaunchKernel(
             _reduce,
-            numOfBlocks,
+            num_of_blocks,
             1,
             1,
             THREADS_PER_BLOCK,
@@ -307,20 +307,20 @@ def cudaGraphsUsingStreamCapture(inputVec_h, inputVec_d, outputVec_d, result_d,
             1,
             0,
             stream1,
-            kernelArgs,
+            kernel_args,
             0,
         )
     )
 
-    checkCudaErrors(cudart.cudaStreamWaitEvent(stream1, memsetEvent2, 0))
+    check_cuda_errors(cudart.cudaStreamWaitEvent(stream1, memset_event2, 0))
 
-    kernelArgs2 = (
-        (outputVec_d, result_d, numOfBlocks),
+    kernel_args2 = (
+        (output_vec_d, result_d, num_of_blocks),
         (ctypes.c_void_p, ctypes.c_void_p, ctypes.c_uint),
     )
-    checkCudaErrors(cuda.cuLaunchKernel(_reduceFinal, 1, 1, 1, THREADS_PER_BLOCK, 1, 1, 0, stream1, kernelArgs2, 0))
+    check_cuda_errors(cuda.cuLaunchKernel(_reduceFinal, 1, 1, 1, THREADS_PER_BLOCK, 1, 1, 0, stream1, kernel_args2, 0))
 
-    checkCudaErrors(
+    check_cuda_errors(
         cudart.cudaMemcpyAsync(
             result_h,
             result_d,
@@ -332,71 +332,72 @@ def cudaGraphsUsingStreamCapture(inputVec_h, inputVec_d, outputVec_d, result_d,
 
     # WIP: Host nodes
 
-    graph = checkCudaErrors(cudart.cudaStreamEndCapture(stream1))
+    graph = check_cuda_errors(cudart.cudaStreamEndCapture(stream1))
 
-    nodes, numNodes = checkCudaErrors(cudart.cudaGraphGetNodes(graph))
-    print(f"\nNum of nodes in the graph created using stream capture API = {numNodes}")
+    nodes, num_nodes = check_cuda_errors(cudart.cudaGraphGetNodes(graph))
+    print(f"\nNum of nodes in the graph created using stream capture API = {num_nodes}")
 
-    graphExec = checkCudaErrors(cudart.cudaGraphInstantiate(graph, 0))
+    graph_exec = check_cuda_errors(cudart.cudaGraphInstantiate(graph, 0))
 
-    clonedGraph = checkCudaErrors(cudart.cudaGraphClone(graph))
-    clonedGraphExec = checkCudaErrors(cudart.cudaGraphInstantiate(clonedGraph, 0))
+    cloned_graph = check_cuda_errors(cudart.cudaGraphClone(graph))
+    cloned_graph_exec = check_cuda_errors(cudart.cudaGraphInstantiate(cloned_graph, 0))
 
     for _i in range(GRAPH_LAUNCH_ITERATIONS):
-        checkCudaErrors(cudart.cudaGraphLaunch(graphExec, streamForGraph))
+        check_cuda_errors(cudart.cudaGraphLaunch(graph_exec, stream_for_graph))
 
-    checkCudaErrors(cudart.cudaStreamSynchronize(streamForGraph))
+    check_cuda_errors(cudart.cudaStreamSynchronize(stream_for_graph))
 
     print("Cloned Graph Output..")
     for _i in range(GRAPH_LAUNCH_ITERATIONS):
-        checkCudaErrors(cudart.cudaGraphLaunch(clonedGraphExec, streamForGraph))
+        check_cuda_errors(cudart.cudaGraphLaunch(cloned_graph_exec, stream_for_graph))
 
-    checkCudaErrors(cudart.cudaStreamSynchronize(streamForGraph))
+    check_cuda_errors(cudart.cudaStreamSynchronize(stream_for_graph))
 
-    checkCudaErrors(cudart.cudaGraphExecDestroy(graphExec))
-    checkCudaErrors(cudart.cudaGraphExecDestroy(clonedGraphExec))
-    checkCudaErrors(cudart.cudaGraphDestroy(graph))
-    checkCudaErrors(cudart.cudaGraphDestroy(clonedGraph))
-    checkCudaErrors(cudart.cudaEventDestroy(memsetEvent2))
-    checkCudaErrors(cudart.cudaEventDestroy(memsetEvent1))
-    checkCudaErrors(cudart.cudaEventDestroy(forkStreamEvent))
-    checkCudaErrors(cudart.cudaStreamDestroy(stream3))
-    checkCudaErrors(cudart.cudaStreamDestroy(stream1))
-    checkCudaErrors(cudart.cudaStreamDestroy(stream2))
-    checkCudaErrors(cudart.cudaStreamDestroy(streamForGraph))
+    # Release every graph, event and stream created by this function (events and stream3 included).
+    check_cuda_errors(cudart.cudaGraphExecDestroy(graph_exec))
+    check_cuda_errors(cudart.cudaGraphExecDestroy(cloned_graph_exec))
+    check_cuda_errors(cudart.cudaGraphDestroy(graph))
+    check_cuda_errors(cudart.cudaGraphDestroy(cloned_graph))
+    check_cuda_errors(cudart.cudaEventDestroy(memset_event2))
+    check_cuda_errors(cudart.cudaEventDestroy(memset_event1))
+    check_cuda_errors(cudart.cudaEventDestroy(fork_stream_event))
+    check_cuda_errors(cudart.cudaStreamDestroy(stream3))
+    check_cuda_errors(cudart.cudaStreamDestroy(stream1))
+    check_cuda_errors(cudart.cudaStreamDestroy(stream2))
+    check_cuda_errors(cudart.cudaStreamDestroy(stream_for_graph))
 
 
 def main():
     size = 1 << 24  # number of elements to reduce
-    maxBlocks = 512
+    max_blocks = 512
 
     # This will pick the best possible CUDA capable device
-    devID = findCudaDevice()
+    dev_id = find_cuda_device()
 
     global _reduce
     global _reduceFinal
-    with common.KernelHelper(simpleCudaGraphs, devID) as kernelHelper:
-        _reduce = kernelHelper.getFunction(b"reduce")
-        _reduceFinal = kernelHelper.getFunction(b"reduceFinal")
+    kernel_helper = common.KernelHelper(simple_cuda_graphs, dev_id)
+    _reduce = kernel_helper.get_function(b"reduce")
+    _reduceFinal = kernel_helper.get_function(b"reduceFinal")
 
-        print(f"{size} elements")
-        print(f"threads per block  = {THREADS_PER_BLOCK}")
-        print(f"Graph Launch iterations = {GRAPH_LAUNCH_ITERATIONS}")
+    print(f"{size} elements")
+    print(f"threads per block  = {THREADS_PER_BLOCK}")
+    print(f"Graph Launch iterations = {GRAPH_LAUNCH_ITERATIONS}")
 
-        inputVec_h = checkCudaErrors(cudart.cudaMallocHost(size * np.dtype(np.float32).itemsize))
-        inputVec_d = checkCudaErrors(cudart.cudaMalloc(size * np.dtype(np.float32).itemsize))
-        outputVec_d = checkCudaErrors(cudart.cudaMalloc(maxBlocks * np.dtype(np.float64).itemsize))
-        result_d = checkCudaErrors(cudart.cudaMalloc(np.dtype(np.float64).itemsize))
+    input_vec_h = check_cuda_errors(cudart.cudaMallocHost(size * np.dtype(np.float32).itemsize))
+    input_vec_d = check_cuda_errors(cudart.cudaMalloc(size * np.dtype(np.float32).itemsize))
+    output_vec_d = check_cuda_errors(cudart.cudaMalloc(max_blocks * np.dtype(np.float64).itemsize))
+    result_d = check_cuda_errors(cudart.cudaMalloc(np.dtype(np.float64).itemsize))
 
-        init_input(inputVec_h, size)
+    init_input(input_vec_h, size)
 
-        cudaGraphsManual(inputVec_h, inputVec_d, outputVec_d, result_d, size, maxBlocks)
-        cudaGraphsUsingStreamCapture(inputVec_h, inputVec_d, outputVec_d, result_d, size, maxBlocks)
+    cuda_graphs_manual(input_vec_h, input_vec_d, output_vec_d, result_d, size, max_blocks)
+    cuda_graphs_using_stream_capture(input_vec_h, input_vec_d, output_vec_d, result_d, size, max_blocks)
 
-    checkCudaErrors(cudart.cudaFree(inputVec_d))
-    checkCudaErrors(cudart.cudaFree(outputVec_d))
-    checkCudaErrors(cudart.cudaFree(result_d))
-    checkCudaErrors(cudart.cudaFreeHost(inputVec_h))
+    check_cuda_errors(cudart.cudaFree(input_vec_d))
+    check_cuda_errors(cudart.cudaFree(output_vec_d))
+    check_cuda_errors(cudart.cudaFree(result_d))
+    check_cuda_errors(cudart.cudaFreeHost(input_vec_h))
 
 
 if __name__ == "__main__":
diff --git a/cuda_bindings/examples/4_CUDA_Libraries/conjugateGradientMultiBlockCG_test.py b/cuda_bindings/examples/4_CUDA_Libraries/conjugateGradientMultiBlockCG_test.py
index 257a7afa14..8ef5506257 100644
--- a/cuda_bindings/examples/4_CUDA_Libraries/conjugateGradientMultiBlockCG_test.py
+++ b/cuda_bindings/examples/4_CUDA_Libraries/conjugateGradientMultiBlockCG_test.py
@@ -9,12 +9,12 @@
 
 import numpy as np
 from common import common
-from common.helper_cuda import checkCudaErrors, findCudaDevice
+from common.helper_cuda import check_cuda_errors, find_cuda_device
 
 from cuda.bindings import driver as cuda
 from cuda.bindings import runtime as cudart
 
-conjugateGradientMultiBlockCG = """\
+conjugate_gradient_multi_block_cg = """\
 #line __LINE__
 #include <cooperative_groups.h>
 #include <cooperative_groups/reduce.h>
@@ -163,37 +163,37 @@
 """
 
 
-def genTridiag(I, J, val, N, nz):
-    I[0] = 0
-    J[0] = 0
-    J[1] = 0
+def gen_tridiag(i, j, val, n, nz):  # fills i (n + 1 row offsets), j and val (nz entries) in place
+    i[0] = 0
+    j[0] = 0
+    j[1] = 0
 
     val[0] = float(random()) + 10.0
     val[1] = float(random())
 
-    for i in range(1, N):
-        if i > 1:
-            I[i] = I[i - 1] + 3
+    for row in range(1, n):  # must not be named `i`: that would rebind the row-offset array
+        if row > 1:
+            i[row] = i[row - 1] + 3
         else:
-            I[1] = 2
+            i[1] = 2
 
-        start = (i - 1) * 3 + 2
-        J[start] = i - 1
-        J[start + 1] = i
+        start = (row - 1) * 3 + 2
+        j[start] = row - 1
+        j[start + 1] = row
 
-        if i < N - 1:
-            J[start + 2] = i + 1
+        if row < n - 1:
+            j[start + 2] = row + 1
 
         val[start] = val[start - 1]
         val[start + 1] = float(random()) + 10.0
 
-        if i < N - 1:
+        if row < n - 1:
             val[start + 2] = float(random())
-    I[N] = nz
+    i[n] = nz
 
 
 THREADS_PER_BLOCK = 512
-sSDKname = "conjugateGradientMultiBlockCG"
+s_sd_kname = "conjugateGradientMultiBlockCG"
 
 
 def main():
@@ -214,139 +214,137 @@ def main():
         pytest.skip("conjugateGradientMultiBlockCG is not supported on QNX")
 
     # This will pick the best possible CUDA capable device
-    devID = findCudaDevice()
-    deviceProp = checkCudaErrors(cudart.cudaGetDeviceProperties(devID))
+    dev_id = find_cuda_device()
+    device_prop = check_cuda_errors(cudart.cudaGetDeviceProperties(dev_id))
 
-    if not deviceProp.managedMemory:
+    if not device_prop.managedMemory:
         pytest.skip("Unified Memory not supported on this device")
 
     # This sample requires being run on a device that supports Cooperative Kernel
     # Launch
-    if not deviceProp.cooperativeLaunch:
-        pytest.skip(f"Selected GPU {devID} does not support Cooperative Kernel Launch")
+    if not device_prop.cooperativeLaunch:
+        pytest.skip(f"Selected GPU {dev_id} does not support Cooperative Kernel Launch")
 
     # Statistics about the GPU device
     print(
-        f"> GPU device has {deviceProp.multiProcessorCount:%d} Multi-Processors, SM {deviceProp.major:%d}.{deviceProp.minor:%d} compute capabilities\n"
+        f"> GPU device has {device_prop.multiProcessorCount:%d} Multi-Processors, SM {device_prop.major:%d}.{device_prop.minor:%d} compute capabilities\n"
     )
 
     # Get kernel
-    with common.KernelHelper(conjugateGradientMultiBlockCG, devID) as kernelHelper:
-        _gpuConjugateGradient = kernelHelper.getFunction(b"gpuConjugateGradient")
-
-        # Generate a random tridiagonal symmetric matrix in CSR format
-        N = 1048576
-        nz = (N - 2) * 3 + 4
-
-        I = checkCudaErrors(cudart.cudaMallocManaged(np.dtype(np.int32).itemsize * (N + 1), cudart.cudaMemAttachGlobal))
-        J = checkCudaErrors(cudart.cudaMallocManaged(np.dtype(np.int32).itemsize * nz, cudart.cudaMemAttachGlobal))
-        val = checkCudaErrors(cudart.cudaMallocManaged(np.dtype(np.float32).itemsize * nz, cudart.cudaMemAttachGlobal))
-        I_local = (ctypes.c_int * (N + 1)).from_address(I)
-        J_local = (ctypes.c_int * nz).from_address(J)
-        val_local = (ctypes.c_float * nz).from_address(val)
-
-        genTridiag(I_local, J_local, val_local, N, nz)
-
-        x = checkCudaErrors(cudart.cudaMallocManaged(np.dtype(np.float32).itemsize * N, cudart.cudaMemAttachGlobal))
-        rhs = checkCudaErrors(cudart.cudaMallocManaged(np.dtype(np.float32).itemsize * N, cudart.cudaMemAttachGlobal))
-        dot_result = checkCudaErrors(
-            cudart.cudaMallocManaged(np.dtype(np.float64).itemsize, cudart.cudaMemAttachGlobal)
-        )
-        x_local = (ctypes.c_float * N).from_address(x)
-        rhs_local = (ctypes.c_float * N).from_address(rhs)
-        dot_result_local = (ctypes.c_double).from_address(dot_result)
-        dot_result_local = 0
-
-        # temp memory for CG
-        r = checkCudaErrors(cudart.cudaMallocManaged(np.dtype(np.float32).itemsize * N, cudart.cudaMemAttachGlobal))
-        p = checkCudaErrors(cudart.cudaMallocManaged(np.dtype(np.float32).itemsize * N, cudart.cudaMemAttachGlobal))
-        Ax = checkCudaErrors(cudart.cudaMallocManaged(np.dtype(np.float32).itemsize * N, cudart.cudaMemAttachGlobal))
-        r_local = (ctypes.c_float * N).from_address(r)
-
-        checkCudaErrors(cudart.cudaDeviceSynchronize())
-
-        start = checkCudaErrors(cudart.cudaEventCreate())
-        stop = checkCudaErrors(cudart.cudaEventCreate())
-
-        for i in range(N):
-            r_local[i] = rhs_local[i] = 1.0
-            x_local[i] = 0.0
-
-        kernelArgs_value = (I, J, val, x, Ax, p, r, dot_result, nz, N, tol)
-        kernelArgs_types = (
-            ctypes.c_void_p,
-            ctypes.c_void_p,
-            ctypes.c_void_p,
-            ctypes.c_void_p,
-            ctypes.c_void_p,
-            ctypes.c_void_p,
-            ctypes.c_void_p,
-            ctypes.c_void_p,
-            ctypes.c_int,
-            ctypes.c_int,
-            ctypes.c_float,
-        )
-        kernelArgs = (kernelArgs_value, kernelArgs_types)
+    kernel_helper = common.KernelHelper(conjugate_gradient_multi_block_cg, dev_id)
+    _gpu_conjugate_gradient = kernel_helper.get_function(b"gpuConjugateGradient")
+
+    # Generate a random tridiagonal symmetric matrix in CSR format
+    n = 1048576
+    nz = (n - 2) * 3 + 4
+
+    i = check_cuda_errors(cudart.cudaMallocManaged(np.dtype(np.int32).itemsize * (n + 1), cudart.cudaMemAttachGlobal))
+    j = check_cuda_errors(cudart.cudaMallocManaged(np.dtype(np.int32).itemsize * nz, cudart.cudaMemAttachGlobal))
+    val = check_cuda_errors(cudart.cudaMallocManaged(np.dtype(np.float32).itemsize * nz, cudart.cudaMemAttachGlobal))
+    i_local = (ctypes.c_int * (n + 1)).from_address(i)
+    j_local = (ctypes.c_int * nz).from_address(j)
+    val_local = (ctypes.c_float * nz).from_address(val)
+
+    gen_tridiag(i_local, j_local, val_local, n, nz)
+
+    x = check_cuda_errors(cudart.cudaMallocManaged(np.dtype(np.float32).itemsize * n, cudart.cudaMemAttachGlobal))
+    rhs = check_cuda_errors(cudart.cudaMallocManaged(np.dtype(np.float32).itemsize * n, cudart.cudaMemAttachGlobal))
+    dot_result = check_cuda_errors(cudart.cudaMallocManaged(np.dtype(np.float64).itemsize, cudart.cudaMemAttachGlobal))
+    x_local = (ctypes.c_float * n).from_address(x)
+    rhs_local = (ctypes.c_float * n).from_address(rhs)
+    dot_result_local = (ctypes.c_double).from_address(dot_result)
+    dot_result_local = 0
+
+    # temp memory for CG
+    r = check_cuda_errors(cudart.cudaMallocManaged(np.dtype(np.float32).itemsize * n, cudart.cudaMemAttachGlobal))
+    p = check_cuda_errors(cudart.cudaMallocManaged(np.dtype(np.float32).itemsize * n, cudart.cudaMemAttachGlobal))
+    ax = check_cuda_errors(cudart.cudaMallocManaged(np.dtype(np.float32).itemsize * n, cudart.cudaMemAttachGlobal))
+    r_local = (ctypes.c_float * n).from_address(r)
+
+    check_cuda_errors(cudart.cudaDeviceSynchronize())
+
+    start = check_cuda_errors(cudart.cudaEventCreate())
+    stop = check_cuda_errors(cudart.cudaEventCreate())
+
+    for i in range(n):
+        r_local[i] = rhs_local[i] = 1.0
+        x_local[i] = 0.0
+
+    kernel_args_value = (i, j, val, x, ax, p, r, dot_result, nz, n, tol)
+    kernel_args_types = (
+        ctypes.c_void_p,
+        ctypes.c_void_p,
+        ctypes.c_void_p,
+        ctypes.c_void_p,
+        ctypes.c_void_p,
+        ctypes.c_void_p,
+        ctypes.c_void_p,
+        ctypes.c_void_p,
+        ctypes.c_int,
+        ctypes.c_int,
+        ctypes.c_float,
+    )
+    kernel_args = (kernel_args_value, kernel_args_types)
 
-        sMemSize = np.dtype(np.float64).itemsize * ((THREADS_PER_BLOCK / 32) + 1)
-        numThreads = THREADS_PER_BLOCK
-        numBlocksPerSm = checkCudaErrors(
-            cuda.cuOccupancyMaxActiveBlocksPerMultiprocessor(_gpuConjugateGradient, numThreads, sMemSize)
-        )
-        numSms = deviceProp.multiProcessorCount
-        dimGrid = cudart.dim3()
-        dimGrid.x = numSms * numBlocksPerSm
-        dimGrid.y = 1
-        dimGrid.z = 1
-        dimBlock = cudart.dim3()
-        dimBlock.x = THREADS_PER_BLOCK
-        dimBlock.y = 1
-        dimBlock.z = 1
-
-        checkCudaErrors(cudart.cudaEventRecord(start, 0))
-        checkCudaErrors(
-            cuda.cuLaunchCooperativeKernel(
-                _gpuConjugateGradient,
-                dimGrid.x,
-                dimGrid.y,
-                dimGrid.z,
-                dimBlock.x,
-                dimBlock.y,
-                dimBlock.z,
-                0,
-                0,
-                kernelArgs,
-            )
+    s_mem_size = np.dtype(np.float64).itemsize * ((THREADS_PER_BLOCK / 32) + 1)
+    num_threads = THREADS_PER_BLOCK
+    num_blocks_per_sm = check_cuda_errors(
+        cuda.cuOccupancyMaxActiveBlocksPerMultiprocessor(_gpu_conjugate_gradient, num_threads, s_mem_size)
+    )
+    num_sms = device_prop.multiProcessorCount
+    dim_grid = cudart.dim3()
+    dim_grid.x = num_sms * num_blocks_per_sm
+    dim_grid.y = 1
+    dim_grid.z = 1
+    dim_block = cudart.dim3()
+    dim_block.x = THREADS_PER_BLOCK
+    dim_block.y = 1
+    dim_block.z = 1
+
+    check_cuda_errors(cudart.cudaEventRecord(start, 0))
+    check_cuda_errors(
+        cuda.cuLaunchCooperativeKernel(
+            _gpu_conjugate_gradient,
+            dim_grid.x,
+            dim_grid.y,
+            dim_grid.z,
+            dim_block.x,
+            dim_block.y,
+            dim_block.z,
+            0,
+            0,
+            kernel_args,
         )
-        checkCudaErrors(cudart.cudaEventRecord(stop, 0))
-        checkCudaErrors(cudart.cudaDeviceSynchronize())
-
-        time = checkCudaErrors(cudart.cudaEventElapsedTime(start, stop))
-        print(f"GPU Final, residual = {math.sqrt(dot_result_local):e}, kernel execution time = {time:f} ms")
-
-        err = 0.0
-        for i in range(N):
-            rsum = 0.0
-
-            for j in range(I_local[i], I_local[i + 1]):
-                rsum += val_local[j] * x_local[J_local[j]]
-
-            diff = math.fabs(rsum - rhs_local[i])
-
-            if diff > err:
-                err = diff
-
-        checkCudaErrors(cudart.cudaFree(I))
-        checkCudaErrors(cudart.cudaFree(J))
-        checkCudaErrors(cudart.cudaFree(val))
-        checkCudaErrors(cudart.cudaFree(x))
-        checkCudaErrors(cudart.cudaFree(rhs))
-        checkCudaErrors(cudart.cudaFree(r))
-        checkCudaErrors(cudart.cudaFree(p))
-        checkCudaErrors(cudart.cudaFree(Ax))
-        checkCudaErrors(cudart.cudaFree(dot_result))
-    checkCudaErrors(cudart.cudaEventDestroy(start))
-    checkCudaErrors(cudart.cudaEventDestroy(stop))
+    )
+    check_cuda_errors(cudart.cudaEventRecord(stop, 0))
+    check_cuda_errors(cudart.cudaDeviceSynchronize())
+
+    time = check_cuda_errors(cudart.cudaEventElapsedTime(start, stop))
+    print(f"GPU Final, residual = {math.sqrt(dot_result_local):e}, kernel execution time = {time:f} ms")
+
+    err = 0.0
+    for i in range(n):
+        rsum = 0.0
+
+        for j in range(i_local[i], i_local[i + 1]):
+            rsum += val_local[j] * x_local[j_local[j]]
+
+        diff = math.fabs(rsum - rhs_local[i])
+
+        if diff > err:
+            err = diff
+
+    check_cuda_errors(cudart.cudaFree(i))
+    check_cuda_errors(cudart.cudaFree(j))
+    check_cuda_errors(cudart.cudaFree(val))
+    check_cuda_errors(cudart.cudaFree(x))
+    check_cuda_errors(cudart.cudaFree(rhs))
+    check_cuda_errors(cudart.cudaFree(r))
+    check_cuda_errors(cudart.cudaFree(p))
+    check_cuda_errors(cudart.cudaFree(ax))
+    check_cuda_errors(cudart.cudaFree(dot_result))
+    check_cuda_errors(cudart.cudaEventDestroy(start))
+    check_cuda_errors(cudart.cudaEventDestroy(stop))
 
     print(f"Test Summary:  Error amount = {err:f}")
     if math.sqrt(dot_result_local) >= tol:
diff --git a/cuda_bindings/examples/common/common.py b/cuda_bindings/examples/common/common.py
index 8723abe26a..5b5151ef24 100644
--- a/cuda_bindings/examples/common/common.py
+++ b/cuda_bindings/examples/common/common.py
@@ -2,10 +2,8 @@
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
 
-from contextlib import suppress
-
 import numpy as np
-from common.helper_cuda import checkCudaErrors
+from common.helper_cuda import check_cuda_errors
 
 from cuda import pathfinder
 from cuda.bindings import driver as cuda
@@ -13,14 +11,14 @@
 from cuda.bindings import runtime as cudart
 
 
-def pytest_skipif_compute_capability_too_low(devID, required_cc_major_minor):
+def pytest_skipif_compute_capability_too_low(dev_id, required_cc_major_minor):
     import pytest
 
-    cc_major = checkCudaErrors(
-        cudart.cudaDeviceGetAttribute(cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMajor, devID)
+    cc_major = check_cuda_errors(
+        cudart.cudaDeviceGetAttribute(cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMajor, dev_id)
     )
-    cc_minor = checkCudaErrors(
-        cudart.cudaDeviceGetAttribute(cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMinor, devID)
+    cc_minor = check_cuda_errors(
+        cudart.cudaDeviceGetAttribute(cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMinor, dev_id)
     )
     have_cc_major_minor = (cc_major, cc_minor)
     if have_cc_major_minor < required_cc_major_minor:
@@ -28,8 +26,7 @@ def pytest_skipif_compute_capability_too_low(devID, required_cc_major_minor):
 
 
 class KernelHelper:
-    def __init__(self, code, devID):
-        self.module = None
+    def __init__(self, code, dev_id):
         include_dirs = []
         for libname in ("cudart", "cccl"):
             hdr_dir = pathfinder.find_nvidia_header_directory(libname)
@@ -39,18 +36,18 @@ def __init__(self, code, devID):
                 pytest.skip(f'pathfinder.find_nvidia_header_directory("{libname}") returned None')
             include_dirs.append(hdr_dir)
 
-        prog = checkCudaErrors(nvrtc.nvrtcCreateProgram(str.encode(code), b"sourceCode.cu", 0, None, None))
+        prog = check_cuda_errors(nvrtc.nvrtcCreateProgram(str.encode(code), b"sourceCode.cu", 0, None, None))
 
         # Initialize CUDA
-        checkCudaErrors(cudart.cudaFree(0))
+        check_cuda_errors(cudart.cudaFree(0))
 
-        major = checkCudaErrors(
-            cudart.cudaDeviceGetAttribute(cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMajor, devID)
+        major = check_cuda_errors(
+            cudart.cudaDeviceGetAttribute(cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMajor, dev_id)
         )
-        minor = checkCudaErrors(
-            cudart.cudaDeviceGetAttribute(cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMinor, devID)
+        minor = check_cuda_errors(
+            cudart.cudaDeviceGetAttribute(cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMinor, dev_id)
         )
-        _, nvrtc_minor = checkCudaErrors(nvrtc.nvrtcVersion())
+        _, nvrtc_minor = check_cuda_errors(nvrtc.nvrtcVersion())
         use_cubin = nvrtc_minor >= 1
         prefix = "sm" if use_cubin else "compute"
         arch_arg = bytes(f"--gpu-architecture={prefix}_{major}{minor}", "ascii")
@@ -65,44 +62,27 @@ def __init__(self, code, devID):
             opts.append(f"--include-path={inc_dir}".encode())
 
         try:
-            checkCudaErrors(nvrtc.nvrtcCompileProgram(prog, len(opts), opts))
-
-            if use_cubin:
-                dataSize = checkCudaErrors(nvrtc.nvrtcGetCUBINSize(prog))
-                data = b" " * dataSize
-                checkCudaErrors(nvrtc.nvrtcGetCUBIN(prog, data))
-            else:
-                dataSize = checkCudaErrors(nvrtc.nvrtcGetPTXSize(prog))
-                data = b" " * dataSize
-                checkCudaErrors(nvrtc.nvrtcGetPTX(prog, data))
+            check_cuda_errors(nvrtc.nvrtcCompileProgram(prog, len(opts), opts))
         except RuntimeError as err:
-            logSize = checkCudaErrors(nvrtc.nvrtcGetProgramLogSize(prog))
-            log = b" " * logSize
-            checkCudaErrors(nvrtc.nvrtcGetProgramLog(prog, log))
+            log_size = check_cuda_errors(nvrtc.nvrtcGetProgramLogSize(prog))
+            log = b" " * log_size
+            check_cuda_errors(nvrtc.nvrtcGetProgramLog(prog, log))
             import sys
 
             print(log.decode(), file=sys.stderr)
             print(err, file=sys.stderr)
             sys.exit(1)
-        finally:
-            checkCudaErrors(nvrtc.nvrtcDestroyProgram(prog))
-
-        self.module = checkCudaErrors(cuda.cuModuleLoadData(np.char.array(data)))
-
-    def getFunction(self, name):
-        return checkCudaErrors(cuda.cuModuleGetFunction(self.module, name))
-
-    def close(self):
-        if self.module is not None:
-            checkCudaErrors(cuda.cuModuleUnload(self.module))
-            self.module = None
 
-    def __enter__(self):
-        return self
+        if use_cubin:
+            data_size = check_cuda_errors(nvrtc.nvrtcGetCUBINSize(prog))
+            data = b" " * data_size
+            check_cuda_errors(nvrtc.nvrtcGetCUBIN(prog, data))
+        else:
+            data_size = check_cuda_errors(nvrtc.nvrtcGetPTXSize(prog))
+            data = b" " * data_size
+            check_cuda_errors(nvrtc.nvrtcGetPTX(prog, data))
 
-    def __exit__(self, exc_type, exc, tb):
-        self.close()
+        self.module = check_cuda_errors(cuda.cuModuleLoadData(np.char.array(data)))
 
-    def __del__(self):
-        with suppress(Exception):
-            self.close()
+    def get_function(self, name):
+        return check_cuda_errors(cuda.cuModuleGetFunction(self.module, name))
diff --git a/cuda_bindings/examples/common/helper_cuda.py b/cuda_bindings/examples/common/helper_cuda.py
index d741eb54d9..9fbfe8c82f 100644
--- a/cuda_bindings/examples/common/helper_cuda.py
+++ b/cuda_bindings/examples/common/helper_cuda.py
@@ -1,14 +1,14 @@
 # Copyright 2021-2025 NVIDIA Corporation.  All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
-from common.helper_string import checkCmdLineFlag, getCmdLineArgumentInt
+from common.helper_string import check_cmd_line_flag, get_cmd_line_argument_int
 
 from cuda.bindings import driver as cuda
 from cuda.bindings import nvrtc
 from cuda.bindings import runtime as cudart
 
 
-def _cudaGetErrorEnum(error):
+def _cuda_get_error_enum(error):
     if isinstance(error, cuda.CUresult):
         err, name = cuda.cuGetErrorName(error)
         return name if err == cuda.CUresult.CUDA_SUCCESS else "<unknown>"
@@ -20,9 +20,9 @@ def _cudaGetErrorEnum(error):
         raise RuntimeError(f"Unknown error type: {error}")
 
 
-def checkCudaErrors(result):
+def check_cuda_errors(result):
     if result[0].value:
-        raise RuntimeError(f"CUDA error code={result[0].value}({_cudaGetErrorEnum(result[0])})")
+        raise RuntimeError(f"CUDA error code={result[0].value}({_cuda_get_error_enum(result[0])})")
     if len(result) == 1:
         return None
     elif len(result) == 2:
@@ -31,18 +31,18 @@ def checkCudaErrors(result):
         return result[1:]
 
 
-def findCudaDevice():
-    devID = 0
-    if checkCmdLineFlag("device="):
-        devID = getCmdLineArgumentInt("device=")
-    checkCudaErrors(cudart.cudaSetDevice(devID))
-    return devID
+def find_cuda_device():
+    dev_id = 0
+    if check_cmd_line_flag("device="):
+        dev_id = get_cmd_line_argument_int("device=")
+    check_cuda_errors(cudart.cudaSetDevice(dev_id))
+    return dev_id
 
 
-def findCudaDeviceDRV():
-    devID = 0
-    if checkCmdLineFlag("device="):
-        devID = getCmdLineArgumentInt("device=")
-    checkCudaErrors(cuda.cuInit(0))
-    cuDevice = checkCudaErrors(cuda.cuDeviceGet(devID))
-    return cuDevice
+def find_cuda_device_drv():
+    dev_id = 0
+    if check_cmd_line_flag("device="):
+        dev_id = get_cmd_line_argument_int("device=")
+    check_cuda_errors(cuda.cuInit(0))
+    cu_device = check_cuda_errors(cuda.cuDeviceGet(dev_id))
+    return cu_device
diff --git a/cuda_bindings/examples/common/helper_string.py b/cuda_bindings/examples/common/helper_string.py
index 9f8e70a6c4..47d9d36569 100644
--- a/cuda_bindings/examples/common/helper_string.py
+++ b/cuda_bindings/examples/common/helper_string.py
@@ -4,12 +4,12 @@
 import sys
 
 
-def checkCmdLineFlag(stringRef):
-    return any(stringRef == i and k < len(sys.argv) - 1 for i, k in enumerate(sys.argv))
+def check_cmd_line_flag(string_ref):
+    return any(string_ref == i and k < len(sys.argv) - 1 for i, k in enumerate(sys.argv))
 
 
-def getCmdLineArgumentInt(stringRef):
+def get_cmd_line_argument_int(string_ref):
     for i, k in enumerate(sys.argv):
-        if stringRef == i and k < len(sys.argv) - 1:
+        if string_ref == i and k < len(sys.argv) - 1:
             return sys.argv[k + 1]
     return 0
diff --git a/cuda_bindings/examples/extra/isoFDModelling_test.py b/cuda_bindings/examples/extra/isoFDModelling_test.py
index 148d836adf..21303664ac 100644
--- a/cuda_bindings/examples/extra/isoFDModelling_test.py
+++ b/cuda_bindings/examples/extra/isoFDModelling_test.py
@@ -5,12 +5,12 @@
 
 import numpy as np
 from common import common
-from common.helper_cuda import checkCudaErrors
+from common.helper_cuda import check_cuda_errors
 
 from cuda.bindings import driver as cuda
 from cuda.bindings import runtime as cudart
 
-isoPropagator = """\
+iso_propagator = """\
 extern "C"
 __global__ void injectSource(float *__restrict__ in, float *__restrict__ src, int it)
 {
@@ -177,7 +177,7 @@ def align_ny(ny, blk, nops):
 #
 # this class contains the input params
 #
-class params:
+class Params:
     def __init__(self):
         self.BDIMX = 32  # tiles x y for fd operators
         self.BDIMY = 16
@@ -209,53 +209,53 @@ def __init__(self):
 #
 # this class contains all the kernels to be used bu propagator
 #
-class cudaKernels:
+class CudaKernels:
     def __init__(self, cntx):
-        checkCudaErrors(cuda.cuInit(0))
-        checkCudaErrors(cuda.cuCtxSetCurrent(cntx))
-        dev = checkCudaErrors(cuda.cuCtxGetDevice())
+        check_cuda_errors(cuda.cuInit(0))
+        check_cuda_errors(cuda.cuCtxSetCurrent(cntx))
+        dev = check_cuda_errors(cuda.cuCtxGetDevice())
 
-        self.kernelHelper = common.KernelHelper(isoPropagator, int(dev))
+        self.kernel_helper = common.KernelHelper(iso_propagator, int(dev))
 
         # kernel to create a source fnction with some max frequency
-        self.creatSource = self.kernelHelper.getFunction(b"createSource")
+        self.creatSource = self.kernel_helper.get_function(b"createSource")
         # create a velocity to try things: just a sphere on the middle 4500 m/s and 2500 m/s all around
-        self.createVelocity = self.kernelHelper.getFunction(b"createVelocity")
+        self.create_velocity = self.kernel_helper.get_function(b"createVelocity")
 
         # kernel to propagate the wavefield by 1 step in time
-        self.fdPropag = self.kernelHelper.getFunction(b"fwd_3D_orderX2k")
+        self.fdPropag = self.kernel_helper.get_function(b"fwd_3D_orderX2k")
 
         # kernel to propagate the wavefield by 1 step in time
-        self.injectSource = self.kernelHelper.getFunction(b"injectSource")
+        self.inject_source = self.kernel_helper.get_function(b"injectSource")
 
 
 #
 # this class contains: propagator, source creation, velocity creation
 # injection of data and domain exchange
 #
-class propagator:
+class Propagator:
     def __init__(self, params, _dev):
         print("init object for device ", _dev)
         self.dev = _dev
 
-        checkCudaErrors(cuda.cuInit(0))
-        self.cuDevice = checkCudaErrors(cuda.cuDeviceGet(_dev))
-        self.context = checkCudaErrors(cuda.cuCtxCreate(None, 0, self.cuDevice))
+        check_cuda_errors(cuda.cuInit(0))
+        self.cu_device = check_cuda_errors(cuda.cuDeviceGet(_dev))
+        self.context = check_cuda_errors(cuda.cuCtxCreate(None, 0, self.cu_device))
         self.waveOut = 0
         self.waveIn = 0
-        self.streamCenter = checkCudaErrors(cuda.cuStreamCreate(0))
-        self.streamHalo = checkCudaErrors(cuda.cuStreamCreate(0))
-        self.params = params
+        self.streamCenter = check_cuda_errors(cuda.cuStreamCreate(0))
+        self.streamHalo = check_cuda_errors(cuda.cuStreamCreate(0))
+        self.Params = params
 
     def __del__(self):
-        checkCudaErrors(cuda.cuCtxSetCurrent(self.context))
-        checkCudaErrors(cuda.cuStreamDestroy(self.streamHalo))
-        checkCudaErrors(cuda.cuStreamDestroy(self.streamCenter))
+        check_cuda_errors(cuda.cuCtxSetCurrent(self.context))
+        check_cuda_errors(cuda.cuStreamDestroy(self.streamHalo))
+        check_cuda_errors(cuda.cuStreamDestroy(self.streamCenter))
         if self.waveIn != 0:
-            checkCudaErrors(cuda.cuMemFree(self.waveIn))
+            check_cuda_errors(cuda.cuMemFree(self.waveIn))
         if self.waveOut != 0:
-            checkCudaErrors(cuda.cuMemFree(self.waveOut))
-        checkCudaErrors(cuda.cuCtxDestroy(self.context))
+            check_cuda_errors(cuda.cuMemFree(self.waveOut))
+        check_cuda_errors(cuda.cuCtxDestroy(self.context))
 
     #
     # swap waveIn with waveOut
@@ -275,45 +275,45 @@ def swap(self):
     # allocate the device memory
     #
     def allocate(self):
-        nel = self.params.nx * self.params.ny * self.params.nz
+        nel = self.Params.nx * self.Params.ny * self.Params.nz
         n = np.array(nel, dtype=np.uint32)
 
-        bufferSize = n * np.dtype(np.float32).itemsize
-        checkCudaErrors(cuda.cuCtxSetCurrent(self.context))
+        buffer_size = n * np.dtype(np.float32).itemsize
+        check_cuda_errors(cuda.cuCtxSetCurrent(self.context))
 
-        self.velocity = checkCudaErrors(cuda.cuMemAlloc(bufferSize))
-        checkCudaErrors(cuda.cuMemsetD32(self.velocity, 0, n))
+        self.velocity = check_cuda_errors(cuda.cuMemAlloc(buffer_size))
+        check_cuda_errors(cuda.cuMemsetD32(self.velocity, 0, n))
 
-        nel += self.params.lead
+        nel += self.Params.lead
         n = np.array(nel, dtype=np.uint32)  ## we need to align at the beginning of the tile
 
-        bufferSize = n * np.dtype(np.float32).itemsize
-        self.waveIn = checkCudaErrors(cuda.cuMemAlloc(bufferSize))
-        checkCudaErrors(cuda.cuMemsetD32(self.waveIn, 0, n))
+        buffer_size = n * np.dtype(np.float32).itemsize
+        self.waveIn = check_cuda_errors(cuda.cuMemAlloc(buffer_size))
+        check_cuda_errors(cuda.cuMemsetD32(self.waveIn, 0, n))
 
-        self.waveOut = checkCudaErrors(cuda.cuMemAlloc(bufferSize))
-        checkCudaErrors(cuda.cuMemsetD32(self.waveOut, 0, n))
+        self.waveOut = check_cuda_errors(cuda.cuMemAlloc(buffer_size))
+        check_cuda_errors(cuda.cuMemsetD32(self.waveOut, 0, n))
 
-        n = np.array(self.params.nt, dtype=np.uint32)
-        bufferSize = n * np.dtype(np.float32).itemsize
-        self.source = checkCudaErrors(cuda.cuMemAlloc(bufferSize))
-        checkCudaErrors(cuda.cuMemsetD32(self.source, 0, n))
+        n = np.array(self.Params.nt, dtype=np.uint32)
+        buffer_size = n * np.dtype(np.float32).itemsize
+        self.source = check_cuda_errors(cuda.cuMemAlloc(buffer_size))
+        check_cuda_errors(cuda.cuMemsetD32(self.source, 0, n))
 
     #
     # create source data
     #
-    def createSource(self, kernel):
+    def create_source(self, kernel):
         print("creating source on device ", self.dev)
 
         buf = np.array([int(self.source)], dtype=np.uint64)
-        nt = np.array(self.params.nt, dtype=np.uint32)
-        dt = np.array(self.params.dt, dtype=np.float32)
-        freq = np.array(self.params.freqMax, dtype=np.float32)
+        nt = np.array(self.Params.nt, dtype=np.uint32)
+        dt = np.array(self.Params.dt, dtype=np.float32)
+        freq = np.array(self.Params.freqMax, dtype=np.float32)
 
         args = [buf, dt, freq, nt]
         argsp = np.array([arg.ctypes.data for arg in args], dtype=np.uint64)
-        checkCudaErrors(cuda.cuCtxSetCurrent(self.context))
-        checkCudaErrors(
+        check_cuda_errors(cuda.cuCtxSetCurrent(self.context))
+        check_cuda_errors(
             cuda.cuLaunchKernel(
                 kernel.creatSource,
                 1,
@@ -328,34 +328,34 @@ def createSource(self, kernel):
                 0,
             )
         )  # arguments
-        checkCudaErrors(cuda.cuStreamSynchronize(self.streamHalo))
+        check_cuda_errors(cuda.cuStreamSynchronize(self.streamHalo))
 
     #
     # inject source function: ony on the domain 0
     #
-    def injectSource(self, kernel, iter):
-        checkCudaErrors(cuda.cuCtxSetCurrent(self.context))
+    def inject_source(self, kernel, iter):
+        check_cuda_errors(cuda.cuCtxSetCurrent(self.context))
 
         if self.dev != 0:
             return
 
         wavein = np.array([int(self.waveIn)], dtype=np.uint64)
         src = np.array([int(self.source)], dtype=np.uint64)
-        offset_sourceInject = (
-            self.params.lead
-            + (int)(self.params.nz / 2) * self.params.nx * self.params.ny
-            + (int)(self.params.ny / 2) * self.params.nx
-            + (int)(self.params.nx / 2)
+        offset_source_inject = (
+            self.Params.lead
+            + (int)(self.Params.nz / 2) * self.Params.nx * self.Params.ny
+            + (int)(self.Params.ny / 2) * self.Params.nx
+            + (int)(self.Params.nx / 2)
         )
-        offset_sourceInject *= np.dtype(np.float32).itemsize
+        offset_source_inject *= np.dtype(np.float32).itemsize
 
         np_it = np.array(iter, dtype=np.uint32)
 
-        args = [wavein + offset_sourceInject, src, np_it]
+        args = [wavein + offset_source_inject, src, np_it]
         argsp = np.array([arg.ctypes.data for arg in args], dtype=np.uint64)
-        checkCudaErrors(
+        check_cuda_errors(
             cuda.cuLaunchKernel(
-                kernel.injectSource,
+                kernel.inject_source,
                 1,
                 1,
                 1,  # grid dim
@@ -372,39 +372,39 @@ def injectSource(self, kernel, iter):
     #
     # create velocity
     #
-    def createVelocity(self, kernel):
+    def create_velocity(self, kernel):
         print("running create velocity on device ", self.dev)
 
         offset_velocity = (
-            self.params.FD_ORDER * self.params.nx * self.params.ny
-            + self.params.FD_ORDER * self.params.nx
-            + self.params.FD_ORDER
+            self.Params.FD_ORDER * self.Params.nx * self.Params.ny
+            + self.Params.FD_ORDER * self.Params.nx
+            + self.Params.FD_ORDER
         )
         offset_velocity *= np.dtype(np.float32).itemsize
 
         vel = np.array([int(self.velocity)], dtype=np.uint64)
-        dx_dt2 = (self.params.dt * self.params.dt) / (self.params.delta * self.params.delta)
+        dx_dt2 = (self.Params.dt * self.Params.dt) / (self.Params.delta * self.Params.delta)
 
-        stride = self.params.nx * self.params.ny
+        stride = self.Params.nx * self.Params.ny
         np_dx_dt2 = np.array(dx_dt2, dtype=np.float32)
-        np_nz = np.array((self.params.nz - 2 * self.params.FD_ORDER), dtype=np.uint32)
-        np_nx = np.array(self.params.nx, dtype=np.uint32)
+        np_nz = np.array((self.Params.nz - 2 * self.Params.FD_ORDER), dtype=np.uint32)
+        np_nx = np.array(self.Params.nx, dtype=np.uint32)
         np_stride = np.array(stride, dtype=np.uint32)
 
         args = [vel + offset_velocity, np_dx_dt2, np_nz, np_nx, np_stride]
         argsp = np.array([arg.ctypes.data for arg in args], dtype=np.uint64)
 
-        checkCudaErrors(cuda.cuCtxSetCurrent(self.context))
+        check_cuda_errors(cuda.cuCtxSetCurrent(self.context))
 
         # do halo up
-        checkCudaErrors(
+        check_cuda_errors(
             cuda.cuLaunchKernel(
-                kernel.createVelocity,
-                self.params.blkx,
-                self.params.blky,
+                kernel.create_velocity,
+                self.Params.blkx,
+                self.Params.blky,
                 1,  # grid dim
-                2 * self.params.BDIMX,
-                self.params.BDIMY,
+                2 * self.Params.BDIMX,
+                self.Params.BDIMY,
                 1,  # block dim
                 0,
                 self.streamHalo,  # shared mem and stream
@@ -412,22 +412,22 @@ def createVelocity(self, kernel):
                 0,
             )
         )  # arguments
-        checkCudaErrors(cuda.cuStreamSynchronize(self.streamHalo))
+        check_cuda_errors(cuda.cuStreamSynchronize(self.streamHalo))
 
     #
     # execute the center part of propagation
     #
-    def executeCenter(self, kernel):
+    def execute_center(self, kernel):
         if verbose_prints:
             print("running center on device ", self.dev)
-        checkCudaErrors(cuda.cuCtxSetCurrent(self.context))
+        check_cuda_errors(cuda.cuCtxSetCurrent(self.context))
         offset_velocity = (
-            2 * self.params.FD_ORDER * self.params.nx * self.params.ny
-            + self.params.FD_ORDER * self.params.nx
-            + self.params.FD_ORDER
+            2 * self.Params.FD_ORDER * self.Params.nx * self.Params.ny
+            + self.Params.FD_ORDER * self.Params.nx
+            + self.Params.FD_ORDER
         )
 
-        offset_wave = self.params.lead + offset_velocity
+        offset_wave = self.Params.lead + offset_velocity
 
         offset_wave *= np.dtype(np.float32).itemsize
         offset_velocity *= np.dtype(np.float32).itemsize
@@ -436,9 +436,9 @@ def executeCenter(self, kernel):
         waveout = np.array([int(self.waveOut)], dtype=np.uint64)
 
         vel = np.array([int(self.velocity)], dtype=np.uint64)
-        stride = self.params.nx * self.params.ny
-        np_nz = np.array(self.params.nz - 4 * self.params.FD_ORDER, dtype=np.uint32)
-        np_nx = np.array(self.params.nx, dtype=np.uint32)
+        stride = self.Params.nx * self.Params.ny
+        np_nz = np.array(self.Params.nz - 4 * self.Params.FD_ORDER, dtype=np.uint32)
+        np_nx = np.array(self.Params.nx, dtype=np.uint32)
         np_stride = np.array(stride, dtype=np.uint32)
 
         args = [
@@ -452,14 +452,14 @@ def executeCenter(self, kernel):
         argsp = np.array([arg.ctypes.data for arg in args], dtype=np.uint64)
 
         # do center propagation from 2 * fd_order to nz - 2 * fd_order
-        checkCudaErrors(
+        check_cuda_errors(
             cuda.cuLaunchKernel(
                 kernel.fdPropag,
-                self.params.blkx,
-                self.params.blky,
+                self.Params.blkx,
+                self.Params.blky,
                 1,  # grid dim
-                self.params.BDIMX,
-                self.params.BDIMY,
+                self.Params.BDIMX,
+                self.Params.BDIMY,
                 1,  # block dim
                 0,
                 self.streamCenter,  # shared mem and stream
@@ -471,18 +471,18 @@ def executeCenter(self, kernel):
     #
     # execute the halo part of propagation
     #
-    def executeHalo(self, kernel):
+    def execute_halo(self, kernel):
         if verbose_prints:
             print("running halos on device ", self.dev)
-        checkCudaErrors(cuda.cuCtxSetCurrent(self.context))
+        check_cuda_errors(cuda.cuCtxSetCurrent(self.context))
 
         offset_velocity = (
-            self.params.FD_ORDER * self.params.nx * self.params.ny
-            + self.params.FD_ORDER * self.params.nx
-            + self.params.FD_ORDER
+            self.Params.FD_ORDER * self.Params.nx * self.Params.ny
+            + self.Params.FD_ORDER * self.Params.nx
+            + self.Params.FD_ORDER
         )
 
-        offset_wave = self.params.lead + offset_velocity
+        offset_wave = self.Params.lead + offset_velocity
 
         offset_wave *= np.dtype(np.float32).itemsize
         offset_velocity *= np.dtype(np.float32).itemsize
@@ -491,9 +491,9 @@ def executeHalo(self, kernel):
         waveout = np.array([int(self.waveOut)], dtype=np.uint64)
 
         vel = np.array([int(self.velocity)], dtype=np.uint64)
-        stride = self.params.nx * self.params.ny
-        np_nz = np.array(self.params.FD_ORDER, dtype=np.uint32)
-        np_nx = np.array(self.params.nx, dtype=np.uint32)
+        stride = self.Params.nx * self.Params.ny
+        np_nz = np.array(self.Params.FD_ORDER, dtype=np.uint32)
+        np_nx = np.array(self.Params.nx, dtype=np.uint32)
         np_stride = np.array(stride, dtype=np.uint32)
 
         args = [
@@ -507,14 +507,14 @@ def executeHalo(self, kernel):
         argsp = np.array([arg.ctypes.data for arg in args], dtype=np.uint64)
 
         # do halo up
-        checkCudaErrors(
+        check_cuda_errors(
             cuda.cuLaunchKernel(
                 kernel.fdPropag,
-                self.params.blkx,
-                self.params.blky,
+                self.Params.blkx,
+                self.Params.blky,
                 1,  # grid dim
-                self.params.BDIMX,
-                self.params.BDIMY,
+                self.Params.BDIMX,
+                self.Params.BDIMY,
                 1,  # block dim
                 0,
                 self.streamHalo,  # shared mem and stream
@@ -525,11 +525,11 @@ def executeHalo(self, kernel):
 
         # do halo down
         offset_velocity = (
-            (self.params.nz - 2 * self.params.FD_ORDER) * self.params.nx * self.params.ny
-            + self.params.FD_ORDER * self.params.nx
-            + self.params.FD_ORDER
+            (self.Params.nz - 2 * self.Params.FD_ORDER) * self.Params.nx * self.Params.ny
+            + self.Params.FD_ORDER * self.Params.nx
+            + self.Params.FD_ORDER
         )
-        offset_wave = self.params.lead + offset_velocity
+        offset_wave = self.Params.lead + offset_velocity
 
         offset_wave *= np.dtype(np.float32).itemsize
         offset_velocity *= np.dtype(np.float32).itemsize
@@ -543,14 +543,14 @@ def executeHalo(self, kernel):
             np_stride,
         ]
         argsp = np.array([arg.ctypes.data for arg in args], dtype=np.uint64)
-        checkCudaErrors(
+        check_cuda_errors(
             cuda.cuLaunchKernel(
                 kernel.fdPropag,
-                self.params.blkx,
-                self.params.blky,
+                self.Params.blkx,
+                self.Params.blky,
                 1,  # grid dim
-                self.params.BDIMX,
-                self.params.BDIMY,
+                self.Params.BDIMX,
+                self.Params.BDIMY,
                 1,  # block dim
                 0,
                 self.streamHalo,  # shared mem and stream
@@ -562,79 +562,79 @@ def executeHalo(self, kernel):
     #
     # exchange the halos
     #
-    def exchangeHalo(self, propag):
+    def exchange_halo(self, propag):
         if verbose_prints:
             print("exchange  halos on device ", self.dev, "with dev ", propag.dev)
-        checkCudaErrors(cuda.cuCtxSetCurrent(self.context))
+        check_cuda_errors(cuda.cuCtxSetCurrent(self.context))
 
         #
         # the following variables don't change
         #
-        nstride = self.params.nx * self.params.ny
+        nstride = self.Params.nx * self.Params.ny
 
-        devS = self.context
-        devD = propag.context
+        dev_s = self.context
+        dev_d = propag.context
 
-        n_exch = self.params.FD_ORDER * nstride
+        n_exch = self.Params.FD_ORDER * nstride
         n_exch *= np.dtype(np.float32).itemsize
 
         if self.dev < propag.dev:
             # exchange up
-            offsetS = self.params.lead + (self.params.nz - 2 * self.params.FD_ORDER) * nstride
-            offsetD = propag.params.lead
+            offset_s = self.Params.lead + (self.Params.nz - 2 * self.Params.FD_ORDER) * nstride
+            offset_d = propag.Params.lead
 
-            offsetS *= np.dtype(np.float32).itemsize
-            offsetD *= np.dtype(np.float32).itemsize
+            offset_s *= np.dtype(np.float32).itemsize
+            offset_d *= np.dtype(np.float32).itemsize
 
-            waveD = cuda.CUdeviceptr(int(propag.waveOut) + offsetD)
-            waveS = cuda.CUdeviceptr(int(self.waveOut) + offsetS)
+            wave_d = cuda.CUdeviceptr(int(propag.waveOut) + offset_d)
+            wave_s = cuda.CUdeviceptr(int(self.waveOut) + offset_s)
 
-            checkCudaErrors(cuda.cuMemcpyPeerAsync(waveD, devD, waveS, devS, n_exch, self.streamHalo))
+            check_cuda_errors(cuda.cuMemcpyPeerAsync(wave_d, dev_d, wave_s, dev_s, n_exch, self.streamHalo))
         else:
             # exchange down
-            offsetS = self.params.lead + self.params.FD_ORDER * nstride
-            offsetD = propag.params.lead + (propag.params.nz - propag.params.FD_ORDER) * nstride
+            offset_s = self.Params.lead + self.Params.FD_ORDER * nstride
+            offset_d = propag.Params.lead + (propag.Params.nz - propag.Params.FD_ORDER) * nstride
 
-            offsetS *= np.dtype(np.float32).itemsize
-            offsetD *= np.dtype(np.float32).itemsize
+            offset_s *= np.dtype(np.float32).itemsize
+            offset_d *= np.dtype(np.float32).itemsize
 
-            waveD = cuda.CUdeviceptr(int(propag.waveOut) + offsetD)
-            waveS = cuda.CUdeviceptr(int(self.waveOut) + offsetS)
+            wave_d = cuda.CUdeviceptr(int(propag.waveOut) + offset_d)
+            wave_s = cuda.CUdeviceptr(int(self.waveOut) + offset_s)
 
-            checkCudaErrors(cuda.cuMemcpyPeerAsync(waveD, devD, waveS, devS, n_exch, self.streamHalo))
+            check_cuda_errors(cuda.cuMemcpyPeerAsync(wave_d, dev_d, wave_s, dev_s, n_exch, self.streamHalo))
 
     #
     # sync stream
     #
-    def syncStream(self, stream):
-        checkCudaErrors(cuda.cuCtxSetCurrent(self.context))
-        checkCudaErrors(cuda.cuStreamSynchronize(stream))
+    def sync_stream(self, stream):
+        check_cuda_errors(cuda.cuCtxSetCurrent(self.context))
+        check_cuda_errors(cuda.cuStreamSynchronize(stream))
 
 
 def main():
-    checkCudaErrors(cuda.cuInit(0))
+    check_cuda_errors(cuda.cuInit(0))
 
     # Number of GPUs
     print("Checking for multiple GPUs...")
-    gpu_n = checkCudaErrors(cuda.cuDeviceGetCount())
+    gpu_n = check_cuda_errors(cuda.cuDeviceGetCount())
     print(f"CUDA-capable device count: {gpu_n}")
 
     if gpu_n < 2:
         print("Two or more GPUs with Peer-to-Peer access capability are required")
         return
 
-    prop = [checkCudaErrors(cudart.cudaGetDeviceProperties(i)) for i in range(gpu_n)]
+    prop = [check_cuda_errors(cudart.cudaGetDeviceProperties(i)) for i in range(gpu_n)]
     # Check possibility for peer access
     print("\nChecking GPU(s) for support of peer to peer memory access...")
 
-    p2pCapableGPUs = [-1, -1]
+    p2p_capable_gp_us = [-1, -1]
     for i in range(gpu_n):
-        p2pCapableGPUs[0] = i
+        p2p_capable_gp_us[0] = i
         for j in range(gpu_n):
             if i == j:
                 continue
-            i_access_j = checkCudaErrors(cudart.cudaDeviceCanAccessPeer(i, j))
-            j_access_i = checkCudaErrors(cudart.cudaDeviceCanAccessPeer(j, i))
+            i_access_j = check_cuda_errors(cudart.cudaDeviceCanAccessPeer(i, j))
+            j_access_i = check_cuda_errors(cudart.cudaDeviceCanAccessPeer(j, i))
             print(
                 "> Peer access from {} (GPU{}) -> {} (GPU{}) : {}\n".format(
                     prop[i].name, i, prop[j].name, j, "Yes" if i_access_j else "No"
@@ -646,23 +646,23 @@ def main():
                 )
             )
             if i_access_j and j_access_i:
-                p2pCapableGPUs[1] = j
+                p2p_capable_gp_us[1] = j
                 break
-        if p2pCapableGPUs[1] != -1:
+        if p2p_capable_gp_us[1] != -1:
             break
 
-    if p2pCapableGPUs[0] == -1 or p2pCapableGPUs[1] == -1:
+    if p2p_capable_gp_us[0] == -1 or p2p_capable_gp_us[1] == -1:
         print("Two or more GPUs with Peer-to-Peer access capability are required.")
         print("Peer to Peer access is not available amongst GPUs in the system, waiving test.")
         return
 
     # Use first pair of p2p capable GPUs detected
-    gpuid = [p2pCapableGPUs[0], p2pCapableGPUs[1]]
+    gpuid = [p2p_capable_gp_us[0], p2p_capable_gp_us[1]]
 
     #
     # init device
     #
-    pars = params()
+    pars = Params()
 
     #
     # create propagators
@@ -674,16 +674,16 @@ def main():
     # create kernels and propagators that are going to be used on device
     #
     for i in gpuid:
-        p = propagator(pars, i)
-        k = cudaKernels(p.context)
+        p = Propagator(pars, i)
+        k = CudaKernels(p.context)
         propags.append(p)
         kerns.append(k)
 
     # allocate resources in device
     for propag, kern in zip(propags, kerns):
         propag.allocate()
-        propag.createSource(kern)
-        propag.createVelocity(kern)
+        propag.create_source(kern)
+        propag.create_velocity(kern)
 
     #
     # loop over time iterations
@@ -691,26 +691,26 @@ def main():
     start = time.time()
     for it in range(pars.nt):
         for propag in propags:
-            propag.syncStream(propag.streamHalo)
+            propag.sync_stream(propag.streamHalo)
 
         for propag, kern in zip(propags, kerns):
-            propag.injectSource(kern, it)
+            propag.inject_source(kern, it)
 
         for propag, kern in zip(propags, kerns):
-            propag.executeHalo(kern)
+            propag.execute_halo(kern)
 
         for propag in propags:
-            propag.syncStream(propag.streamHalo)
+            propag.sync_stream(propag.streamHalo)
 
-        propags[1].exchangeHalo(propags[0])
+        propags[1].exchange_halo(propags[0])
 
-        propags[0].exchangeHalo(propags[1])
+        propags[0].exchange_halo(propags[1])
 
         for propag, kern in zip(propags, kerns):
-            propag.executeCenter(kern)
+            propag.execute_center(kern)
 
         for propag in propags:
-            propag.syncStream(propag.streamCenter)
+            propag.sync_stream(propag.streamCenter)
 
         for propag in propags:
             propag.swap()
@@ -727,19 +727,19 @@ def main():
     #
     nz = 2 * (int)(pars.nz - 2 * pars.FD_ORDER)
     print(" nz= ", nz, " nx= ", pars.nx)
-    hOut = np.zeros((nz, pars.nx), dtype="float32")
+    h_out = np.zeros((nz, pars.nx), dtype="float32")
 
     istart = 0
     for propag in propags:
-        checkCudaErrors(cuda.cuCtxSetCurrent(propag.context))
+        check_cuda_errors(cuda.cuCtxSetCurrent(propag.context))
         offset = pars.lead + pars.FD_ORDER * pars.nx * pars.ny + (int)(pars.ny / 2) * pars.nx
 
         for j in range(pars.nz - 2 * pars.FD_ORDER):
             ptr = cuda.CUdeviceptr(int(propag.waveOut) + offset * 4)
 
-            checkCudaErrors(
+            check_cuda_errors(
                 cuda.cuMemcpyDtoH(
-                    hOut[istart].ctypes.data,
+                    h_out[istart].ctypes.data,
                     ptr,
                     pars.nx * np.dtype(np.float32).itemsize,
                 )
@@ -756,7 +756,7 @@ def main():
     if display_graph:
         nrows = nz
         ncols = pars.nx
-        dbz = hOut
+        dbz = h_out
         dbz = np.reshape(dbz, (nrows, ncols))
 
         ##
diff --git a/cuda_bindings/examples/extra/jit_program_test.py b/cuda_bindings/examples/extra/jit_program_test.py
index be78deafc1..80e7e73376 100644
--- a/cuda_bindings/examples/extra/jit_program_test.py
+++ b/cuda_bindings/examples/extra/jit_program_test.py
@@ -9,7 +9,7 @@
 from cuda.bindings import nvrtc
 
 
-def ASSERT_DRV(err):
+def assert_drv(err):
     if isinstance(err, cuda.CUresult):
         if err != cuda.CUresult.CUDA_SUCCESS:
             raise RuntimeError(f"Cuda Error: {err}")
@@ -35,31 +35,31 @@ def ASSERT_DRV(err):
 def main():
     # Init
     (err,) = cuda.cuInit(0)
-    ASSERT_DRV(err)
+    assert_drv(err)
 
     # Device
-    err, cuDevice = cuda.cuDeviceGet(0)
-    ASSERT_DRV(err)
+    err, cu_device = cuda.cuDeviceGet(0)
+    assert_drv(err)
 
     # Ctx
-    err, context = cuda.cuCtxCreate(None, 0, cuDevice)
-    ASSERT_DRV(err)
+    err, context = cuda.cuCtxCreate(None, 0, cu_device)
+    assert_drv(err)
 
     # Create program
     err, prog = nvrtc.nvrtcCreateProgram(str.encode(saxpy), b"saxpy.cu", 0, None, None)
-    ASSERT_DRV(err)
+    assert_drv(err)
 
     # Get target architecture
     err, major = cuda.cuDeviceGetAttribute(
-        cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice
+        cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cu_device
     )
-    ASSERT_DRV(err)
+    assert_drv(err)
     err, minor = cuda.cuDeviceGetAttribute(
-        cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice
+        cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cu_device
     )
-    ASSERT_DRV(err)
+    assert_drv(err)
     err, nvrtc_major, nvrtc_minor = nvrtc.nvrtcVersion()
-    ASSERT_DRV(err)
+    assert_drv(err)
     use_cubin = nvrtc_minor >= 1
     prefix = "sm" if use_cubin else "compute"
     arch_arg = bytes(f"--gpu-architecture={prefix}_{major}{minor}", "ascii")
@@ -67,82 +67,80 @@ def main():
     # Compile program
     opts = [b"--fmad=false", arch_arg]
     (err,) = nvrtc.nvrtcCompileProgram(prog, len(opts), opts)
-    ASSERT_DRV(err)
+    assert_drv(err)
 
     # Get log from compilation
-    err, logSize = nvrtc.nvrtcGetProgramLogSize(prog)
-    ASSERT_DRV(err)
-    log = b" " * logSize
+    err, log_size = nvrtc.nvrtcGetProgramLogSize(prog)
+    assert_drv(err)
+    log = b" " * log_size
     (err,) = nvrtc.nvrtcGetProgramLog(prog, log)
-    ASSERT_DRV(err)
+    assert_drv(err)
     print(log.decode())
 
     # Get data from compilation
     if use_cubin:
-        err, dataSize = nvrtc.nvrtcGetCUBINSize(prog)
-        ASSERT_DRV(err)
-        data = b" " * dataSize
+        err, data_size = nvrtc.nvrtcGetCUBINSize(prog)
+        assert_drv(err)
+        data = b" " * data_size
         (err,) = nvrtc.nvrtcGetCUBIN(prog, data)
-        ASSERT_DRV(err)
+        assert_drv(err)
     else:
-        err, dataSize = nvrtc.nvrtcGetPTXSize(prog)
-        ASSERT_DRV(err)
-        data = b" " * dataSize
+        err, data_size = nvrtc.nvrtcGetPTXSize(prog)
+        assert_drv(err)
+        data = b" " * data_size
         (err,) = nvrtc.nvrtcGetPTX(prog, data)
-        ASSERT_DRV(err)
-    (err,) = nvrtc.nvrtcDestroyProgram(prog)
-    ASSERT_DRV(err)
+        assert_drv(err)
 
     # Load data as module data and retrieve function
     data = np.char.array(data)
     err, module = cuda.cuModuleLoadData(data)
-    ASSERT_DRV(err)
+    assert_drv(err)
     err, kernel = cuda.cuModuleGetFunction(module, b"saxpy")
-    ASSERT_DRV(err)
+    assert_drv(err)
 
     # Test the kernel
-    NUM_THREADS = 128
-    NUM_BLOCKS = 32
+    num_threads = 128
+    num_blocks = 32
 
     a = np.float32(2.0)
-    n = np.array(NUM_THREADS * NUM_BLOCKS, dtype=np.uint32)
-    bufferSize = n * a.itemsize
+    n = np.array(num_threads * num_blocks, dtype=np.uint32)
+    buffer_size = n * a.itemsize
 
-    err, dX = cuda.cuMemAlloc(bufferSize)
-    ASSERT_DRV(err)
-    err, dY = cuda.cuMemAlloc(bufferSize)
-    ASSERT_DRV(err)
-    err, dOut = cuda.cuMemAlloc(bufferSize)
-    ASSERT_DRV(err)
+    err, d_x = cuda.cuMemAlloc(buffer_size)
+    assert_drv(err)
+    err, d_y = cuda.cuMemAlloc(buffer_size)
+    assert_drv(err)
+    err, d_out = cuda.cuMemAlloc(buffer_size)
+    assert_drv(err)
 
-    hX = np.random.rand(n).astype(dtype=np.float32)
-    hY = np.random.rand(n).astype(dtype=np.float32)
-    hOut = np.zeros(n).astype(dtype=np.float32)
+    h_x = np.random.rand(n).astype(dtype=np.float32)
+    h_y = np.random.rand(n).astype(dtype=np.float32)
+    h_out = np.zeros(n).astype(dtype=np.float32)
 
     err, stream = cuda.cuStreamCreate(0)
-    ASSERT_DRV(err)
+    assert_drv(err)
 
-    (err,) = cuda.cuMemcpyHtoDAsync(dX, hX, bufferSize, stream)
-    ASSERT_DRV(err)
-    (err,) = cuda.cuMemcpyHtoDAsync(dY, hY, bufferSize, stream)
-    ASSERT_DRV(err)
+    (err,) = cuda.cuMemcpyHtoDAsync(d_x, h_x, buffer_size, stream)
+    assert_drv(err)
+    (err,) = cuda.cuMemcpyHtoDAsync(d_y, h_y, buffer_size, stream)
+    assert_drv(err)
 
     (err,) = cuda.cuStreamSynchronize(stream)
-    ASSERT_DRV(err)
+    assert_drv(err)
 
     # Assert values are different before running kernel
-    hZ = a * hX + hY
-    if np.allclose(hOut, hZ):
+    h_z = a * h_x + h_y
+    if np.allclose(h_out, h_z):
         raise ValueError("Error inside tolerence for host-device vectors")
 
-    arg_values = (a, dX, dY, dOut, n)
+    arg_values = (a, d_x, d_y, d_out, n)
     arg_types = (ctypes.c_float, None, None, None, ctypes.c_size_t)
     (err,) = cuda.cuLaunchKernel(
         kernel,
-        NUM_BLOCKS,
+        num_blocks,
         1,
         1,  # grid dim
-        NUM_THREADS,
+        num_threads,
         1,
         1,  # block dim
         0,
@@ -150,32 +148,32 @@ def main():
         (arg_values, arg_types),
         0,
     )  # arguments
-    ASSERT_DRV(err)
+    assert_drv(err)
 
-    (err,) = cuda.cuMemcpyDtoHAsync(hOut, dOut, bufferSize, stream)
-    ASSERT_DRV(err)
+    (err,) = cuda.cuMemcpyDtoHAsync(h_out, d_out, buffer_size, stream)
+    assert_drv(err)
     (err,) = cuda.cuStreamSynchronize(stream)
-    ASSERT_DRV(err)
+    assert_drv(err)
 
     # Assert values are same after running kernel
-    hZ = a * hX + hY
-    if not np.allclose(hOut, hZ):
+    h_z = a * h_x + h_y
+    if not np.allclose(h_out, h_z):
         raise ValueError("Error outside tolerence for host-device vectors")
 
     (err,) = cuda.cuStreamDestroy(stream)
-    ASSERT_DRV(err)
+    assert_drv(err)
 
-    (err,) = cuda.cuMemFree(dX)
-    ASSERT_DRV(err)
-    (err,) = cuda.cuMemFree(dY)
-    ASSERT_DRV(err)
-    (err,) = cuda.cuMemFree(dOut)
-    ASSERT_DRV(err)
+    (err,) = cuda.cuMemFree(d_x)
+    assert_drv(err)
+    (err,) = cuda.cuMemFree(d_y)
+    assert_drv(err)
+    (err,) = cuda.cuMemFree(d_out)
+    assert_drv(err)
 
     (err,) = cuda.cuModuleUnload(module)
-    ASSERT_DRV(err)
+    assert_drv(err)
     (err,) = cuda.cuCtxDestroy(context)
-    ASSERT_DRV(err)
+    assert_drv(err)
 
 
 if __name__ == "__main__":
diff --git a/cuda_bindings/tests/nvml/test_device.py b/cuda_bindings/tests/nvml/test_device.py
index f273a2a88e..7344a93efe 100644
--- a/cuda_bindings/tests/nvml/test_device.py
+++ b/cuda_bindings/tests/nvml/test_device.py
@@ -72,7 +72,7 @@ def test_get_nv_link_supported_bw_modes(all_devices):
     for device in all_devices:
         with unsupported_before(device, None):
             modes = nvml.device_get_nvlink_supported_bw_modes(device)
-        assert isinstance(modes, nvml.NvLinkSupportedBWModes_v1)
+        assert isinstance(modes, nvml.NvlinkSupportedBWModes_v1)
         # #define NVML_NVLINK_TOTAL_SUPPORTED_BW_MODES 23
         assert len(modes.bw_modes) <= 23
         assert not hasattr(modes, "total_bw_modes")
@@ -132,16 +132,6 @@ def test_read_write_prm(all_devices):
         assert isinstance(result[1], bytes)
 
 
-def test_nvlink_low_power_threshold(all_devices):
-    for device in all_devices:
-        # Docs say supported on HOPPER or newer
-        with unsupported_before(device, None):
-            try:
-                nvml.device_set_nvlink_device_low_power_threshold(device, 0)
-            except nvml.NoPermissionError:
-                pytest.skip("No permission to set NVLink low power threshold")
-
-
 def test_get_power_management_limit(all_devices):
     for device in all_devices:
         # Docs say supported on KEPLER or later
diff --git a/cuda_core/AGENTS.md b/cuda_core/AGENTS.md
new file mode 100644
index 0000000000..357e228360
--- /dev/null
+++ b/cuda_core/AGENTS.md
@@ -0,0 +1,65 @@
+This file describes `cuda_core`, the high-level Pythonic CUDA subpackage in the
+`cuda-python` monorepo.
+
+## Scope and principles
+
+- **Role**: provide higher-level CUDA abstractions (`Device`, `Stream`,
+  `Program`, `Linker`, memory resources, graphs) on top of `cuda.bindings`.
+- **API intent**: keep interfaces Pythonic while preserving explicit CUDA
+  behavior and error visibility.
+- **Compatibility**: changes should remain compatible with supported
+  `cuda.bindings` major versions (12.x and 13.x).
+
+## Package architecture
+
+- **Main package**: `cuda/core/` contains most Cython modules (`*.pyx`, `*.pxd`)
+  implementing runtime behaviors and public objects.
+- **Subsystems**:
+  - memory/resource stack: `cuda/core/_memory/`
+  - system-level APIs: `cuda/core/system/`
+  - compile/link path: `_program.pyx`, `_linker.pyx`, `_module.pyx`
+  - execution path: `_launcher.pyx`, `_launch_config.pyx`, `_stream.pyx`
+- **C++ helpers**: module-specific C++ implementations live under
+  `cuda/core/_cpp/`.
+- **Build backend**: `build_hooks.py` handles Cython extension setup and build
+  dependency wiring.
+
+## Build and version coupling
+
+- `build_hooks.py` determines the CUDA major version from `CUDA_CORE_BUILD_MAJOR`
+  or from the CUDA headers (`CUDA_HOME`/`CUDA_PATH`) and uses it for build decisions.
+- Source builds require CUDA headers available through `CUDA_HOME` or
+  `CUDA_PATH`.
+- `cuda_core` expects `cuda.bindings` to be present and version-compatible.
+
+## Testing expectations
+
+- **Primary tests**: `pytest tests/`
+- **Cython tests**:
+  - build: `tests/cython/build_tests.sh` (or platform equivalent)
+  - run: `pytest tests/cython/`
+- **Examples**: validate affected examples in `examples/` when changing user
+  workflows or public APIs.
+- **Orchestrated run**: from repo root, `scripts/run_tests.sh core`.
+
+## Runtime/build environment notes
+
+- Commonly relevant runtime environment variables:
+  - `CUDA_PYTHON_CUDA_PER_THREAD_DEFAULT_STREAM`
+  - `CUDA_PYTHON_DISABLE_MAJOR_VERSION_WARNING`
+- Commonly relevant build environment variables:
+  - `CUDA_HOME` / `CUDA_PATH`
+  - `CUDA_CORE_BUILD_MAJOR`
+  - `CUDA_PYTHON_PARALLEL_LEVEL`
+  - `CUDA_PYTHON_COVERAGE`
+
+## Editing guidance
+
+- Keep user-facing behaviors coherent with docs and examples, especially around
+  stream semantics, memory ownership, and compile/link flows.
+- Reuse existing shared utilities in `cuda/core/_utils/` before adding new
+  helpers.
+- When changing Cython signatures or cimports, verify related `.pxd` and
+  call-site consistency.
+- Prefer explicit error propagation over silent fallback paths.
+- If you change public behavior, update tests and docs under `docs/source/`.
diff --git a/cuda_core/CLAUDE.md b/cuda_core/CLAUDE.md
new file mode 120000
index 0000000000..47dc3e3d86
--- /dev/null
+++ b/cuda_core/CLAUDE.md
@@ -0,0 +1 @@
+AGENTS.md
\ No newline at end of file
diff --git a/cuda_core/cuda/core/_linker.pyx b/cuda_core/cuda/core/_linker.pyx
index 4dbb9950de..3fd4255ba7 100644
--- a/cuda_core/cuda/core/_linker.pyx
+++ b/cuda_core/cuda/core/_linker.pyx
@@ -29,6 +29,7 @@ from dataclasses import dataclass
 from typing import Union
 from warnings import warn
 
+from cuda.pathfinder import optional_cuda_import
 from cuda.core._device import Device
 from cuda.core._module import ObjectCode
 from cuda.core._utils.clear_error_support import assert_type
@@ -649,23 +650,20 @@ def _decide_nvjitlink_or_driver() -> bool:
         " For best results, consider upgrading to a recent version of"
     )
 
-    try:
-        __import__("cuda.bindings.nvjitlink")  # availability check
-    except ModuleNotFoundError:
+    nvjitlink_module = optional_cuda_import(
+        "cuda.bindings.nvjitlink",
+        probe_function=lambda module: module.version(),  # probe triggers nvJitLink runtime load
+    )
+    if nvjitlink_module is None:
         warn_txt = f"cuda.bindings.nvjitlink is not available, therefore {warn_txt_common} cuda-bindings."
     else:
         from cuda.bindings._internal import nvjitlink
 
-        try:
-            if _nvjitlink_has_version_symbol(nvjitlink):
-                _use_nvjitlink_backend = True
-                return False  # Use nvjitlink
-        except RuntimeError:
-            warn_detail = "not available"
-        else:
-            warn_detail = "too old (<12.3)"
+        if _nvjitlink_has_version_symbol(nvjitlink):
+            _use_nvjitlink_backend = True
+            return False  # Use nvjitlink
         warn_txt = (
-            f"{'nvJitLink*.dll' if sys.platform == 'win32' else 'libnvJitLink.so*'} is {warn_detail}."
+            f"{'nvJitLink*.dll' if sys.platform == 'win32' else 'libnvJitLink.so*'} is too old (<12.3)."
             f" Therefore cuda.bindings.nvjitlink is not usable and {warn_txt_common} nvJitLink."
         )
 
diff --git a/cuda_core/cuda/core/_memory/_device_memory_resource.pyx b/cuda_core/cuda/core/_memory/_device_memory_resource.pyx
index 1299f1bd57..744c58e021 100644
--- a/cuda_core/cuda/core/_memory/_device_memory_resource.pyx
+++ b/cuda_core/cuda/core/_memory/_device_memory_resource.pyx
@@ -130,7 +130,7 @@ cdef class DeviceMemoryResource(_MemPool):
 
     def __cinit__(self, *args, **kwargs):
         self._dev_id = cydriver.CU_DEVICE_INVALID
-        self._peer_accessible_by = ()
+        self._peer_accessible_by = None
 
     def __init__(self, device_id: Device | int, options=None):
         _DMR_init(self, device_id, options)
@@ -190,6 +190,7 @@ cdef class DeviceMemoryResource(_MemPool):
             _ipc.MP_from_allocation_handle(cls, alloc_handle))
         from .._device import Device
         mr._dev_id = Device(device_id).device_id
+        mr._peer_accessible_by = ()
         return mr
 
     def get_allocation_handle(self) -> IPCAllocationHandle:
@@ -224,6 +225,10 @@ cdef class DeviceMemoryResource(_MemPool):
         When setting, accepts a sequence of Device objects or device IDs.
         Setting to an empty sequence revokes all peer access.
 
+        For non-owned pools (the default or current device pool), the state
+        is always queried from the driver to reflect changes made by other
+        wrappers or direct driver calls.
+
         Examples
         --------
         >>> dmr = DeviceMemoryResource(0)
@@ -231,6 +236,8 @@ cdef class DeviceMemoryResource(_MemPool):
         >>> assert dmr.peer_accessible_by == (1,)
         >>> dmr.peer_accessible_by = []  # Revoke access
         """
+        if not self._mempool_owned:
+            _DMR_query_peer_access(self)
         return self._peer_accessible_by
 
     @peer_accessible_by.setter
@@ -248,6 +255,29 @@ cdef class DeviceMemoryResource(_MemPool):
         return False
 
 
+cdef inline _DMR_query_peer_access(DeviceMemoryResource self):
+    """Query the driver for the actual peer access state of this pool and store it in _peer_accessible_by."""
+    cdef int total  # number of devices reported by cuDeviceGetCount
+    cdef cydriver.CUmemAccess_flags flags  # access flags returned for one queried device
+    cdef cydriver.CUmemLocation location  # reused for every query; only .id changes
+    cdef list peers = []  # IDs of devices found to have read/write access
+
+    with nogil:
+        HANDLE_RETURN(cydriver.cuDeviceGetCount(&total))
+
+    location.type = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE
+    for dev_id in range(total):
+        if dev_id == self._dev_id:  # the pool's own device is not listed as a peer
+            continue
+        location.id = dev_id
+        with nogil:
+            HANDLE_RETURN(cydriver.cuMemPoolGetAccess(&flags, as_cu(self._h_pool), &location))
+        if flags == cydriver.CUmemAccess_flags.CU_MEM_ACCESS_FLAGS_PROT_READWRITE:  # only read/write counts
+            peers.append(dev_id)
+
+    self._peer_accessible_by = tuple(sorted(peers))  # replaces the stored tuple with the driver's view
+
+
 cdef inline _DMR_set_peer_accessible_by(DeviceMemoryResource self, devices):
     from .._device import Device
 
@@ -257,6 +287,8 @@ cdef inline _DMR_set_peer_accessible_by(DeviceMemoryResource self, devices):
     cdef list bad = [dev for dev in target_ids if not this_dev.can_access_peer(dev)]
     if bad:
         raise ValueError(f"Device {self._dev_id} cannot access peer(s): {', '.join(map(str, bad))}")
+    if not self._mempool_owned:
+        _DMR_query_peer_access(self)
     cdef set[int] cur_ids = set(self._peer_accessible_by)
     cdef set[int] to_add = target_ids - cur_ids
     cdef set[int] to_rm = cur_ids - target_ids
@@ -314,6 +346,7 @@ cdef inline _DMR_init(DeviceMemoryResource self, device_id, options):
         self._mempool_owned = False
         MP_raise_release_threshold(self)
     else:
+        self._peer_accessible_by = ()
         MP_init_create_pool(
             self,
             cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE,
diff --git a/cuda_core/cuda/core/_memory/_ipc.pyx b/cuda_core/cuda/core/_memory/_ipc.pyx
index 172556373e..e1174937a2 100644
--- a/cuda_core/cuda/core/_memory/_ipc.pyx
+++ b/cuda_core/cuda/core/_memory/_ipc.pyx
@@ -197,6 +197,10 @@ cdef _MemPool MP_from_allocation_handle(cls, alloc_handle):
     uuid = getattr(alloc_handle, 'uuid', None)  # no-cython-lint
     mr = registry.get(uuid)
     if mr is not None:
+        if not isinstance(mr, cls):
+            raise TypeError(
+                f"Registry contains a {type(mr).__name__} for uuid "
+                f"{uuid}, but {cls.__name__} was requested")
         return mr
 
     # Ensure we have an allocation handle. Duplicate the file descriptor, if
diff --git a/cuda_core/cuda/core/_memoryview.pyx b/cuda_core/cuda/core/_memoryview.pyx
index 94824e1c4d..0e1df726c0 100644
--- a/cuda_core/cuda/core/_memoryview.pyx
+++ b/cuda_core/cuda/core/_memoryview.pyx
@@ -1096,6 +1096,8 @@ cpdef StridedMemoryView view_as_cai(obj, stream_ptr, view=None):
     buf.exporting_obj = obj
     buf.metadata = cai_data
     buf.dl_tensor = NULL
+    # Validate shape/strides/typestr eagerly so constructor paths fail fast.
+    buf.get_layout()
     buf.ptr, buf.readonly = cai_data["data"]
     buf.is_device_accessible = True
     if buf.ptr != 0:
@@ -1138,6 +1140,8 @@ cpdef StridedMemoryView view_as_array_interface(obj, view=None):
     buf.exporting_obj = obj
     buf.metadata = data
     buf.dl_tensor = NULL
+    # Validate shape/strides/typestr eagerly so constructor paths fail fast.
+    buf.get_layout()
     buf.ptr, buf.readonly = data["data"]
     buf.is_device_accessible = False
     buf.device_id = handle_return(driver.cuCtxGetDevice())
diff --git a/cuda_core/cuda/core/_program.pyx b/cuda_core/cuda/core/_program.pyx
index 0b1fa93279..5e85a111e7 100644
--- a/cuda_core/cuda/core/_program.pyx
+++ b/cuda_core/cuda/core/_program.pyx
@@ -14,6 +14,7 @@ import threading
 from warnings import warn
 
 from cuda.bindings import driver, nvrtc
+from cuda.pathfinder import optional_cuda_import
 
 from libcpp.vector cimport vector
 
@@ -461,8 +462,8 @@ class ProgramOptions:
 # =============================================================================
 
 # Module-level state for NVVM lazy loading
-cdef object_nvvm_module = None
-cdef bint _nvvm_import_attempted = False
+_nvvm_module = None
+_nvvm_import_attempted = False
 
 
 def _get_nvvm_module():
@@ -484,18 +485,21 @@ def _get_nvvm_module():
                 "Please update cuda-bindings to use NVVM features."
             )
 
-        from cuda.bindings import nvvm
-        from cuda.bindings._internal.nvvm import _inspect_function_pointer
-
-        if _inspect_function_pointer("__nvvmCreateProgram") == 0:
-            raise RuntimeError("NVVM library (libnvvm) is not available in this Python environment. ")
+        nvvm = optional_cuda_import(
+            "cuda.bindings.nvvm",
+            probe_function=lambda module: module.version(),  # probe triggers libnvvm load
+        )
+        if nvvm is None:
+            raise RuntimeError(
+                "NVVM support is unavailable: cuda.bindings.nvvm is missing or libnvvm cannot be loaded."
+            )
 
         _nvvm_module = nvvm
         return _nvvm_module
 
-    except RuntimeError as e:
+    except RuntimeError:
         _nvvm_module = None
-        raise e
+        raise
 
 def _find_libdevice_path():
     """Find libdevice*.bc for NVVM compilation using cuda.pathfinder."""
diff --git a/cuda_core/docs/source/conf.py b/cuda_core/docs/source/conf.py
index eaf18be925..c83cecb980 100644
--- a/cuda_core/docs/source/conf.py
+++ b/cuda_core/docs/source/conf.py
@@ -41,6 +41,7 @@
     "sphinx_copybutton",
     "sphinx_toolbox.more_autodoc.autoprotocol",
     "release_toc",
+    "release_date",
 ]
 
 # Add any paths that contain templates here, relative to this directory.
diff --git a/cuda_core/docs/source/release/0.7.x-notes.rst b/cuda_core/docs/source/release/0.7.x-notes.rst
index 1a1810219f..98551603b6 100644
--- a/cuda_core/docs/source/release/0.7.x-notes.rst
+++ b/cuda_core/docs/source/release/0.7.x-notes.rst
@@ -57,7 +57,13 @@ Fixes and enhancements
   instead of the NUMA node closest to the active CUDA device. On multi-NUMA
   systems where the device is attached to a non-zero host NUMA node, this could
   cause pool creation or allocation failures. (:issue:`1603`)
+- Fixed :attr:`DeviceMemoryResource.peer_accessible_by` returning stale results when wrapping
+  a non-owned (default) memory pool. The property now always queries the CUDA driver for
+  non-owned pools, so multiple wrappers around the same pool see consistent state. (:issue:`1720`)
 - Reduced Python overhead in :class:`Program` and :class:`Linker` by moving compilation and
   linking operations to the C level and releasing the GIL during backend calls. This benefits
   workloads that create many programs or linkers, and enables concurrent compilation in
   multithreaded applications.
+- Improved optional dependency handling for NVVM and nvJitLink imports so that only genuinely
+  missing optional modules are treated as unavailable; unrelated import failures now surface
+  normally, and ``cuda.core`` now depends directly on ``cuda-pathfinder``.
diff --git a/cuda_core/examples/cuda_graphs.py b/cuda_core/examples/cuda_graphs.py
index 02d1b59ec1..c6233dd5d9 100644
--- a/cuda_core/examples/cuda_graphs.py
+++ b/cuda_core/examples/cuda_graphs.py
@@ -84,9 +84,9 @@ def main():
         result3 = cp.empty_like(a)
 
         # Prepare launch configuration
-        block_size = 256
-        grid_size = (size + block_size - 1) // block_size
-        config = LaunchConfig(grid=grid_size, block=block_size)
+        block = 256
+        grid = (size + block - 1) // block
+        config = LaunchConfig(grid=grid, block=block)
 
         # Sync before graph capture
         dev.sync()
diff --git a/cuda_core/examples/gl_interop_plasma.py b/cuda_core/examples/gl_interop_plasma.py
index 7b8b43cd8d..46fa59ee3f 100644
--- a/cuda_core/examples/gl_interop_plasma.py
+++ b/cuda_core/examples/gl_interop_plasma.py
@@ -94,8 +94,8 @@ def setup_cuda(kernel_source):
     dev.set_current()
     stream = dev.create_stream()
 
-    opts = ProgramOptions(std="c++11", arch=f"sm_{dev.arch}")
-    prog = Program(kernel_source, code_type="c++", options=opts)
+    program_options = ProgramOptions(std="c++11", arch=f"sm_{dev.arch}")
+    prog = Program(kernel_source, code_type="c++", options=program_options)
     mod = prog.compile("cubin")
     kernel = mod.get_kernel("plasma")
 
diff --git a/cuda_core/examples/pytorch_example.py b/cuda_core/examples/pytorch_example.py
index 3919953eab..4e3bfcceb5 100644
--- a/cuda_core/examples/pytorch_example.py
+++ b/cuda_core/examples/pytorch_example.py
@@ -48,7 +48,7 @@ def __cuda_stream__(self):
         return (0, stream_id)  # Return format required by CUDA Python
 
 
-s = dev.create_stream(PyTorchStreamWrapper(pt_stream))
+stream = dev.create_stream(PyTorchStreamWrapper(pt_stream))
 
 try:
     # prepare program
@@ -61,7 +61,7 @@ def __cuda_stream__(self):
     )
 
     # Run in single precision
-    ker = mod.get_kernel("saxpy_kernel<float>")
+    kernel = mod.get_kernel("saxpy_kernel<float>")
     dtype = torch.float32
 
     # prepare input/output
@@ -76,16 +76,16 @@ def __cuda_stream__(self):
     block = 32
     grid = int((size + block - 1) // block)
     config = LaunchConfig(grid=grid, block=block)
-    ker_args = (a.data_ptr(), x.data_ptr(), y.data_ptr(), out.data_ptr(), size)
+    kernel_args = (a.data_ptr(), x.data_ptr(), y.data_ptr(), out.data_ptr(), size)
 
     # launch kernel on our stream
-    launch(s, config, ker, *ker_args)
+    launch(stream, config, kernel, *kernel_args)
 
     # check result
     assert torch.allclose(out, a.item() * x + y)
 
     # let's repeat again with double precision
-    ker = mod.get_kernel("saxpy_kernel<double>")
+    kernel = mod.get_kernel("saxpy_kernel<double>")
     dtype = torch.float64
 
     # prepare input
@@ -102,12 +102,12 @@ def __cuda_stream__(self):
     block = 64
     grid = int((size + block - 1) // block)
     config = LaunchConfig(grid=grid, block=block)
-    ker_args = (a.data_ptr(), x.data_ptr(), y.data_ptr(), out.data_ptr(), size)
+    kernel_args = (a.data_ptr(), x.data_ptr(), y.data_ptr(), out.data_ptr(), size)
 
     # launch kernel on PyTorch's stream
-    launch(s, config, ker, *ker_args)
+    launch(stream, config, kernel, *kernel_args)
 
     # check result
     assert torch.allclose(out, a * x + y)
 finally:
-    s.close()
+    stream.close()
diff --git a/cuda_core/examples/saxpy.py b/cuda_core/examples/saxpy.py
index d7eb401ac3..548af802be 100644
--- a/cuda_core/examples/saxpy.py
+++ b/cuda_core/examples/saxpy.py
@@ -35,7 +35,7 @@
 
 dev = Device()
 dev.set_current()
-s = dev.create_stream()
+stream = dev.create_stream()
 buf = None
 
 try:
@@ -53,7 +53,7 @@
     )
 
     # run in single precision
-    ker = mod.get_kernel("saxpy<float>")
+    kernel = mod.get_kernel("saxpy<float>")
     dtype = cp.float32
 
     # prepare input/output
@@ -63,24 +63,24 @@
     x = rng.random(size, dtype=dtype)
     y = rng.random(size, dtype=dtype)
     out = cp.empty_like(x)
-    dev.sync()  # cupy runs on a different stream from s, so sync before accessing
+    dev.sync()  # cupy runs on a different stream from our stream, so sync before accessing
 
     # prepare launch
     block = 32
     grid = int((size + block - 1) // block)
     config = LaunchConfig(grid=grid, block=block)
-    ker_args = (a, x.data.ptr, y.data.ptr, out.data.ptr, size)
+    kernel_args = (a, x.data.ptr, y.data.ptr, out.data.ptr, size)
 
-    # launch kernel on stream s
-    launch(s, config, ker, *ker_args)
-    s.sync()
+    # launch kernel on stream
+    launch(stream, config, kernel, *kernel_args)
+    stream.sync()
 
     # check result
     assert cp.allclose(out, a * x + y)
 
     # let's repeat again, this time allocates our own out buffer instead of cupy's
     # run in double precision
-    ker = mod.get_kernel("saxpy<double>")
+    kernel = mod.get_kernel("saxpy<double>")
     dtype = cp.float64
 
     # prepare input
@@ -93,18 +93,18 @@
     # prepare output
     buf = dev.allocate(
         size * 8,  # = dtype.itemsize
-        stream=s,
+        stream=stream,
     )
 
     # prepare launch
     block = 64
     grid = int((size + block - 1) // block)
     config = LaunchConfig(grid=grid, block=block)
-    ker_args = (a, x.data.ptr, y.data.ptr, buf, size)
+    kernel_args = (a, x.data.ptr, y.data.ptr, buf, size)
 
-    # launch kernel on stream s
-    launch(s, config, ker, *ker_args)
-    s.sync()
+    # launch kernel on stream
+    launch(stream, config, kernel, *kernel_args)
+    stream.sync()
 
     # check result
     # we wrap output buffer as a cupy array for simplicity
@@ -115,5 +115,5 @@
 finally:
     # cupy cleans up automatically the rest
     if buf is not None:
-        buf.close(s)
-    s.close()
+        buf.close(stream)
+    stream.close()
diff --git a/cuda_core/examples/simple_multi_gpu_example.py b/cuda_core/examples/simple_multi_gpu_example.py
index 0fbb4466bb..882ce8bbb3 100644
--- a/cuda_core/examples/simple_multi_gpu_example.py
+++ b/cuda_core/examples/simple_multi_gpu_example.py
@@ -13,7 +13,7 @@
 
 import cupy as cp
 
-from cuda.core import Device, LaunchConfig, Program, launch, system
+from cuda.core import Device, LaunchConfig, Program, ProgramOptions, launch, system
 
 if system.get_num_devices() < 2:
     print("this example requires at least 2 GPUs", file=sys.stderr)
@@ -56,9 +56,9 @@ def __cuda_stream__(self):
     }
 }
 """
-    prog_add = Program(code_add, code_type="c++", options={"std": "c++17", "arch": f"sm_{dev0.arch}"})
+    prog_add = Program(code_add, code_type="c++", options=ProgramOptions(std="c++17", arch=f"sm_{dev0.arch}"))
     mod_add = prog_add.compile("cubin")
-    ker_add = mod_add.get_kernel("vector_add")
+    add_kernel = mod_add.get_kernel("vector_add")
 
     # Set GPU 1
     dev1 = Device(1)
@@ -78,9 +78,9 @@ def __cuda_stream__(self):
     }
 }
 """
-    prog_sub = Program(code_sub, code_type="c++", options={"std": "c++17", "arch": f"sm_{dev1.arch}"})
+    prog_sub = Program(code_sub, code_type="c++", options=ProgramOptions(std="c++17", arch=f"sm_{dev1.arch}"))
     mod_sub = prog_sub.compile("cubin")
-    ker_sub = mod_sub.get_kernel("vector_sub")
+    sub_kernel = mod_sub.get_kernel("vector_sub")
 
     # Create launch configs for each kernel that will be executed on the respective
     # CUDA streams.
@@ -103,7 +103,7 @@ def __cuda_stream__(self):
     stream0.wait(cp_stream0)
 
     # Launch the add kernel on GPU 0 / stream 0
-    launch(stream0, config0, ker_add, a.data.ptr, b.data.ptr, c.data.ptr, cp.uint64(size))
+    launch(stream0, config0, add_kernel, a.data.ptr, b.data.ptr, c.data.ptr, cp.uint64(size))
 
     # Allocate memory on GPU 1
     # Note: This runs on CuPy's current stream for GPU 1.
@@ -118,7 +118,7 @@ def __cuda_stream__(self):
     stream1.wait(cp_stream1)
 
     # Launch the subtract kernel on GPU 1 / stream 1
-    launch(stream1, config1, ker_sub, x.data.ptr, y.data.ptr, z.data.ptr, cp.uint64(size))
+    launch(stream1, config1, sub_kernel, x.data.ptr, y.data.ptr, z.data.ptr, cp.uint64(size))
 
     # Synchronize both GPUs are validate the results
     dev0.set_current()
diff --git a/cuda_core/examples/strided_memory_view_gpu.py b/cuda_core/examples/strided_memory_view_gpu.py
index d53c4278b2..9d4e4aacff 100644
--- a/cuda_core/examples/strided_memory_view_gpu.py
+++ b/cuda_core/examples/strided_memory_view_gpu.py
@@ -57,7 +57,7 @@
 # We assume the 0-th argument supports either DLPack or CUDA Array Interface (both
 # of which are supported by StridedMemoryView).
 @args_viewable_as_strided_memory((0,))
-def my_func(arr, work_stream, gpu_ker):
+def my_func(arr, work_stream, kernel):
     # Create a memory view over arr (assumed to be a 1D array of int32). The stream
     # ordering is taken care of, so that arr can be safely accessed on our work
     # stream (ordered after a data stream on which arr is potentially prepared).
@@ -73,7 +73,7 @@ def my_func(arr, work_stream, gpu_ker):
     block = 256
     grid = (size + block - 1) // block
     config = LaunchConfig(grid=grid, block=block)
-    launch(work_stream, config, gpu_ker, view.ptr, np.uint64(size))
+    launch(work_stream, config, kernel, view.ptr, np.uint64(size))
     # Here we're being conservative and synchronize over our work stream,
     # assuming we do not know the data stream; if we know then we could
     # just order the data stream after the work stream here, e.g.
@@ -101,24 +101,24 @@ def run():
     # To know the GPU's compute capability, we need to identify which GPU to use.
     dev = Device(0)
     dev.set_current()
-    gpu_prog = Program(gpu_code, code_type="c++", options=ProgramOptions(arch=f"sm_{dev.arch}", std="c++11"))
-    mod = gpu_prog.compile(target_type="cubin")
-    gpu_ker = mod.get_kernel(func_name)
+    prog = Program(gpu_code, code_type="c++", options=ProgramOptions(arch=f"sm_{dev.arch}", std="c++11"))
+    mod = prog.compile(target_type="cubin")
+    kernel = mod.get_kernel(func_name)
 
-    s = dev.create_stream()
+    stream = dev.create_stream()
     try:
         # Create input array on GPU
         arr_gpu = cp.ones(1024, dtype=cp.int32)
         print(f"before: {arr_gpu[:10]=}")
 
         # Run the workload
-        my_func(arr_gpu, s, gpu_ker)
+        my_func(arr_gpu, stream, kernel)
 
         # Check the result
         print(f"after: {arr_gpu[:10]=}")
         assert cp.allclose(arr_gpu, 1 + cp.arange(1024, dtype=cp.int32))
     finally:
-        s.close()
+        stream.close()
 
 
 if __name__ == "__main__":
diff --git a/cuda_core/examples/thread_block_cluster.py b/cuda_core/examples/thread_block_cluster.py
index 5e36270eab..a5f50d4189 100644
--- a/cuda_core/examples/thread_block_cluster.py
+++ b/cuda_core/examples/thread_block_cluster.py
@@ -94,7 +94,7 @@
     options=ProgramOptions(arch=f"sm_{arch}", std="c++17", include_path=include_path),
 )
 mod = prog.compile(target_type="cubin")
-ker = mod.get_kernel("check_cluster_info")
+kernel = mod.get_kernel("check_cluster_info")
 
 # prepare launch config
 grid = 4
@@ -126,7 +126,7 @@
     block_dims[:] = 0
 
     # launch kernel on the default stream
-    launch(dev.default_stream, config, ker, grid_buffer, cluster_buffer, block_buffer)
+    launch(dev.default_stream, config, kernel, grid_buffer, cluster_buffer, block_buffer)
     dev.sync()
 
     # verify results
diff --git a/cuda_core/examples/vector_add.py b/cuda_core/examples/vector_add.py
index 4c645fc7dd..e648a3846f 100644
--- a/cuda_core/examples/vector_add.py
+++ b/cuda_core/examples/vector_add.py
@@ -30,7 +30,7 @@
 
 dev = Device()
 dev.set_current()
-s = dev.create_stream()
+stream = dev.create_stream()
 
 try:
     # prepare program
@@ -39,7 +39,7 @@
     mod = prog.compile("cubin", name_expressions=("vector_add<float>",))
 
     # run in single precision
-    ker = mod.get_kernel("vector_add<float>")
+    kernel = mod.get_kernel("vector_add<float>")
     dtype = cp.float32
 
     # prepare input/output
@@ -49,7 +49,7 @@
     b = rng.random(size, dtype=dtype)
     c = cp.empty_like(a)
 
-    # cupy runs on a different stream from s, so sync before accessing
+    # cupy runs on a different stream from our stream, so sync before accessing
     dev.sync()
 
     # prepare launch
@@ -57,11 +57,11 @@
     grid = (size + block - 1) // block
     config = LaunchConfig(grid=grid, block=block)
 
-    # launch kernel on stream s
-    launch(s, config, ker, a.data.ptr, b.data.ptr, c.data.ptr, cp.uint64(size))
-    s.sync()
+    # launch kernel on stream
+    launch(stream, config, kernel, a.data.ptr, b.data.ptr, c.data.ptr, cp.uint64(size))
+    stream.sync()
 
     # check result
     assert cp.allclose(c, a + b)
 finally:
-    s.close()
+    stream.close()
diff --git a/cuda_core/pixi.lock b/cuda_core/pixi.lock
index 2536544389..15736cbe8b 100644
--- a/cuda_core/pixi.lock
+++ b/cuda_core/pixi.lock
@@ -32,7 +32,6 @@ environments:
       - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-nvvm-dev_linux-64-12.9.86-ha770c72_2.conda
       - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvvm-impl-12.9.86-h4bc722e_2.conda
       - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvvm-tools-12.9.86-h4bc722e_2.conda
-      - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-pathfinder-1.3.3-pyhcf101f3_0.conda
       - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-profiler-api-12.9.79-h7938cbb_1.conda
       - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-version-12.9-h4f385c5_3.conda
       - conda: https://conda.anaconda.org/conda-forge/linux-64/cython-3.2.2-py314h1807b08_0.conda
@@ -214,6 +213,7 @@ environments:
         build: py314h59f3c06_0
       - conda: ../cuda_bindings
         build: py314h59f3c06_0
+      - conda: ../cuda_pathfinder
       linux-aarch64:
       - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/_openmp_mutex-4.5-2_gnu.tar.bz2
       - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/alsa-lib-1.2.15.3-he30d5cf_0.conda
@@ -240,7 +240,6 @@ environments:
       - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-nvvm-dev_linux-aarch64-12.9.86-h579c4fd_2.conda
       - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/cuda-nvvm-impl-12.9.86-h7b14b0b_2.conda
       - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/cuda-nvvm-tools-12.9.86-h7b14b0b_2.conda
-      - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-pathfinder-1.3.3-pyhcf101f3_0.conda
       - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/cuda-profiler-api-12.9.79-h16bee8c_1.conda
       - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-version-12.9-h4f385c5_3.conda
       - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/cython-3.2.2-py314h4c416a3_0.conda
@@ -412,6 +411,7 @@ environments:
         build: py314ha479ada_0
       - conda: ../cuda_bindings
         build: py314ha479ada_0
+      - conda: ../cuda_pathfinder
       win-64:
       - conda: https://conda.anaconda.org/conda-forge/win-64/_openmp_mutex-4.5-2_gnu.conda
       - conda: https://conda.anaconda.org/conda-forge/win-64/aom-3.9.1-he0c23c2_0.conda
@@ -435,7 +435,6 @@ environments:
       - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-nvvm-dev_win-64-12.9.86-h57928b3_2.conda
       - conda: https://conda.anaconda.org/conda-forge/win-64/cuda-nvvm-impl-12.9.86-h2466b09_2.conda
       - conda: https://conda.anaconda.org/conda-forge/win-64/cuda-nvvm-tools-12.9.86-h2466b09_2.conda
-      - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-pathfinder-1.3.3-pyhcf101f3_0.conda
       - conda: https://conda.anaconda.org/conda-forge/win-64/cuda-profiler-api-12.9.79-h57928b3_1.conda
       - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-version-12.9-h4f385c5_3.conda
       - conda: https://conda.anaconda.org/conda-forge/win-64/cython-3.2.2-py314h344ed54_0.conda
@@ -556,6 +555,7 @@ environments:
         build: py314hae7e39d_0
       - conda: ../cuda_bindings
         build: py314hae7e39d_0
+      - conda: ../cuda_pathfinder
   cu13:
     channels:
     - url: https://conda.anaconda.org/conda-forge/
@@ -588,7 +588,6 @@ environments:
       - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-nvvm-dev_linux-64-13.1.115-ha770c72_0.conda
       - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvvm-impl-13.1.115-h4bc722e_0.conda
       - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvvm-tools-13.1.115-h4bc722e_0.conda
-      - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-pathfinder-1.3.3-pyhcf101f3_0.conda
       - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-profiler-api-13.1.80-h7938cbb_0.conda
       - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-version-13.1-h2ff5cdb_3.conda
       - conda: https://conda.anaconda.org/conda-forge/linux-64/cython-3.2.2-py314h1807b08_0.conda
@@ -770,6 +769,7 @@ environments:
         build: py314h59f3c06_0
       - conda: ../cuda_bindings
         build: py314h59f3c06_0
+      - conda: ../cuda_pathfinder
       linux-aarch64:
       - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/_openmp_mutex-4.5-2_gnu.tar.bz2
       - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/alsa-lib-1.2.15.3-he30d5cf_0.conda
@@ -795,7 +795,6 @@ environments:
       - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-nvvm-dev_linux-aarch64-13.1.115-h579c4fd_0.conda
       - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/cuda-nvvm-impl-13.1.115-h7b14b0b_0.conda
       - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/cuda-nvvm-tools-13.1.115-h7b14b0b_0.conda
-      - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-pathfinder-1.3.3-pyhcf101f3_0.conda
       - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/cuda-profiler-api-13.1.80-h16bee8c_0.conda
       - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-version-13.1-h2ff5cdb_3.conda
       - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/cython-3.2.2-py314h4c416a3_0.conda
@@ -967,6 +966,7 @@ environments:
         build: py314ha479ada_0
       - conda: ../cuda_bindings
         build: py314ha479ada_0
+      - conda: ../cuda_pathfinder
       win-64:
       - conda: https://conda.anaconda.org/conda-forge/win-64/_openmp_mutex-4.5-2_gnu.conda
       - conda: https://conda.anaconda.org/conda-forge/win-64/aom-3.9.1-he0c23c2_0.conda
@@ -990,7 +990,6 @@ environments:
       - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-nvvm-dev_win-64-13.1.115-h57928b3_0.conda
       - conda: https://conda.anaconda.org/conda-forge/win-64/cuda-nvvm-impl-13.1.115-h2466b09_0.conda
       - conda: https://conda.anaconda.org/conda-forge/win-64/cuda-nvvm-tools-13.1.115-h2466b09_0.conda
-      - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-pathfinder-1.3.3-pyhcf101f3_0.conda
       - conda: https://conda.anaconda.org/conda-forge/win-64/cuda-profiler-api-13.1.80-h57928b3_0.conda
       - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-version-13.1-h2ff5cdb_3.conda
       - conda: https://conda.anaconda.org/conda-forge/win-64/cython-3.2.2-py314h344ed54_0.conda
@@ -1111,6 +1110,7 @@ environments:
         build: py314hae7e39d_0
       - conda: ../cuda_bindings
         build: py314hae7e39d_0
+      - conda: ../cuda_pathfinder
   default:
     channels:
     - url: https://conda.anaconda.org/conda-forge/
@@ -1143,7 +1143,6 @@ environments:
       - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-nvvm-dev_linux-64-13.1.115-ha770c72_0.conda
       - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvvm-impl-13.1.115-h4bc722e_0.conda
       - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvvm-tools-13.1.115-h4bc722e_0.conda
-      - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-pathfinder-1.3.3-pyhcf101f3_0.conda
       - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-profiler-api-13.1.80-h7938cbb_0.conda
       - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-version-13.1-h2ff5cdb_3.conda
       - conda: https://conda.anaconda.org/conda-forge/linux-64/cython-3.2.2-py314h1807b08_0.conda
@@ -1325,6 +1324,7 @@ environments:
         build: py314h59f3c06_0
       - conda: ../cuda_bindings
         build: py314h59f3c06_0
+      - conda: ../cuda_pathfinder
       linux-aarch64:
       - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/_openmp_mutex-4.5-2_gnu.tar.bz2
       - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/alsa-lib-1.2.15.3-he30d5cf_0.conda
@@ -1350,7 +1350,6 @@ environments:
       - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-nvvm-dev_linux-aarch64-13.1.115-h579c4fd_0.conda
       - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/cuda-nvvm-impl-13.1.115-h7b14b0b_0.conda
       - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/cuda-nvvm-tools-13.1.115-h7b14b0b_0.conda
-      - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-pathfinder-1.3.3-pyhcf101f3_0.conda
       - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/cuda-profiler-api-13.1.80-h16bee8c_0.conda
       - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-version-13.1-h2ff5cdb_3.conda
       - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/cython-3.2.2-py314h4c416a3_0.conda
@@ -1522,6 +1521,7 @@ environments:
         build: py314ha479ada_0
       - conda: ../cuda_bindings
         build: py314ha479ada_0
+      - conda: ../cuda_pathfinder
       win-64:
       - conda: https://conda.anaconda.org/conda-forge/win-64/_openmp_mutex-4.5-2_gnu.conda
       - conda: https://conda.anaconda.org/conda-forge/win-64/aom-3.9.1-he0c23c2_0.conda
@@ -1545,7 +1545,6 @@ environments:
       - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-nvvm-dev_win-64-13.1.115-h57928b3_0.conda
       - conda: https://conda.anaconda.org/conda-forge/win-64/cuda-nvvm-impl-13.1.115-h2466b09_0.conda
       - conda: https://conda.anaconda.org/conda-forge/win-64/cuda-nvvm-tools-13.1.115-h2466b09_0.conda
-      - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-pathfinder-1.3.3-pyhcf101f3_0.conda
       - conda: https://conda.anaconda.org/conda-forge/win-64/cuda-profiler-api-13.1.80-h57928b3_0.conda
       - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-version-13.1-h2ff5cdb_3.conda
       - conda: https://conda.anaconda.org/conda-forge/win-64/cython-3.2.2-py314h344ed54_0.conda
@@ -1666,6 +1665,7 @@ environments:
         build: py314hae7e39d_0
       - conda: ../cuda_bindings
         build: py314hae7e39d_0
+      - conda: ../cuda_pathfinder
   examples:
     channels:
     - url: https://conda.anaconda.org/conda-forge/
@@ -1699,7 +1699,6 @@ environments:
       - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-nvvm-dev_linux-64-13.1.115-ha770c72_0.conda
       - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvvm-impl-13.1.115-h4bc722e_0.conda
       - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvvm-tools-13.1.115-h4bc722e_0.conda
-      - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-pathfinder-1.4.0-pyhc364b38_0.conda
       - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-version-13.1-h2ff5cdb_3.conda
       - conda: https://conda.anaconda.org/conda-forge/linux-64/cupy-14.0.1-py314h31ce861_0.conda
       - conda: https://conda.anaconda.org/conda-forge/linux-64/cupy-core-14.0.1-py314hed3c566_0.conda
@@ -1901,6 +1900,7 @@ environments:
         build: py314h59f3c06_0
       - conda: ../cuda_bindings
         build: py314h59f3c06_0
+      - conda: ../cuda_pathfinder
       linux-aarch64:
       - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/_openmp_mutex-4.5-7_kmp_llvm.conda
       - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/alsa-lib-1.2.15.3-he30d5cf_0.conda
@@ -1929,7 +1929,6 @@ environments:
       - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-nvvm-dev_linux-aarch64-13.1.115-h579c4fd_0.conda
       - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/cuda-nvvm-impl-13.1.115-h7b14b0b_0.conda
       - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/cuda-nvvm-tools-13.1.115-h7b14b0b_0.conda
-      - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-pathfinder-1.4.0-pyhc364b38_0.conda
       - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-version-13.1-h2ff5cdb_3.conda
       - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/cupy-14.0.1-py314h8e5308c_0.conda
       - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/cupy-core-14.0.1-py314h1d6db3a_0.conda
@@ -2124,6 +2123,7 @@ environments:
         build: py314ha479ada_0
       - conda: ../cuda_bindings
         build: py314ha479ada_0
+      - conda: ../cuda_pathfinder
       win-64:
       - conda: https://conda.anaconda.org/conda-forge/win-64/aom-3.9.1-he0c23c2_0.conda
       - conda: https://conda.anaconda.org/conda-forge/win-64/bzip2-1.0.8-h0ad9c76_9.conda
@@ -2135,7 +2135,6 @@ environments:
       - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-nvvm-dev_win-64-13.1.115-h57928b3_0.conda
       - conda: https://conda.anaconda.org/conda-forge/win-64/cuda-nvvm-impl-13.1.115-h2466b09_0.conda
       - conda: https://conda.anaconda.org/conda-forge/win-64/cuda-nvvm-tools-13.1.115-h2466b09_0.conda
-      - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-pathfinder-1.4.0-pyhc364b38_0.conda
       - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-version-13.1-h2ff5cdb_3.conda
       - conda: https://conda.anaconda.org/conda-forge/win-64/dav1d-1.2.1-hcfcfb64_0.conda
       - conda: https://conda.anaconda.org/conda-forge/win-64/ffmpeg-8.0.1-gpl_hb2d76f6_912.conda
@@ -2223,6 +2222,7 @@ environments:
         build: py314hae7e39d_0
       - conda: ../cuda_bindings
         build: py314hae7e39d_0
+      - conda: ../cuda_pathfinder
 packages:
 - conda: https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2
   sha256: fe51de6107f9edc7aa4f786a70f4a883943bc9d39b3bb7307c04c41410990726
@@ -2823,6 +2823,7 @@ packages:
   - python
   - numpy
   - cuda-bindings
+  - cuda-pathfinder
   - libgcc >=15
   - libgcc >=15
   - libstdcxx >=15
@@ -2833,6 +2834,8 @@ packages:
   sources:
     cuda-bindings:
       path: ../cuda_bindings
+    cuda-pathfinder:
+      path: ../cuda_pathfinder
 - conda: .
   name: cuda-core
   version: 0.5.0
@@ -2845,6 +2848,7 @@ packages:
   - python
   - numpy
   - cuda-bindings
+  - cuda-pathfinder
   - libgcc >=15
   - libgcc >=15
   - libstdcxx >=15
@@ -2855,6 +2859,8 @@ packages:
   sources:
     cuda-bindings:
       path: ../cuda_bindings
+    cuda-pathfinder:
+      path: ../cuda_pathfinder
 - conda: .
   name: cuda-core
   version: 0.5.0
@@ -2869,6 +2875,7 @@ packages:
   - python
   - numpy
   - cuda-bindings
+  - cuda-pathfinder
   - vc >=14.3,<15
   - vc14_runtime >=14.44.35208
   - ucrt >=10.0.20348.0
@@ -2878,6 +2885,8 @@ packages:
   sources:
     cuda-bindings:
       path: ../cuda_bindings
+    cuda-pathfinder:
+      path: ../cuda_pathfinder
 - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-crt-dev_linux-64-12.9.86-ha770c72_2.conda
   sha256: e6257534c4b4b6b8a1192f84191c34906ab9968c92680fa09f639e7846a87304
   md5: 79d280de61e18010df5997daea4743df
@@ -3879,27 +3888,17 @@ packages:
   license: LicenseRef-NVIDIA-End-User-License-Agreement
   size: 41660022
   timestamp: 1768280258661
-- conda: https://conda.anaconda.org/conda-forge/noarch/cuda-pathfinder-1.3.3-pyhcf101f3_0.conda
-  sha256: 6f78993b194403725d4602355a8f1fc57f333eff9c3245a66f33e70c75d67163
-  md5: b08fa4a3478526e33a4c08224398d2e5
-  depends:
-  - python >=3.10
-  - cuda-version >=12.0,<14
-  - python
-  license: Apache-2.0
-  size: 30869
-  timestamp: 1764891530469
-- conda: https://conda.anaconda.org/conda-forge/noarch/cuda-pathfinder-1.4.0-pyhc364b38_0.conda
-  sha256: edf16fdfbcce5bbb445118fd8d070dda8afe36b4b437a94f472fde153bc38151
-  md5: 2d13e524da66b60e6e7d5c6585729ea8
+- conda: ../cuda_pathfinder
+  name: cuda-pathfinder
+  version: 1.3.4a0
+  build: pyh4616a5c_0
+  subdir: noarch
+  variants:
+    target_platform: noarch
   depends:
   - python >=3.10
-  - cuda-version >=12.0,<14
-  - python
+  - python *
   license: Apache-2.0
-  license_family: APACHE
-  size: 39327
-  timestamp: 1772059437166
 - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-profiler-api-12.9.79-h7938cbb_1.conda
   sha256: 4f679dfbf2bf2d17abb507f31b0176c0e3572337b5005b9e36179948a53988ac
   md5: 90d09865fb37d11d510444e34ebe6a09
diff --git a/cuda_core/pixi.toml b/cuda_core/pixi.toml
index cd5b0d6de5..9dc6ac1ed9 100644
--- a/cuda_core/pixi.toml
+++ b/cuda_core/pixi.toml
@@ -148,6 +148,7 @@ numpy = "*"
 # Using path dependency now that we've added .pth support for Cython .pxd files
 # See build_hooks.py:_add_cython_include_paths_to_pth()
 cuda-bindings = { path = "../cuda_bindings" }
+cuda-pathfinder = { path = "../cuda_pathfinder" }
 
 [target.linux.tasks.build-cython-tests]
 cmd = ["$PIXI_PROJECT_ROOT/tests/cython/build_tests.sh"]
diff --git a/cuda_core/pyproject.toml b/cuda_core/pyproject.toml
index a2828a9274..2710818816 100644
--- a/cuda_core/pyproject.toml
+++ b/cuda_core/pyproject.toml
@@ -47,6 +47,8 @@ classifiers = [
     "Environment :: GPU :: NVIDIA CUDA :: 13",
 ]
 dependencies = [
+    # TODO: bump to >=1.4.2 once cuda-pathfinder 1.4.2 is released.
+    "cuda-pathfinder >=1.4.1",
     "numpy",
 ]
 
diff --git a/cuda_core/tests/memory_ipc/test_serialize.py b/cuda_core/tests/memory_ipc/test_serialize.py
index c3be38d078..bd6a880fdc 100644
--- a/cuda_core/tests/memory_ipc/test_serialize.py
+++ b/cuda_core/tests/memory_ipc/test_serialize.py
@@ -8,7 +8,7 @@
 import pytest
 from helpers.buffers import PatternGen
 
-from cuda.core import Buffer, Device, DeviceMemoryResource
+from cuda.core import Buffer, Device, DeviceMemoryResource, PinnedMemoryResource
 
 CHILD_TIMEOUT_SEC = 30
 NBYTES = 64
@@ -159,7 +159,14 @@ def test_main(self, ipc_device, ipc_memory_resource):
     def child_main(self, alloc_handle, mr1, buffer_desc, buffer):
         device = Device()
         device.set_current()
-        mr2 = DeviceMemoryResource.from_allocation_handle(device, alloc_handle)
+        if isinstance(mr1, PinnedMemoryResource):
+            with pytest.raises(TypeError):
+                DeviceMemoryResource.from_allocation_handle(device, alloc_handle)
+            mr2 = PinnedMemoryResource.from_allocation_handle(alloc_handle)
+        else:
+            with pytest.raises(TypeError):
+                PinnedMemoryResource.from_allocation_handle(alloc_handle)
+            mr2 = DeviceMemoryResource.from_allocation_handle(device, alloc_handle)
         pgen = PatternGen(device, NBYTES)
 
         # Verify initial content
diff --git a/cuda_core/tests/test_memory_peer_access.py b/cuda_core/tests/test_memory_peer_access.py
index 7d66ef4763..b7d5747b75 100644
--- a/cuda_core/tests/test_memory_peer_access.py
+++ b/cuda_core/tests/test_memory_peer_access.py
@@ -142,3 +142,32 @@ def verify_state(state, pattern_seed):
         assert dmrs[0].peer_accessible_by == final_state
         verify_state(final_state, pattern_seed)
         pattern_seed += 1
+
+
+def test_peer_access_shared_pool_queries_driver(mempool_device_x2):
+    """Non-owned pools always query the driver for peer access state."""
+    dev0, dev1 = mempool_device_x2
+
+    # Grant peer access via one wrapper; a second wrapper must see it.
+    dmr1 = DeviceMemoryResource(dev0)
+    dmr1.peer_accessible_by = [dev1]
+    dmr2 = DeviceMemoryResource(dev0)
+    assert dev1.device_id in dmr2.peer_accessible_by
+
+    # Revoke via dmr2; dmr1 must reflect the change immediately.
+    dmr2.peer_accessible_by = []
+    assert dmr1.peer_accessible_by == ()
+
+    # Re-grant via dmr1. A fresh wrapper that has never read the
+    # property must still query the driver before computing diffs
+    # in the setter, so setting [] must discover and revoke the access.
+    dmr1.peer_accessible_by = [dev1]
+    dmr3 = DeviceMemoryResource(dev0)
+    assert dmr1.peer_accessible_by == (dev1.device_id,)
+    assert dmr2.peer_accessible_by == (dev1.device_id,)
+    assert dmr3.peer_accessible_by == (dev1.device_id,)
+    dmr3.peer_accessible_by = []
+    assert DeviceMemoryResource(dev0).peer_accessible_by == ()
+    assert dmr1.peer_accessible_by == ()
+    assert dmr2.peer_accessible_by == ()
+    assert dmr3.peer_accessible_by == ()
diff --git a/cuda_core/tests/test_optional_dependency_imports.py b/cuda_core/tests/test_optional_dependency_imports.py
new file mode 100644
index 0000000000..25789f7f59
--- /dev/null
+++ b/cuda_core/tests/test_optional_dependency_imports.py
@@ -0,0 +1,123 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
+
+import types
+
+import pytest
+
+from cuda.core import _linker, _program
+
+
+@pytest.fixture(autouse=True)
+def restore_optional_import_state():
+    saved_nvvm_module = _program._nvvm_module
+    saved_nvvm_attempted = _program._nvvm_import_attempted
+    saved_driver = _linker._driver
+    saved_driver_ver = _linker._driver_ver
+    saved_inited = _linker._inited
+    saved_use_nvjitlink = _linker._use_nvjitlink_backend
+
+    _program._nvvm_module = None
+    _program._nvvm_import_attempted = False
+    _linker._driver = None
+    _linker._driver_ver = None
+    _linker._inited = False
+    _linker._use_nvjitlink_backend = False
+
+    yield
+
+    _program._nvvm_module = saved_nvvm_module
+    _program._nvvm_import_attempted = saved_nvvm_attempted
+    _linker._driver = saved_driver
+    _linker._driver_ver = saved_driver_ver
+    _linker._inited = saved_inited
+    _linker._use_nvjitlink_backend = saved_use_nvjitlink
+
+
+def _patch_driver_version(monkeypatch, version=13000):
+    monkeypatch.setattr(
+        _linker,
+        "driver",
+        types.SimpleNamespace(cuDriverGetVersion=lambda: version),
+    )
+    monkeypatch.setattr(_linker, "handle_return", lambda value: value)
+
+
+def test_get_nvvm_module_reraises_nested_module_not_found(monkeypatch):
+    monkeypatch.setattr(_program, "get_binding_version", lambda: (12, 9))
+
+    def fake_optional_cuda_import(modname, probe_function=None):
+        assert modname == "cuda.bindings.nvvm"
+        assert probe_function is not None
+        err = ModuleNotFoundError("No module named 'not_a_real_dependency'")
+        err.name = "not_a_real_dependency"
+        raise err
+
+    monkeypatch.setattr(_program, "optional_cuda_import", fake_optional_cuda_import)
+
+    with pytest.raises(ModuleNotFoundError, match="not_a_real_dependency") as excinfo:
+        _program._get_nvvm_module()
+    assert excinfo.value.name == "not_a_real_dependency"
+
+
+def test_get_nvvm_module_reports_missing_nvvm_module(monkeypatch):
+    monkeypatch.setattr(_program, "get_binding_version", lambda: (12, 9))
+
+    def fake_optional_cuda_import(modname, probe_function=None):
+        assert modname == "cuda.bindings.nvvm"
+        assert probe_function is not None
+        return None
+
+    monkeypatch.setattr(_program, "optional_cuda_import", fake_optional_cuda_import)
+
+    with pytest.raises(RuntimeError, match="cuda.bindings.nvvm"):
+        _program._get_nvvm_module()
+
+
+def test_get_nvvm_module_handles_missing_libnvvm(monkeypatch):
+    monkeypatch.setattr(_program, "get_binding_version", lambda: (12, 9))
+
+    def fake_optional_cuda_import(modname, probe_function=None):
+        assert modname == "cuda.bindings.nvvm"
+        assert probe_function is not None
+        return None
+
+    monkeypatch.setattr(_program, "optional_cuda_import", fake_optional_cuda_import)
+
+    with pytest.raises(RuntimeError, match="libnvvm"):
+        _program._get_nvvm_module()
+
+
+def test_decide_nvjitlink_or_driver_reraises_nested_module_not_found(monkeypatch):
+    _patch_driver_version(monkeypatch)
+
+    def fake_optional_cuda_import(modname, probe_function=None):
+        assert modname == "cuda.bindings.nvjitlink"
+        assert probe_function is not None
+        err = ModuleNotFoundError("No module named 'not_a_real_dependency'")
+        err.name = "not_a_real_dependency"
+        raise err
+
+    monkeypatch.setattr(_linker, "optional_cuda_import", fake_optional_cuda_import)
+
+    with pytest.raises(ModuleNotFoundError, match="not_a_real_dependency") as excinfo:
+        _linker._decide_nvjitlink_or_driver()
+    assert excinfo.value.name == "not_a_real_dependency"
+
+
+def test_decide_nvjitlink_or_driver_falls_back_when_module_missing(monkeypatch):
+    _patch_driver_version(monkeypatch)
+
+    def fake_optional_cuda_import(modname, probe_function=None):
+        assert modname == "cuda.bindings.nvjitlink"
+        assert probe_function is not None
+        return None
+
+    monkeypatch.setattr(_linker, "optional_cuda_import", fake_optional_cuda_import)
+
+    with pytest.warns(RuntimeWarning, match="cuda.bindings.nvjitlink is not available"):
+        use_driver_backend = _linker._decide_nvjitlink_or_driver()
+
+    assert use_driver_backend is True
+    assert _linker._use_nvjitlink_backend is False
diff --git a/cuda_core/tests/test_utils.py b/cuda_core/tests/test_utils.py
index 9e79f48313..e7ebb5bb52 100644
--- a/cuda_core/tests/test_utils.py
+++ b/cuda_core/tests/test_utils.py
@@ -582,10 +582,53 @@ def test_from_array_interface_unsupported_strides(init_cuda):
     # Create an array with strides that aren't a multiple of itemsize
     x = np.array([(1, 2.0), (3, 4.0)], dtype=[("a", "i4"), ("b", "f8")])
     b = x["b"]
-    smv = StridedMemoryView.from_array_interface(b)
     with pytest.raises(ValueError, match="strides must be divisible by itemsize"):
-        # TODO: ideally this would raise on construction
-        smv.strides  # noqa: B018
+        StridedMemoryView.from_array_interface(b)
+
+
+def _make_cuda_array_interface_obj(*, shape, strides, typestr="<f8", data=(0, False), version=3):
+    """Return an object exposing a hand-built ``__cuda_array_interface__``.
+
+    The dict is attached as a class attribute of a throwaway ``SyntheticCAI`` type.
+    """
+    cai_dict = {
+        "shape": shape,
+        "strides": strides,
+        "typestr": typestr,
+        "data": data,
+        "version": version,
+    }
+    synthetic_cls = type("SyntheticCAI", (), {"__cuda_array_interface__": cai_dict})
+    return synthetic_cls()
+
+
+def test_from_cuda_array_interface_unsupported_strides(init_cuda):
+    cai_obj = _make_cuda_array_interface_obj(shape=(2,), strides=(10,))
+    with pytest.raises(ValueError, match="strides must be divisible by itemsize"):
+        StridedMemoryView.from_cuda_array_interface(cai_obj, stream_ptr=-1)
+
+
+def test_from_cuda_array_interface_zero_strides(init_cuda):
+    cai_obj = _make_cuda_array_interface_obj(shape=(1, 1), strides=(0, 0))
+    smv = StridedMemoryView.from_cuda_array_interface(cai_obj, stream_ptr=-1)
+    assert smv.shape == (1, 1)
+    assert smv.strides == (0, 0)
+
+
+@pytest.mark.skipif(cp is None, reason="CuPy is not installed")
+def test_from_cuda_array_interface_negative_strides(init_cuda):
+    x = cp.arange(4, dtype=cp.float64)[::-1]
+    smv = StridedMemoryView.from_cuda_array_interface(_EnforceCAIView(x), stream_ptr=-1)
+    assert smv.shape == x.shape
+    assert smv.strides == (-1,)
+
+
+def test_from_cuda_array_interface_empty_array(init_cuda):
+    cai_obj = _make_cuda_array_interface_obj(shape=(0, 3), strides=(24, 8))
+    smv = StridedMemoryView.from_cuda_array_interface(cai_obj, stream_ptr=-1)
+    assert smv.size == 0
+    assert smv.shape == (0, 3)
+    assert smv.strides == (3, 1)
 
 
 @pytest.mark.parametrize(
diff --git a/cuda_pathfinder/AGENTS.md b/cuda_pathfinder/AGENTS.md
new file mode 100644
index 0000000000..52159c84fb
--- /dev/null
+++ b/cuda_pathfinder/AGENTS.md
@@ -0,0 +1,72 @@
+This file describes `cuda_pathfinder`, a Python sub-package of
+[cuda-python](https://github.com/NVIDIA/cuda-python). It locates and loads
+NVIDIA dynamic libraries (CTK, third-party, and driver) across Linux and
+Windows.
+
+## Scope and principles
+
+- **Language**: all implementation code in this package is pure Python.
+- **Public API**: keep user-facing imports stable via `cuda.pathfinder`.
+  Internal modules should stay under `cuda.pathfinder._*`.
+- **Behavior**: loader behavior must remain deterministic and explicit. Avoid
+  "best effort" silent fallbacks that mask why discovery/loading failed.
+- **Cross-platform**: preserve Linux and Windows behavior parity unless a change
+  is explicitly platform-scoped.
+
+## Package architecture
+
+- **Descriptor source-of-truth**: `cuda/pathfinder/_dynamic_libs/descriptor_catalog.py`
+  defines canonical metadata for known libraries.
+- **Registry layers**:
+  - `lib_descriptor.py` builds the name-keyed runtime registry from the catalog.
+  - `supported_nvidia_libs.py` keeps legacy exported tables derived from the
+    catalog for compatibility.
+- **Search pipeline**:
+  - `search_steps.py` implements composable find steps (`site-packages`,
+    `CONDA_PREFIX`, `CUDA_HOME`/`CUDA_PATH`, canary-assisted CTK root flow).
+  - `search_platform.py` and `platform_loader.py` isolate OS-specific logic.
+- **Load orchestration**:
+  - `load_nvidia_dynamic_lib.py` coordinates find/load phases, dependency
+    loading, driver-lib fast path, and cache semantics.
+- **Process isolation helper**:
+  - `cuda/pathfinder/_utils/spawned_process_runner.py` is used where
+    process-global dynamic loader state would otherwise leak across tests.
+
+## Editing guidance
+
+- **Edit authored descriptors, not derived tables**: when adding/changing a
+  library, update `descriptor_catalog.py` first; keep derived exports in sync
+  through existing conversion logic and tests.
+- **Respect cache semantics**: `load_nvidia_dynamic_lib` is cached. Never add
+  behavior that closes returned handles or assumes repeated fresh loads.
+- **Keep error contracts intact**:
+  - unknown name -> `DynamicLibUnknownError`
+  - known but unsupported on this OS -> `DynamicLibNotAvailableError`
+  - known/supported but not found/loadable -> `DynamicLibNotFoundError`
+- **Do not hardcode host assumptions**: avoid baking in machine-local paths,
+  shell-specific quoting, or environment assumptions.
+- **Prefer focused abstractions**: if a change is platform-specific, route it
+  through existing platform abstraction points instead of branching in many call
+  sites.
+
+## Testing expectations
+
+- **Primary command**: run `pytest tests/` from `cuda_pathfinder/`.
+- **Real-loading tests**: prefer spawned child-process tests for actual dynamic
+  loading behavior; avoid in-process cross-test interference.
+- **Cache-aware tests**: if a test patches internals used by
+  `load_nvidia_dynamic_lib`, call `load_nvidia_dynamic_lib.cache_clear()`.
+- **Negative-case names**: use obviously fake names (for example
+  `"not_a_real_lib"`) in unknown/invalid-lib tests.
+- **INFO summary in CI logs**: use `info_summary_append` for useful
+  test-context lines visible in terminal summaries.
+- **Strictness toggle**:
+  `CUDA_PATHFINDER_TEST_LOAD_NVIDIA_DYNAMIC_LIB_STRICTNESS` controls whether
+  missing libraries are tolerated (`see_what_works`) or treated as failures
+  (`all_must_work`).
+
+## Useful commands
+
+- Run package tests: `pytest tests/`
+- Run package tests via orchestrator: `../scripts/run_tests.sh pathfinder`
+- Build package docs: `cd docs && ./build_docs.sh`
diff --git a/cuda_pathfinder/CLAUDE.md b/cuda_pathfinder/CLAUDE.md
new file mode 120000
index 0000000000..47dc3e3d86
--- /dev/null
+++ b/cuda_pathfinder/CLAUDE.md
@@ -0,0 +1 @@
+AGENTS.md
\ No newline at end of file
diff --git a/cuda_pathfinder/cuda/pathfinder/__init__.py b/cuda_pathfinder/cuda/pathfinder/__init__.py
index 57702de425..16711385b7 100644
--- a/cuda_pathfinder/cuda/pathfinder/__init__.py
+++ b/cuda_pathfinder/cuda/pathfinder/__init__.py
@@ -25,6 +25,7 @@
     locate_nvidia_header_directory as locate_nvidia_header_directory,
 )
 from cuda.pathfinder._headers.supported_nvidia_headers import SUPPORTED_HEADERS_CTK as _SUPPORTED_HEADERS_CTK
+from cuda.pathfinder._optional_cuda_import import optional_cuda_import as optional_cuda_import
 from cuda.pathfinder._static_libs.find_bitcode_lib import (
     SUPPORTED_BITCODE_LIBS as _SUPPORTED_BITCODE_LIBS,
 )
diff --git a/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/descriptor_catalog.py b/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/descriptor_catalog.py
index e189bb127a..89fa07445d 100644
--- a/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/descriptor_catalog.py
+++ b/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/descriptor_catalog.py
@@ -266,6 +266,29 @@ class DescriptorSpec:
         linux_sonames=("libcufile.so.0",),
         site_packages_linux=("nvidia/cu13/lib", "nvidia/cufile/lib"),
     ),
+    DescriptorSpec(
+        name="cupti",
+        packaged_with="ctk",
+        linux_sonames=("libcupti.so.12", "libcupti.so.13"),
+        windows_dlls=(
+            "cupti64_2025.4.1.dll",
+            "cupti64_2025.3.1.dll",
+            "cupti64_2025.2.1.dll",
+            "cupti64_2025.1.1.dll",
+            "cupti64_2024.3.2.dll",
+            "cupti64_2024.2.1.dll",
+            "cupti64_2024.1.1.dll",
+            "cupti64_2023.3.1.dll",
+            "cupti64_2023.2.2.dll",
+            "cupti64_2023.1.1.dll",
+            "cupti64_2022.4.1.dll",
+        ),
+        site_packages_linux=("nvidia/cu13/lib", "nvidia/cuda_cupti/lib"),
+        site_packages_windows=("nvidia/cu13/bin/x86_64", "nvidia/cuda_cupti/bin"),
+        anchor_rel_dirs_linux=("extras/CUPTI/lib64", "lib"),
+        anchor_rel_dirs_windows=("extras/CUPTI/lib64", "bin"),
+        ctk_root_canary_anchor_libnames=("cudart",),
+    ),
     # -----------------------------------------------------------------------
     # Third-party / separately packaged libraries
     # -----------------------------------------------------------------------
diff --git a/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/search_platform.py b/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/search_platform.py
index 817ac0b65f..95e0f4dd1e 100644
--- a/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/search_platform.py
+++ b/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/search_platform.py
@@ -141,10 +141,20 @@ def find_in_lib_dir(
         error_messages: list[str],
         attachments: list[str],
     ) -> str | None:
+        # Most libraries ship both unversioned and versioned files/symlinks, so try the exact name first.
         so_name = os.path.join(lib_dir, lib_searched_for)
         if os.path.isfile(so_name):
             return so_name
-        error_messages.append(f"No such file: {so_name}")
+        # Some libraries only exist as versioned files (e.g., libcupti.so.13 in conda),
+        # so the glob fallback is needed. lib_dir is escaped so that glob metacharacters
+        # in the directory name (e.g. "[" or "*") are matched literally.
+        file_wild = lib_searched_for + "*"
+        # Only one match is expected; reverse-sorting keeps the result deterministic and
+        # is intended to return the newest version first.
+        for so_name in sorted(glob.glob(os.path.join(glob.escape(lib_dir), file_wild)), reverse=True):
+            if os.path.isfile(so_name):
+                return so_name
+        error_messages.append(f"No such file: {file_wild}")
         attachments.append(f'  listdir("{lib_dir}"):')
         if not os.path.isdir(lib_dir):
             attachments.append("    DIRECTORY DOES NOT EXIST")
diff --git a/cuda_pathfinder/cuda/pathfinder/_headers/find_nvidia_headers.py b/cuda_pathfinder/cuda/pathfinder/_headers/find_nvidia_headers.py
index 1727cca607..13f47fc2b5 100644
--- a/cuda_pathfinder/cuda/pathfinder/_headers/find_nvidia_headers.py
+++ b/cuda_pathfinder/cuda/pathfinder/_headers/find_nvidia_headers.py
@@ -8,6 +8,10 @@
 import os
 from dataclasses import dataclass
 
+from cuda.pathfinder._dynamic_libs.load_nvidia_dynamic_lib import (
+    _resolve_system_loaded_abs_path_in_subprocess,
+)
+from cuda.pathfinder._dynamic_libs.search_steps import derive_ctk_root
 from cuda.pathfinder._headers import supported_nvidia_headers
 from cuda.pathfinder._utils.env_vars import get_cuda_home_or_path
 from cuda.pathfinder._utils.find_sub_dirs import find_sub_dirs_all_sitepackages
@@ -91,6 +95,23 @@ def _find_based_on_conda_layout(libname: str, h_basename: str, ctk_layout: bool)
     return None
 
 
+def _find_ctk_header_directory_via_canary(libname: str, h_basename: str) -> str | None:
+    """Locate a CTK header directory by probing for the CTK root with a canary.
+
+    Mirrors dynamic-library CTK-root discovery: ``cudart`` is system-loaded
+    in a spawned child process, the CTK root is derived from its resolved
+    absolute path, and the CTK include layout under that root is searched.
+    Returns ``None`` if the canary cannot be resolved or no root is derived.
+    """
+    cudart_abs_path = _resolve_system_loaded_abs_path_in_subprocess("cudart")
+    if cudart_abs_path is not None:
+        root_dir = derive_ctk_root(cudart_abs_path)
+        if root_dir is not None:
+            return _locate_based_on_ctk_layout(libname, h_basename, root_dir)
+    # Either the canary could not be resolved or no CTK root was derived.
+    return None
+
+
 def _find_ctk_header_directory(libname: str) -> LocatedHeaderDir | None:
     h_basename = supported_nvidia_headers.SUPPORTED_HEADERS_CTK[libname]
     candidate_dirs = supported_nvidia_headers.SUPPORTED_SITE_PACKAGE_HEADER_DIRS_CTK[libname]
@@ -106,6 +127,9 @@ def _find_ctk_header_directory(libname: str) -> LocatedHeaderDir | None:
     if cuda_home and (result := _locate_based_on_ctk_layout(libname, h_basename, cuda_home)):
         return LocatedHeaderDir(abs_path=result, found_via="CUDA_HOME")
 
+    if result := _find_ctk_header_directory_via_canary(libname, h_basename):
+        return LocatedHeaderDir(abs_path=result, found_via="system-ctk-root")
+
     return None
 
 
@@ -139,6 +163,12 @@ def locate_nvidia_header_directory(libname: str) -> LocatedHeaderDir | None:
         3. **CUDA Toolkit environment variables**
 
            - Use ``CUDA_HOME`` or ``CUDA_PATH`` (in that order).
+
+        4. **CTK root canary probe**
+
+           - Probe a system-loaded ``cudart`` in a spawned child process,
+             derive the CTK root from the resolved library path, then search
+             the CTK include layout under that root.
     """
 
     if libname in supported_nvidia_headers.SUPPORTED_HEADERS_CTK:
@@ -195,6 +225,12 @@ def find_nvidia_header_directory(libname: str) -> str | None:
         3. **CUDA Toolkit environment variables**
 
            - Use ``CUDA_HOME`` or ``CUDA_PATH`` (in that order).
+
+        4. **CTK root canary probe**
+
+           - Probe a system-loaded ``cudart`` in a spawned child process,
+             derive the CTK root from the resolved library path, then search
+             the CTK include layout under that root.
     """
     found = locate_nvidia_header_directory(libname)
     return found.abs_path if found else None
diff --git a/cuda_pathfinder/cuda/pathfinder/_optional_cuda_import.py b/cuda_pathfinder/cuda/pathfinder/_optional_cuda_import.py
new file mode 100644
index 0000000000..3ac977cf35
--- /dev/null
+++ b/cuda_pathfinder/cuda/pathfinder/_optional_cuda_import.py
@@ -0,0 +1,51 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+from __future__ import annotations
+
+import importlib
+from collections.abc import Callable
+from types import ModuleType
+
+from cuda.pathfinder._dynamic_libs.load_dl_common import DynamicLibNotFoundError
+
+
+def optional_cuda_import(
+    fully_qualified_modname: str,
+    *,
+    probe_function: Callable[[ModuleType], object] | None = None,
+) -> ModuleType | None:
+    """Import an optional CUDA module without masking unrelated import bugs.
+
+    Args:
+        fully_qualified_modname: Dotted name of the module to import, passed
+            to :func:`importlib.import_module`.
+        probe_function: Optional callable invoked with the imported module
+            after a successful import; its return value is ignored.
+
+    Returns:
+        The imported module if available and the optional probe succeeds,
+        otherwise ``None`` when the requested module is unavailable.
+
+    Raises:
+        ModuleNotFoundError: If the import fails because a dependency of the
+            target module is missing (instead of the target module itself).
+        Exception: Any exception raised by ``probe_function`` except
+            :class:`DynamicLibNotFoundError`, which is treated as "unavailable".
+    """
+    try:
+        module = importlib.import_module(fully_qualified_modname)
+    except ModuleNotFoundError as err:
+        # ``err.name`` identifies the module whose import failed; anything
+        # other than the requested module is re-raised rather than hidden.
+        if err.name != fully_qualified_modname:
+            raise
+        return None
+
+    if probe_function is not None:
+        try:
+            probe_function(module)
+        except DynamicLibNotFoundError:
+            return None
+
+    return module
diff --git a/cuda_pathfinder/docs/nv-versions.json b/cuda_pathfinder/docs/nv-versions.json
index a8498094b5..eb0e60239e 100644
--- a/cuda_pathfinder/docs/nv-versions.json
+++ b/cuda_pathfinder/docs/nv-versions.json
@@ -3,6 +3,10 @@
         "version": "latest",
         "url": "https://nvidia.github.io/cuda-python/cuda-pathfinder/latest/"
     },
+    {
+        "version": "1.4.1",
+        "url": "https://nvidia.github.io/cuda-python/cuda-pathfinder/1.4.1/"
+    },
     {
         "version": "1.4.0",
         "url": "https://nvidia.github.io/cuda-python/cuda-pathfinder/1.4.0/"
diff --git a/cuda_pathfinder/docs/source/api.rst b/cuda_pathfinder/docs/source/api.rst
index 52a4ff5010..63f4273a0a 100644
--- a/cuda_pathfinder/docs/source/api.rst
+++ b/cuda_pathfinder/docs/source/api.rst
@@ -14,6 +14,7 @@ locating NVIDIA C/C++ header directories, and finding CUDA binary utilities.
 
    SUPPORTED_NVIDIA_LIBNAMES
    load_nvidia_dynamic_lib
+   optional_cuda_import
    LoadedDL
    DynamicLibNotFoundError
    DynamicLibUnknownError
diff --git a/cuda_pathfinder/docs/source/conf.py b/cuda_pathfinder/docs/source/conf.py
index ac795ff368..f794eb9ee2 100644
--- a/cuda_pathfinder/docs/source/conf.py
+++ b/cuda_pathfinder/docs/source/conf.py
@@ -40,6 +40,7 @@
     "enum_tools.autoenum",
     "sphinx_copybutton",
     "release_toc",
+    "release_date",
 ]
 
 nb_execution_mode = "off"
diff --git a/cuda_pathfinder/docs/source/release/1.4.1-notes.rst b/cuda_pathfinder/docs/source/release/1.4.1-notes.rst
new file mode 100644
index 0000000000..836a62f03d
--- /dev/null
+++ b/cuda_pathfinder/docs/source/release/1.4.1-notes.rst
@@ -0,0 +1,49 @@
+.. SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+.. SPDX-License-Identifier: Apache-2.0
+
+.. py:currentmodule:: cuda.pathfinder
+
+``cuda-pathfinder`` 1.4.1 Release notes
+=======================================
+
+Released on Mar 6, 2026
+
+Highlights
+----------
+
+* Add CTK canary fallback for header discovery. When CUDA headers cannot be
+  found via site-packages, Conda, or ``CUDA_HOME``/``CUDA_PATH``, the system
+  now attempts to discover the CTK root via canary probing (using a
+  system-loaded ``cudart`` library path) and searches the CTK include layout
+  from that root. This improves header discovery in standard and non-standard
+  CTK installations, including containerized environments.
+  (`PR #1731 <https://github.com/NVIDIA/cuda-python/pull/1731>`_)
+
+* Add support for loading CUPTI (CUDA Profiling Tools Interface) dynamic libs
+  via ``load_nvidia_dynamic_lib("cupti")`` on Linux and Windows. CUPTI libraries
+  are discovered in site-packages (CUDA 12 and 13), Conda environments, and
+  CTK installations (via ``CUDA_HOME``/``CUDA_PATH`` or canary probe).
+  Supports ``libcupti.so.12``, ``libcupti.so.13`` on Linux and versioned
+  ``cupti64_*.dll`` files on Windows.
+  (`PR #1693 <https://github.com/NVIDIA/cuda-python/pull/1693>`_)
+
+* Add support for finding static libraries (e.g., ``libcudadevrt.a`` on Linux,
+  ``cudadevrt.lib`` on Windows) via new ``find_static_lib()`` and
+  ``locate_static_lib()`` APIs. These follow the same search order as bitcode
+  libraries: site-packages, Conda, then ``CUDA_HOME``/``CUDA_PATH``.
+  (`PR #1690 <https://github.com/NVIDIA/cuda-python/pull/1690>`_)
+
+* Fix site-packages search order for virtual environments created with
+  ``--system-site-packages``. The search now correctly prioritizes the venv's
+  site-packages before user-site-packages, conforming to PEP 405.
+  (`PR #1717 <https://github.com/NVIDIA/cuda-python/pull/1717>`_)
+
+Internal refactoring
+--------------------
+
+* Refactor library discovery and loading to use a descriptor-driven architecture.
+  All library metadata (SONAMEs, DLLs, site-packages paths, dependencies, loader
+  flags) is now consolidated into a single ``LibDescriptor`` registry, improving
+  maintainability and extensibility. This is an internal refactoring with no
+  behavioral changes.
+  (`PR #1685 <https://github.com/NVIDIA/cuda-python/pull/1685>`_)
diff --git a/cuda_pathfinder/docs/source/release/1.4.2-notes.rst b/cuda_pathfinder/docs/source/release/1.4.2-notes.rst
new file mode 100644
index 0000000000..f81ff2804f
--- /dev/null
+++ b/cuda_pathfinder/docs/source/release/1.4.2-notes.rst
@@ -0,0 +1,15 @@
+.. SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+.. SPDX-License-Identifier: Apache-2.0
+
+.. py:currentmodule:: cuda.pathfinder
+
+``cuda-pathfinder`` 1.4.2 Release notes
+=======================================
+
+Highlights
+----------
+
+* Add ``optional_cuda_import()`` to support robust optional imports of CUDA
+  Python modules. It returns ``None`` when the requested module is absent or a
+  probe hits ``DynamicLibNotFoundError``, while still re-raising unrelated
+  ``ModuleNotFoundError`` exceptions (for missing transitive dependencies).
diff --git a/cuda_pathfinder/pyproject.toml b/cuda_pathfinder/pyproject.toml
index 21299d3366..fdd01b763b 100644
--- a/cuda_pathfinder/pyproject.toml
+++ b/cuda_pathfinder/pyproject.toml
@@ -19,7 +19,7 @@ test = [
 ]
 # Internal organization of test dependencies.
 cu12 = [
-    "cuda-toolkit[nvcc,cublas,nvrtc,cudart,cufft,curand,cusolver,cusparse,npp,nvfatbin,nvjitlink,nvjpeg,cccl]==12.*",
+    "cuda-toolkit[nvcc,cublas,nvrtc,cudart,cufft,curand,cusolver,cusparse,npp,nvfatbin,nvjitlink,nvjpeg,cccl,cupti]==12.*",
     "cuda-toolkit[cufile]==12.*; sys_platform != 'win32'",
     "cutensor-cu12",
     "nvidia-cublasmp-cu12; sys_platform != 'win32'",
@@ -31,7 +31,7 @@ cu12 = [
     "nvidia-nvshmem-cu12; sys_platform != 'win32'",
 ]
 cu13 = [
-    "cuda-toolkit[nvcc,cublas,nvrtc,cudart,cufft,curand,cusolver,cusparse,npp,nvfatbin,nvjitlink,nvjpeg,cccl,nvvm]==13.*",
+    "cuda-toolkit[nvcc,cublas,nvrtc,cudart,cufft,curand,cusolver,cusparse,npp,nvfatbin,nvjitlink,nvjpeg,cccl,cupti,nvvm]==13.*",
     "cuda-toolkit[cufile]==13.*; sys_platform != 'win32'",
     "cutensor-cu13",
     "nvidia-cublasmp-cu13; sys_platform != 'win32'",
diff --git a/cuda_pathfinder/tests/test_find_nvidia_headers.py b/cuda_pathfinder/tests/test_find_nvidia_headers.py
index f14681546d..2732de216b 100644
--- a/cuda_pathfinder/tests/test_find_nvidia_headers.py
+++ b/cuda_pathfinder/tests/test_find_nvidia_headers.py
@@ -16,10 +16,15 @@
 import importlib.metadata
 import os
 import re
+from pathlib import Path
 
 import pytest
 
+import cuda.pathfinder._headers.find_nvidia_headers as find_nvidia_headers_module
 from cuda.pathfinder import LocatedHeaderDir, find_nvidia_header_directory, locate_nvidia_header_directory
+from cuda.pathfinder._dynamic_libs.load_nvidia_dynamic_lib import (
+    _resolve_system_loaded_abs_path_in_subprocess,
+)
 from cuda.pathfinder._headers.supported_nvidia_headers import (
     SUPPORTED_HEADERS_CTK,
     SUPPORTED_HEADERS_CTK_ALL,
@@ -28,6 +33,7 @@
     SUPPORTED_INSTALL_DIRS_NON_CTK,
     SUPPORTED_SITE_PACKAGE_HEADER_DIRS_CTK,
 )
+from cuda.pathfinder._utils.platform_aware import IS_WINDOWS
 
 STRICTNESS = os.environ.get("CUDA_PATHFINDER_TEST_FIND_NVIDIA_HEADERS_STRICTNESS", "see_what_works")
 assert STRICTNESS in ("see_what_works", "all_must_work")
@@ -46,7 +52,13 @@ def test_unknown_libname():
 
 def _located_hdr_dir_asserts(located_hdr_dir):
     assert isinstance(located_hdr_dir, LocatedHeaderDir)
-    assert located_hdr_dir.found_via in ("site-packages", "conda", "CUDA_HOME", "supported_install_dir")
+    assert located_hdr_dir.found_via in (
+        "site-packages",
+        "conda",
+        "CUDA_HOME",
+        "system-ctk-root",
+        "supported_install_dir",
+    )
 
 
 def test_non_ctk_importlib_metadata_distributions_names():
@@ -62,6 +74,36 @@ def have_distribution_for(libname: str) -> bool:
     )
 
 
+@pytest.fixture
+def clear_locate_nvidia_header_cache():
+    locate_nvidia_header_directory.cache_clear()
+    _resolve_system_loaded_abs_path_in_subprocess.cache_clear()
+    yield
+    locate_nvidia_header_directory.cache_clear()
+    _resolve_system_loaded_abs_path_in_subprocess.cache_clear()
+
+
+def _create_ctk_header(ctk_root: Path, libname: str) -> str:
+    """Create a fake CTK header file and return its directory."""
+    header_basename = SUPPORTED_HEADERS_CTK[libname]
+    if libname == "nvvm":
+        header_dir = ctk_root / "nvvm" / "include"
+    elif libname == "cccl":
+        header_dir = ctk_root / "include" / "cccl"
+    else:
+        header_dir = ctk_root / "include"
+    header_path = header_dir / header_basename
+    header_path.parent.mkdir(parents=True, exist_ok=True)
+    header_path.touch()
+    return str(header_dir)
+
+
+def _fake_cudart_canary_abs_path(ctk_root: Path) -> str:
+    if IS_WINDOWS:
+        return str(ctk_root / "bin" / "x64" / "cudart64_13.dll")
+    return str(ctk_root / "lib64" / "libcudart.so.13")
+
+
 @pytest.mark.parametrize("libname", SUPPORTED_HEADERS_NON_CTK.keys())
 def test_locate_non_ctk_headers(info_summary_append, libname):
     hdr_dir = find_nvidia_header_directory(libname)
@@ -110,3 +152,85 @@ def test_locate_ctk_headers(info_summary_append, libname):
         assert os.path.isfile(os.path.join(hdr_dir, h_filename))
     if STRICTNESS == "all_must_work":
         assert hdr_dir is not None
+
+
+@pytest.mark.usefixtures("clear_locate_nvidia_header_cache")
+def test_locate_ctk_headers_uses_canary_fallback_when_cuda_home_unset(tmp_path, monkeypatch, mocker):
+    ctk_root = tmp_path / "cuda-system"
+    expected_hdr_dir = _create_ctk_header(ctk_root, "cudart")
+
+    monkeypatch.delenv("CONDA_PREFIX", raising=False)
+    monkeypatch.delenv("CUDA_HOME", raising=False)
+    monkeypatch.delenv("CUDA_PATH", raising=False)
+    mocker.patch.object(find_nvidia_headers_module, "find_sub_dirs_all_sitepackages", return_value=[])
+    probe = mocker.patch.object(
+        find_nvidia_headers_module,
+        "_resolve_system_loaded_abs_path_in_subprocess",
+        return_value=_fake_cudart_canary_abs_path(ctk_root),
+    )
+
+    located_hdr_dir = locate_nvidia_header_directory("cudart")
+
+    assert located_hdr_dir is not None
+    assert located_hdr_dir.abs_path == expected_hdr_dir
+    assert located_hdr_dir.found_via == "system-ctk-root"
+    probe.assert_called_once_with("cudart")
+
+
+@pytest.mark.usefixtures("clear_locate_nvidia_header_cache")
+def test_locate_ctk_headers_cuda_home_takes_priority_over_canary(tmp_path, monkeypatch, mocker):
+    cuda_home = tmp_path / "cuda-home"
+    expected_hdr_dir = _create_ctk_header(cuda_home, "cudart")
+    canary_root = tmp_path / "cuda-system"
+    _create_ctk_header(canary_root, "cudart")
+
+    monkeypatch.delenv("CONDA_PREFIX", raising=False)
+    monkeypatch.setenv("CUDA_HOME", str(cuda_home))
+    monkeypatch.delenv("CUDA_PATH", raising=False)
+    mocker.patch.object(find_nvidia_headers_module, "find_sub_dirs_all_sitepackages", return_value=[])
+    probe = mocker.patch.object(
+        find_nvidia_headers_module,
+        "_resolve_system_loaded_abs_path_in_subprocess",
+        return_value=_fake_cudart_canary_abs_path(canary_root),
+    )
+
+    located_hdr_dir = locate_nvidia_header_directory("cudart")
+
+    assert located_hdr_dir is not None
+    assert located_hdr_dir.abs_path == expected_hdr_dir
+    assert located_hdr_dir.found_via == "CUDA_HOME"
+    probe.assert_not_called()
+
+
+@pytest.mark.usefixtures("clear_locate_nvidia_header_cache")
+def test_locate_ctk_headers_canary_miss_paths_are_non_fatal(monkeypatch, mocker):
+    monkeypatch.delenv("CONDA_PREFIX", raising=False)
+    monkeypatch.delenv("CUDA_HOME", raising=False)
+    monkeypatch.delenv("CUDA_PATH", raising=False)
+    mocker.patch.object(find_nvidia_headers_module, "find_sub_dirs_all_sitepackages", return_value=[])
+    mocker.patch.object(
+        find_nvidia_headers_module,
+        "_resolve_system_loaded_abs_path_in_subprocess",
+        return_value=None,
+    )
+
+    assert locate_nvidia_header_directory("cudart") is None
+    assert find_nvidia_header_directory("cudart") is None
+
+
+@pytest.mark.usefixtures("clear_locate_nvidia_header_cache")
+def test_locate_ctk_headers_canary_probe_errors_are_not_masked(monkeypatch, mocker):
+    monkeypatch.delenv("CONDA_PREFIX", raising=False)
+    monkeypatch.delenv("CUDA_HOME", raising=False)
+    monkeypatch.delenv("CUDA_PATH", raising=False)
+    mocker.patch.object(find_nvidia_headers_module, "find_sub_dirs_all_sitepackages", return_value=[])
+    mocker.patch.object(
+        find_nvidia_headers_module,
+        "_resolve_system_loaded_abs_path_in_subprocess",
+        side_effect=RuntimeError("canary probe failed"),
+    )
+
+    with pytest.raises(RuntimeError, match="canary probe failed"):
+        locate_nvidia_header_directory("cudart")
+    with pytest.raises(RuntimeError, match="canary probe failed"):
+        find_nvidia_header_directory("cudart")
diff --git a/cuda_pathfinder/tests/test_load_nvidia_dynamic_lib_using_mocker.py b/cuda_pathfinder/tests/test_load_nvidia_dynamic_lib_using_mocker.py
new file mode 100644
index 0000000000..3510d1933e
--- /dev/null
+++ b/cuda_pathfinder/tests/test_load_nvidia_dynamic_lib_using_mocker.py
@@ -0,0 +1,173 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+import pytest
+
+from cuda.pathfinder._dynamic_libs import load_nvidia_dynamic_lib as load_mod
+from cuda.pathfinder._dynamic_libs import search_steps as steps_mod
+from cuda.pathfinder._dynamic_libs.load_dl_common import DynamicLibNotFoundError, LoadedDL
+from cuda.pathfinder._dynamic_libs.load_nvidia_dynamic_lib import (
+    _load_lib_no_cache,
+    _resolve_system_loaded_abs_path_in_subprocess,
+)
+from cuda.pathfinder._dynamic_libs.search_steps import EARLY_FIND_STEPS
+from cuda.pathfinder._utils.platform_aware import IS_WINDOWS
+
+_MODULE = "cuda.pathfinder._dynamic_libs.load_nvidia_dynamic_lib"
+_STEPS_MODULE = "cuda.pathfinder._dynamic_libs.search_steps"
+
+
+@pytest.fixture(autouse=True)
+def _clear_canary_subprocess_probe_cache():
+    _resolve_system_loaded_abs_path_in_subprocess.cache_clear()
+    yield
+    _resolve_system_loaded_abs_path_in_subprocess.cache_clear()
+
+
+def _make_loaded_dl(path, found_via):
+    return LoadedDL(path, False, 0xDEAD, found_via)
+
+
+def _create_cupti_in_ctk(ctk_root):
+    """Create a fake cupti lib in extras/CUPTI/lib64."""
+    if IS_WINDOWS:
+        cupti_dir = ctk_root / "extras" / "CUPTI" / "lib64"
+        cupti_dir.mkdir(parents=True, exist_ok=True)
+        cupti_lib = cupti_dir / "cupti64_2025.4.1.dll"
+    else:
+        cupti_dir = ctk_root / "extras" / "CUPTI" / "lib64"
+        cupti_dir.mkdir(parents=True, exist_ok=True)
+        cupti_lib = cupti_dir / "libcupti.so.13"
+        # Create the unversioned symlink, as real CTK installations do
+        cupti_symlink = cupti_dir / "libcupti.so"
+        cupti_symlink.symlink_to("libcupti.so.13")
+    cupti_lib.write_bytes(b"fake")
+    return cupti_lib
+
+
+# ---------------------------------------------------------------------------
+# Conda tests
+# Note: Site-packages and CTK are covered by real CI tests.
+# Mock tests focus on Conda (not covered by real CI) and error paths.
+# ---------------------------------------------------------------------------
+
+
+def test_cupti_found_in_conda(tmp_path, mocker, monkeypatch):
+    """Test finding cupti in conda environment."""
+    if IS_WINDOWS:
+        pytest.skip("Windows support for cupti not yet implemented")
+
+    # Create conda structure
+    conda_prefix = tmp_path / "conda_env"
+    conda_lib_dir = conda_prefix / "lib"
+    conda_lib_dir.mkdir(parents=True)
+    cupti_lib = conda_lib_dir / "libcupti.so.13"
+    cupti_lib.write_bytes(b"fake")
+
+    # Mock conda discovery
+    monkeypatch.setenv("CONDA_PREFIX", str(conda_prefix))
+
+    # Disable site-packages search
+    def _run_find_steps_without_site_packages(ctx, steps):
+        if steps is EARLY_FIND_STEPS:
+            # Skip site-packages, only run conda
+            from cuda.pathfinder._dynamic_libs.search_steps import find_in_conda
+
+            result = find_in_conda(ctx)
+            return result
+        return steps_mod.run_find_steps(ctx, steps)
+
+    mocker.patch(f"{_MODULE}.run_find_steps", side_effect=_run_find_steps_without_site_packages)
+    mocker.patch.object(load_mod.LOADER, "check_if_already_loaded_from_elsewhere", return_value=None)
+    mocker.patch(f"{_MODULE}.load_dependencies")
+    mocker.patch.object(load_mod.LOADER, "load_with_system_search", return_value=None)
+    mocker.patch(f"{_STEPS_MODULE}.get_cuda_home_or_path", return_value=None)
+    mocker.patch(f"{_MODULE}._resolve_system_loaded_abs_path_in_subprocess", return_value=None)
+    mocker.patch.object(
+        load_mod.LOADER,
+        "load_with_abs_path",
+        side_effect=lambda _desc, path, via: _make_loaded_dl(path, via),
+    )
+
+    result = _load_lib_no_cache("cupti")
+    assert result.found_via == "conda"
+    assert result.abs_path == str(cupti_lib)
+
+
+# ---------------------------------------------------------------------------
+# Error path tests
+# ---------------------------------------------------------------------------
+
+
+def test_cupti_not_found_raises_error(mocker):
+    """Test that DynamicLibNotFoundError is raised when cupti is not found."""
+    if IS_WINDOWS:
+        pytest.skip("Windows support for cupti not yet implemented")
+
+    # Mock all search paths to return None
+    def _run_find_steps_disabled(ctx, steps):
+        return None
+
+    mocker.patch(f"{_MODULE}.run_find_steps", side_effect=_run_find_steps_disabled)
+    mocker.patch.object(load_mod.LOADER, "check_if_already_loaded_from_elsewhere", return_value=None)
+    mocker.patch(f"{_MODULE}.load_dependencies")
+    mocker.patch.object(load_mod.LOADER, "load_with_system_search", return_value=None)
+    mocker.patch(f"{_STEPS_MODULE}.get_cuda_home_or_path", return_value=None)
+    mocker.patch(
+        f"{_MODULE}._resolve_system_loaded_abs_path_in_subprocess",
+        return_value=None,
+    )
+
+    with pytest.raises(DynamicLibNotFoundError):
+        _load_lib_no_cache("cupti")
+
+
+# ---------------------------------------------------------------------------
+# Search order tests (Conda-specific, since Conda is not covered by real CI)
+# ---------------------------------------------------------------------------
+
+
+def test_cupti_search_order_conda_before_cuda_home(tmp_path, mocker, monkeypatch):
+    """Test that conda is searched before CUDA_HOME (CTK).
+
+    This test is important because Conda is not covered by real CI tests,
+    so we need to verify the search order between Conda and CTK.
+    """
+    if IS_WINDOWS:
+        pytest.skip("Windows support for cupti not yet implemented")
+
+    # Create both conda and CUDA_HOME structures
+    conda_prefix = tmp_path / "conda_env"
+    conda_lib_dir = conda_prefix / "lib"
+    conda_lib_dir.mkdir(parents=True)
+    conda_cupti_lib = conda_lib_dir / "libcupti.so.13"
+    conda_cupti_lib.write_bytes(b"fake")
+
+    ctk_root = tmp_path / "cuda-13.1"
+    _create_cupti_in_ctk(ctk_root)
+
+    # Mock discovery - disable site-packages, enable conda
+    def _run_find_steps_without_site_packages(ctx, steps):
+        if steps is EARLY_FIND_STEPS:
+            # Skip site-packages, only run conda
+            from cuda.pathfinder._dynamic_libs.search_steps import find_in_conda
+
+            result = find_in_conda(ctx)
+            return result
+        return steps_mod.run_find_steps(ctx, steps)
+
+    mocker.patch(f"{_MODULE}.run_find_steps", side_effect=_run_find_steps_without_site_packages)
+    monkeypatch.setenv("CONDA_PREFIX", str(conda_prefix))
+    mocker.patch.object(load_mod.LOADER, "check_if_already_loaded_from_elsewhere", return_value=None)
+    mocker.patch(f"{_MODULE}.load_dependencies")
+    mocker.patch.object(load_mod.LOADER, "load_with_system_search", return_value=None)
+    mocker.patch(f"{_STEPS_MODULE}.get_cuda_home_or_path", return_value=str(ctk_root))
+    mocker.patch.object(
+        load_mod.LOADER,
+        "load_with_abs_path",
+        side_effect=lambda _desc, path, via: _make_loaded_dl(path, via),
+    )
+
+    result = _load_lib_no_cache("cupti")
+    assert result.found_via == "conda"
+    assert result.abs_path == str(conda_cupti_lib)
diff --git a/cuda_pathfinder/tests/test_optional_cuda_import.py b/cuda_pathfinder/tests/test_optional_cuda_import.py
new file mode 100644
index 0000000000..34cc2a158a
--- /dev/null
+++ b/cuda_pathfinder/tests/test_optional_cuda_import.py
@@ -0,0 +1,67 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+import types
+
+import pytest
+
+import cuda.pathfinder._optional_cuda_import as optional_import_mod
+from cuda.pathfinder import DynamicLibNotFoundError, optional_cuda_import
+
+
+def test_optional_cuda_import_returns_module_when_available(monkeypatch):
+    fake_module = types.SimpleNamespace(__name__="cuda.bindings.nvvm")
+    monkeypatch.setattr(optional_import_mod.importlib, "import_module", lambda _name: fake_module)
+
+    result = optional_cuda_import("cuda.bindings.nvvm")
+
+    assert result is fake_module
+
+
+def test_optional_cuda_import_returns_none_when_module_missing(monkeypatch):
+    def fake_import_module(name):
+        err = ModuleNotFoundError("No module named 'cuda.bindings.nvvm'")
+        err.name = name
+        raise err
+
+    monkeypatch.setattr(optional_import_mod.importlib, "import_module", fake_import_module)
+
+    result = optional_cuda_import("cuda.bindings.nvvm")
+
+    assert result is None
+
+
+def test_optional_cuda_import_reraises_nested_module_not_found(monkeypatch):
+    def fake_import_module(_name):
+        err = ModuleNotFoundError("No module named 'not_a_real_dependency'")
+        err.name = "not_a_real_dependency"
+        raise err
+
+    monkeypatch.setattr(optional_import_mod.importlib, "import_module", fake_import_module)
+
+    with pytest.raises(ModuleNotFoundError, match="not_a_real_dependency") as excinfo:
+        optional_cuda_import("cuda.bindings.nvvm")
+    assert excinfo.value.name == "not_a_real_dependency"
+
+
+def test_optional_cuda_import_returns_none_when_probe_finds_missing_dynamic_lib(monkeypatch):
+    fake_module = types.SimpleNamespace(__name__="cuda.bindings.nvvm")
+    monkeypatch.setattr(optional_import_mod.importlib, "import_module", lambda _name: fake_module)
+
+    def probe(_module):
+        raise DynamicLibNotFoundError("libnvvm missing")
+
+    result = optional_cuda_import("cuda.bindings.nvvm", probe_function=probe)
+
+    assert result is None
+
+
+def test_optional_cuda_import_reraises_non_pathfinder_probe_error(monkeypatch):
+    fake_module = types.SimpleNamespace(__name__="cuda.bindings.nvvm")
+    monkeypatch.setattr(optional_import_mod.importlib, "import_module", lambda _name: fake_module)
+
+    def probe(_module):
+        raise RuntimeError("unexpected probe failure")
+
+    with pytest.raises(RuntimeError, match="unexpected probe failure"):
+        optional_cuda_import("cuda.bindings.nvvm", probe_function=probe)
diff --git a/cuda_python/AGENTS.md b/cuda_python/AGENTS.md
new file mode 100644
index 0000000000..7c4fb9c0b1
--- /dev/null
+++ b/cuda_python/AGENTS.md
@@ -0,0 +1,24 @@
+This file describes `cuda_python`, the metapackage layer in the `cuda-python`
+monorepo.
+
+## Scope
+
+- `cuda_python` is primarily packaging and documentation glue.
+- Unlike the component packages (`cuda_core`, `cuda_bindings`, and
+  `cuda_pathfinder`), it does not host substantial runtime APIs.
+
+## Main files to edit
+
+- `pyproject.toml`: project metadata and dynamic dependency declaration.
+- `setup.py`: dynamic dependency pinning logic for matching `cuda-bindings`
+  versions (release vs pre-release behavior).
+- `docs/`: top-level docs build/aggregation scripts.
+
+## Editing guidance
+
+- Keep this package lightweight; prefer implementing runtime features in the
+  component packages rather than here.
+- Be careful when changing dependency/version logic in `setup.py`; preserve
+  compatibility between metapackage versioning and subpackage constraints.
+- If you update docs structure, ensure `docs/build_all_docs.sh` still collects
+  docs from `cuda_python`, `cuda_bindings`, `cuda_core`, and `cuda_pathfinder`.
diff --git a/cuda_python/CLAUDE.md b/cuda_python/CLAUDE.md
new file mode 120000
index 0000000000..47dc3e3d86
--- /dev/null
+++ b/cuda_python/CLAUDE.md
@@ -0,0 +1 @@
+AGENTS.md
\ No newline at end of file
diff --git a/cuda_python/docs/exts/release_date.py b/cuda_python/docs/exts/release_date.py
new file mode 100644
index 0000000000..5e89b2e648
--- /dev/null
+++ b/cuda_python/docs/exts/release_date.py
@@ -0,0 +1,94 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
+
+"""Sphinx extension to auto-inject release dates from git tags.
+
+For every release-notes page (``release/<version>-notes``), this
+extension looks up the corresponding git tag and injects a
+``Released on <date>`` line after the RST title.  Pages that already
+contain such a line, or whose version has no tag yet, are left
+untouched.
+"""
+
+from __future__ import annotations
+
+import re
+import subprocess
+from datetime import UTC, datetime
+
+from sphinx.application import Sphinx
+
+_RELEASED_ON_RE = re.compile(r"Released on ", re.IGNORECASE)
+_RELEASE_NOTE_RE = re.compile(r"^release/(.+)-notes$")
+_UNDERLINE_RE = re.compile(r"^={3,}[ \t]*$", re.MULTILINE)
+
+# project name (from conf.py) -> git tag prefix
+_TAG_PREFIXES: dict[str, str] = {
+    "cuda.core": "cuda-core-v",
+    "cuda.pathfinder": "cuda-pathfinder-v",
+    "cuda.bindings": "v",
+    "CUDA Python": "v",
+}
+
+
+def _format_date(iso_date: str) -> str:
+    """``2026-03-06`` -> ``Mar 6, 2026``."""
+    dt = datetime.strptime(iso_date, "%Y-%m-%d").replace(tzinfo=UTC)
+    return f"{dt.strftime('%b')} {dt.day}, {dt.year}"
+
+
+def _git_tag_date(tag: str) -> str | None:
+    """Return the creator date (YYYY-MM-DD) for *tag*, or None."""
+    try:
+        result = subprocess.run(  # noqa: S603
+            [  # noqa: S607
+                "git",
+                "for-each-ref",
+                "--format=%(creatordate:short)",
+                f"refs/tags/{tag}",
+            ],
+            capture_output=True,
+            text=True,
+            timeout=5,
+        )
+        date_str = result.stdout.strip()
+    except (subprocess.TimeoutExpired, FileNotFoundError):
+        date_str = ""
+    return date_str or None
+
+
+def _on_source_read(app: Sphinx, docname: str, source: list[str]) -> None:
+    m = _RELEASE_NOTE_RE.match(docname)
+    if not m:
+        return
+
+    text = source[0]
+    if _RELEASED_ON_RE.search(text):
+        return
+
+    version = m.group(1)
+    prefix = _TAG_PREFIXES.get(app.config.project)
+    if prefix is None:
+        return
+
+    tag = prefix + version
+    iso_date = _git_tag_date(tag)
+    if not iso_date:
+        return
+
+    underline = _UNDERLINE_RE.search(text)
+    if not underline:
+        return
+
+    date_line = f"Released on {_format_date(iso_date)}"
+
+    # Insert after the title underline: skip any blank lines, then place
+    # the date line surrounded by single blank lines before the content.
+    after = text[underline.end() :]
+    stripped = after.lstrip("\n")
+    source[0] = text[: underline.end()] + f"\n\n{date_line}\n\n" + stripped
+
+
+def setup(app: Sphinx) -> dict:
+    app.connect("source-read", _on_source_read)
+    return {"version": "1.0", "parallel_read_safe": True}
diff --git a/cuda_python/docs/source/conf.py b/cuda_python/docs/source/conf.py
index b01cf6b2e0..454cec4973 100644
--- a/cuda_python/docs/source/conf.py
+++ b/cuda_python/docs/source/conf.py
@@ -37,6 +37,7 @@
     "myst_nb",
     "enum_tools.autoenum",
     "release_toc",
+    "release_date",
 ]
 
 # Add any paths that contain templates here, relative to this directory.
diff --git a/ruff.toml b/ruff.toml
index 7f3853529e..76f548848c 100644
--- a/ruff.toml
+++ b/ruff.toml
@@ -123,13 +123,15 @@ inline-quotes = "double"
 ]
 
 # CUDA bindings mirror C API naming conventions (CamelCase types, camelCase functions)
-"cuda_bindings/**" = [
+# Examples are deliberately left out of this glob so naming conventions stay enforced for example-local identifiers.
+"cuda_bindings/{benchmarks,cuda,docs,tests}/**" = [
   "N801",    # invalid-class-name
   "N802",    # invalid-function-name
   "N803",    # invalid-argument-name
   "N806",    # non-lowercase-variable-in-function
   "N816",    # mixed-case-variable-in-global-scope
 ]
+"cuda_bindings/{build_hooks.py,setup.py}" = ["N801", "N802", "N803", "N806", "N816"]
 
 # scripts and build tooling — print is the expected output method
 "toolshed/**" = ["T201"]