NVIDIA · rwgk · Mar 6, 2026 · Feb 26, 2026 · Feb 26, 2026 · Feb 26, 2026
diff --git a/cuda_bindings/examples/0_Introduction/clock_nvrtc_test.py b/cuda_bindings/examples/0_Introduction/clock_nvrtc_test.py
@@ -5,7 +5,7 @@
 
 import numpy as np
 from common import common
-from common.helper_cuda import checkCudaErrors, findCudaDevice
+from common.helper_cuda import check_cuda_errors, find_cuda_device
 
 from cuda.bindings import driver as cuda
 
@@ -50,8 +50,8 @@
 }
 """
 
-NUM_BLOCKS = 64
-NUM_THREADS = 256
+num_blocks = 64
+num_threads = 256
 
 
 def elems_to_bytes(nelems, dt):
@@ -64,52 +64,52 @@ def main():
     if platform.machine() == "armv7l":
         pytest.skip("clock_nvrtc is not supported on ARMv7")
 
-    timer = np.empty(NUM_BLOCKS * 2, dtype="int64")
-    hinput = np.empty(NUM_THREADS * 2, dtype="float32")
+    timer = np.empty(num_blocks * 2, dtype="int64")
+    hinput = np.empty(num_threads * 2, dtype="float32")
 
-    for i in range(NUM_THREADS * 2):
+    for i in range(num_threads * 2):
         hinput[i] = i
 
-    devID = findCudaDevice()
-    with common.KernelHelper(clock_nvrtc, devID) as kernelHelper:
-        kernel_addr = kernelHelper.getFunction(b"timedReduction")
-
-        dinput = checkCudaErrors(cuda.cuMemAlloc(hinput.nbytes))
-        doutput = checkCudaErrors(cuda.cuMemAlloc(elems_to_bytes(NUM_BLOCKS, np.float32)))
-        dtimer = checkCudaErrors(cuda.cuMemAlloc(timer.nbytes))
-        checkCudaErrors(cuda.cuMemcpyHtoD(dinput, hinput, hinput.nbytes))
-
-        args = ((dinput, doutput, dtimer), (None, None, None))
-        shared_memory_nbytes = elems_to_bytes(2 * NUM_THREADS, np.float32)
-
-        grid_dims = (NUM_BLOCKS, 1, 1)
-        block_dims = (NUM_THREADS, 1, 1)
-
-        checkCudaErrors(
-            cuda.cuLaunchKernel(
-                kernel_addr,
-                *grid_dims,  # grid dim
-                *block_dims,  # block dim
-                shared_memory_nbytes,
-                0,  # shared mem, stream
-                args,
-                0,
-            )
-        )  # arguments
-
-        checkCudaErrors(cuda.cuCtxSynchronize())
-        checkCudaErrors(cuda.cuMemcpyDtoH(timer, dtimer, timer.nbytes))
-        checkCudaErrors(cuda.cuMemFree(dinput))
-        checkCudaErrors(cuda.cuMemFree(doutput))
-        checkCudaErrors(cuda.cuMemFree(dtimer))
-
-    avgElapsedClocks = 0.0
-
-    for i in range(NUM_BLOCKS):
-        avgElapsedClocks += timer[i + NUM_BLOCKS] - timer[i]
-
-    avgElapsedClocks = avgElapsedClocks / NUM_BLOCKS
-    print(f"Average clocks/block = {avgElapsedClocks}")
+    dev_id = find_cuda_device()
+    kernel_helper = common.KernelHelper(clock_nvrtc, dev_id)
+    kernel_addr = kernel_helper.get_function(b"timedReduction")
+
+    dinput = check_cuda_errors(cuda.cuMemAlloc(hinput.nbytes))
+    doutput = check_cuda_errors(cuda.cuMemAlloc(elems_to_bytes(num_blocks, np.float32)))
+    dtimer = check_cuda_errors(cuda.cuMemAlloc(timer.nbytes))
+    check_cuda_errors(cuda.cuMemcpyHtoD(dinput, hinput, hinput.nbytes))
+
+    args = ((dinput, doutput, dtimer), (None, None, None))
+    shared_memory_nbytes = elems_to_bytes(2 * num_threads, np.float32)
+
+    grid_dims = (num_blocks, 1, 1)
+    block_dims = (num_threads, 1, 1)
+
+    check_cuda_errors(
+        cuda.cuLaunchKernel(
+            kernel_addr,
+            *grid_dims,  # grid dim
+            *block_dims,  # block dim
+            shared_memory_nbytes,
+            0,  # shared mem, stream
+            args,
+            0,
+        )
+    )  # arguments
+
+    check_cuda_errors(cuda.cuCtxSynchronize())
+    check_cuda_errors(cuda.cuMemcpyDtoH(timer, dtimer, timer.nbytes))
+    check_cuda_errors(cuda.cuMemFree(dinput))
+    check_cuda_errors(cuda.cuMemFree(doutput))
+    check_cuda_errors(cuda.cuMemFree(dtimer))
+
+    avg_elapsed_clocks = 0.0
+
+    for i in range(num_blocks):
+        avg_elapsed_clocks += timer[i + num_blocks] - timer[i]
+
+    avg_elapsed_clocks = avg_elapsed_clocks / num_blocks
+    print(f"Average clocks/block = {avg_elapsed_clocks}")
 
 
 if __name__ == "__main__":

diff --git a/cuda_bindings/examples/0_Introduction/simpleCubemapTexture_test.py b/cuda_bindings/examples/0_Introduction/simpleCubemapTexture_test.py
@@ -7,12 +7,12 @@
 
 import numpy as np
 from common import common
-from common.helper_cuda import checkCudaErrors, findCudaDevice
+from common.helper_cuda import check_cuda_errors, find_cuda_device
 
 from cuda.bindings import driver as cuda
 from cuda.bindings import runtime as cudart
 
-simpleCubemapTexture = """\
+simple_cubemap_texture = """\
 extern "C"
 __global__ void transformKernel(float *g_odata, int width, cudaTextureObject_t tex)
 {
@@ -83,14 +83,14 @@
 
 def main():
     # Use command-line specified CUDA device, otherwise use device with highest Gflops/s
-    devID = findCudaDevice()
+    dev_id = find_cuda_device()
 
     # Get number of SMs on this GPU
-    deviceProps = checkCudaErrors(cudart.cudaGetDeviceProperties(devID))
+    device_props = check_cuda_errors(cudart.cudaGetDeviceProperties(dev_id))
     print(
-        f"CUDA device [{deviceProps.name}] has {deviceProps.multiProcessorCount} Multi-Processors SM {deviceProps.major}.{deviceProps.minor}"
+        f"CUDA device [{device_props.name}] has {device_props.multiProcessorCount} Multi-Processors SM {device_props.major}.{device_props.minor}"
     )
-    if deviceProps.major < 2:
+    if device_props.major < 2:
         import pytest
 
         pytest.skip("Test requires SM 2.0 or higher for support of Texture Arrays.")
@@ -107,15 +107,15 @@ def main():
     h_data_ref = np.repeat(np.arange(num_layers, dtype=h_data.dtype), cubemap_size) - h_data
 
     # Allocate device memory for result
-    d_data = checkCudaErrors(cudart.cudaMalloc(size))
+    d_data = check_cuda_errors(cudart.cudaMalloc(size))
 
     # Allocate array and copy image data
-    channelDesc = checkCudaErrors(
+    channel_desc = check_cuda_errors(
         cudart.cudaCreateChannelDesc(32, 0, 0, 0, cudart.cudaChannelFormatKind.cudaChannelFormatKindFloat)
     )
-    cu_3darray = checkCudaErrors(
+    cu_3darray = check_cuda_errors(
         cudart.cudaMalloc3DArray(
-            channelDesc,
+            channel_desc,
             cudart.make_cudaExtent(width, width, num_faces),
             cudart.cudaArrayCubemap,
         )
@@ -128,90 +128,90 @@ def main():
     myparms.dstArray = cu_3darray
     myparms.extent = cudart.make_cudaExtent(width, width, num_faces)
     myparms.kind = cudart.cudaMemcpyKind.cudaMemcpyHostToDevice
-    checkCudaErrors(cudart.cudaMemcpy3D(myparms))
-
-    texRes = cudart.cudaResourceDesc()
-    texRes.resType = cudart.cudaResourceType.cudaResourceTypeArray
-    texRes.res.array.array = cu_3darray
-
-    texDescr = cudart.cudaTextureDesc()
-    texDescr.normalizedCoords = True
-    texDescr.filterMode = cudart.cudaTextureFilterMode.cudaFilterModeLinear
-    texDescr.addressMode[0] = cudart.cudaTextureAddressMode.cudaAddressModeWrap
-    texDescr.addressMode[1] = cudart.cudaTextureAddressMode.cudaAddressModeWrap
-    texDescr.addressMode[2] = cudart.cudaTextureAddressMode.cudaAddressModeWrap
-    texDescr.readMode = cudart.cudaTextureReadMode.cudaReadModeElementType
-
-    tex = checkCudaErrors(cudart.cudaCreateTextureObject(texRes, texDescr, None))
-    dimBlock = cudart.dim3()
-    dimBlock.x = 8
-    dimBlock.y = 8
-    dimBlock.z = 1
-    dimGrid = cudart.dim3()
-    dimGrid.x = width / dimBlock.x
-    dimGrid.y = width / dimBlock.y
-    dimGrid.z = 1
+    check_cuda_errors(cudart.cudaMemcpy3D(myparms))
+
+    tex_res = cudart.cudaResourceDesc()
+    tex_res.resType = cudart.cudaResourceType.cudaResourceTypeArray
+    tex_res.res.array.array = cu_3darray
+
+    tex_descr = cudart.cudaTextureDesc()
+    tex_descr.normalizedCoords = True
+    tex_descr.filterMode = cudart.cudaTextureFilterMode.cudaFilterModeLinear
+    tex_descr.addressMode[0] = cudart.cudaTextureAddressMode.cudaAddressModeWrap
+    tex_descr.addressMode[1] = cudart.cudaTextureAddressMode.cudaAddressModeWrap
+    tex_descr.addressMode[2] = cudart.cudaTextureAddressMode.cudaAddressModeWrap
+    tex_descr.readMode = cudart.cudaTextureReadMode.cudaReadModeElementType
+
+    tex = check_cuda_errors(cudart.cudaCreateTextureObject(tex_res, tex_descr, None))
+    dim_block = cudart.dim3()
+    dim_block.x = 8
+    dim_block.y = 8
+    dim_block.z = 1
+    dim_grid = cudart.dim3()
+    dim_grid.x = width / dim_block.x
+    dim_grid.y = width / dim_block.y
+    dim_grid.z = 1
 
     print(
-        f"Covering Cubemap data array of {width}~3 x {num_layers}: Grid size is {dimGrid.x} x {dimGrid.y}, each block has 8 x 8 threads"
+        f"Covering Cubemap data array of {width}~3 x {num_layers}: Grid size is {dim_grid.x} x {dim_grid.y}, each block has 8 x 8 threads"
     )
 
-    with common.KernelHelper(simpleCubemapTexture, devID) as kernelHelper:
-        _transformKernel = kernelHelper.getFunction(b"transformKernel")
-        kernelArgs = ((d_data, width, tex), (ctypes.c_void_p, ctypes.c_int, None))
-        checkCudaErrors(
-            cuda.cuLaunchKernel(
-                _transformKernel,
-                dimGrid.x,
-                dimGrid.y,
-                dimGrid.z,  # grid dim
-                dimBlock.x,
-                dimBlock.y,
-                dimBlock.z,  # block dim
-                0,
-                0,  # shared mem and stream
-                kernelArgs,
-                0,
-            )
-        )  # arguments
-
-        checkCudaErrors(cudart.cudaDeviceSynchronize())
-
-        start = time.time()
-
-        # Execute the kernel
-        checkCudaErrors(
-            cuda.cuLaunchKernel(
-                _transformKernel,
-                dimGrid.x,
-                dimGrid.y,
-                dimGrid.z,  # grid dim
-                dimBlock.x,
-                dimBlock.y,
-                dimBlock.z,  # block dim
-                0,
-                0,  # shared mem and stream
-                kernelArgs,
-                0,
-            )
-        )  # arguments
-
-        checkCudaErrors(cudart.cudaDeviceSynchronize())
-        stop = time.time()
-        print(f"Processing time: {stop - start:.3f} msec")
-        print(f"{cubemap_size / ((stop - start + 1) / 1000.0) / 1e6:.2f} Mtexlookups/sec")
-
-        # Allocate mem for the result on host side
-        h_odata = np.empty_like(h_data)
-        # Copy result from device to host
-        checkCudaErrors(cudart.cudaMemcpy(h_odata, d_data, size, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost))
-
-    checkCudaErrors(cudart.cudaDestroyTextureObject(tex))
-    checkCudaErrors(cudart.cudaFree(d_data))
-    checkCudaErrors(cudart.cudaFreeArray(cu_3darray))
-
-    MIN_EPSILON_ERROR = 5.0e-3
-    if np.max(np.abs(h_odata - h_data_ref)) > MIN_EPSILON_ERROR:
+    kernel_helper = common.KernelHelper(simple_cubemap_texture, dev_id)
+    _transform_kernel = kernel_helper.get_function(b"transformKernel")
+    kernel_args = ((d_data, width, tex), (ctypes.c_void_p, ctypes.c_int, None))
+    check_cuda_errors(
+        cuda.cuLaunchKernel(
+            _transform_kernel,
+            dim_grid.x,
+            dim_grid.y,
+            dim_grid.z,  # grid dim
+            dim_block.x,
+            dim_block.y,
+            dim_block.z,  # block dim
+            0,
+            0,  # shared mem and stream
+            kernel_args,
+            0,
+        )
+    )  # arguments
+
+    check_cuda_errors(cudart.cudaDeviceSynchronize())
+
+    start = time.time()
+
+    # Execute the kernel
+    check_cuda_errors(
+        cuda.cuLaunchKernel(
+            _transform_kernel,
+            dim_grid.x,
+            dim_grid.y,
+            dim_grid.z,  # grid dim
+            dim_block.x,
+            dim_block.y,
+            dim_block.z,  # block dim
+            0,
+            0,  # shared mem and stream
+            kernel_args,
+            0,
+        )
+    )  # arguments
+
+    check_cuda_errors(cudart.cudaDeviceSynchronize())
+    stop = time.time()
+    print(f"Processing time: {stop - start:.3f} msec")
+    print(f"{cubemap_size / ((stop - start + 1) / 1000.0) / 1e6:.2f} Mtexlookups/sec")
+
+    # Allocate mem for the result on host side
+    h_odata = np.empty_like(h_data)
+    # Copy result from device to host
+    check_cuda_errors(cudart.cudaMemcpy(h_odata, d_data, size, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost))
+
+    check_cuda_errors(cudart.cudaDestroyTextureObject(tex))
+    check_cuda_errors(cudart.cudaFree(d_data))
+    check_cuda_errors(cudart.cudaFreeArray(cu_3darray))
+
+    min_epsilon_error = 5.0e-3
+    if np.max(np.abs(h_odata - h_data_ref)) > min_epsilon_error:
         print("Failed", file=sys.stderr)
         sys.exit(1)