diff --git a/cuda_bindings/examples/0_Introduction/clock_nvrtc_test.py b/cuda_bindings/examples/0_Introduction/clock_nvrtc_test.py index dc1084bea8..d67f180fe0 100644 --- a/cuda_bindings/examples/0_Introduction/clock_nvrtc_test.py +++ b/cuda_bindings/examples/0_Introduction/clock_nvrtc_test.py @@ -5,7 +5,7 @@ import numpy as np from common import common -from common.helper_cuda import checkCudaErrors, findCudaDevice +from common.helper_cuda import check_cuda_errors, find_cuda_device from cuda.bindings import driver as cuda @@ -50,8 +50,8 @@ } """ -NUM_BLOCKS = 64 -NUM_THREADS = 256 +num_blocks = 64 +num_threads = 256 def elems_to_bytes(nelems, dt): @@ -64,52 +64,52 @@ def main(): if platform.machine() == "armv7l": pytest.skip("clock_nvrtc is not supported on ARMv7") - timer = np.empty(NUM_BLOCKS * 2, dtype="int64") - hinput = np.empty(NUM_THREADS * 2, dtype="float32") + timer = np.empty(num_blocks * 2, dtype="int64") + hinput = np.empty(num_threads * 2, dtype="float32") - for i in range(NUM_THREADS * 2): + for i in range(num_threads * 2): hinput[i] = i - devID = findCudaDevice() - with common.KernelHelper(clock_nvrtc, devID) as kernelHelper: - kernel_addr = kernelHelper.getFunction(b"timedReduction") - - dinput = checkCudaErrors(cuda.cuMemAlloc(hinput.nbytes)) - doutput = checkCudaErrors(cuda.cuMemAlloc(elems_to_bytes(NUM_BLOCKS, np.float32))) - dtimer = checkCudaErrors(cuda.cuMemAlloc(timer.nbytes)) - checkCudaErrors(cuda.cuMemcpyHtoD(dinput, hinput, hinput.nbytes)) - - args = ((dinput, doutput, dtimer), (None, None, None)) - shared_memory_nbytes = elems_to_bytes(2 * NUM_THREADS, np.float32) - - grid_dims = (NUM_BLOCKS, 1, 1) - block_dims = (NUM_THREADS, 1, 1) - - checkCudaErrors( - cuda.cuLaunchKernel( - kernel_addr, - *grid_dims, # grid dim - *block_dims, # block dim - shared_memory_nbytes, - 0, # shared mem, stream - args, - 0, - ) - ) # arguments - - checkCudaErrors(cuda.cuCtxSynchronize()) - checkCudaErrors(cuda.cuMemcpyDtoH(timer, dtimer, timer.nbytes)) - 
checkCudaErrors(cuda.cuMemFree(dinput)) - checkCudaErrors(cuda.cuMemFree(doutput)) - checkCudaErrors(cuda.cuMemFree(dtimer)) - - avgElapsedClocks = 0.0 - - for i in range(NUM_BLOCKS): - avgElapsedClocks += timer[i + NUM_BLOCKS] - timer[i] - - avgElapsedClocks = avgElapsedClocks / NUM_BLOCKS - print(f"Average clocks/block = {avgElapsedClocks}") + dev_id = find_cuda_device() + kernel_helper = common.KernelHelper(clock_nvrtc, dev_id) + kernel_addr = kernel_helper.get_function(b"timedReduction") + + dinput = check_cuda_errors(cuda.cuMemAlloc(hinput.nbytes)) + doutput = check_cuda_errors(cuda.cuMemAlloc(elems_to_bytes(num_blocks, np.float32))) + dtimer = check_cuda_errors(cuda.cuMemAlloc(timer.nbytes)) + check_cuda_errors(cuda.cuMemcpyHtoD(dinput, hinput, hinput.nbytes)) + + args = ((dinput, doutput, dtimer), (None, None, None)) + shared_memory_nbytes = elems_to_bytes(2 * num_threads, np.float32) + + grid_dims = (num_blocks, 1, 1) + block_dims = (num_threads, 1, 1) + + check_cuda_errors( + cuda.cuLaunchKernel( + kernel_addr, + *grid_dims, # grid dim + *block_dims, # block dim + shared_memory_nbytes, + 0, # shared mem, stream + args, + 0, + ) + ) # arguments + + check_cuda_errors(cuda.cuCtxSynchronize()) + check_cuda_errors(cuda.cuMemcpyDtoH(timer, dtimer, timer.nbytes)) + check_cuda_errors(cuda.cuMemFree(dinput)) + check_cuda_errors(cuda.cuMemFree(doutput)) + check_cuda_errors(cuda.cuMemFree(dtimer)) + + avg_elapsed_clocks = 0.0 + + for i in range(num_blocks): + avg_elapsed_clocks += timer[i + num_blocks] - timer[i] + + avg_elapsed_clocks = avg_elapsed_clocks / num_blocks + print(f"Average clocks/block = {avg_elapsed_clocks}") if __name__ == "__main__": diff --git a/cuda_bindings/examples/0_Introduction/simpleCubemapTexture_test.py b/cuda_bindings/examples/0_Introduction/simpleCubemapTexture_test.py index 75f1b0800d..5d764509ce 100644 --- a/cuda_bindings/examples/0_Introduction/simpleCubemapTexture_test.py +++ 
b/cuda_bindings/examples/0_Introduction/simpleCubemapTexture_test.py @@ -7,12 +7,12 @@ import numpy as np from common import common -from common.helper_cuda import checkCudaErrors, findCudaDevice +from common.helper_cuda import check_cuda_errors, find_cuda_device from cuda.bindings import driver as cuda from cuda.bindings import runtime as cudart -simpleCubemapTexture = """\ +simple_cubemap_texture = """\ extern "C" __global__ void transformKernel(float *g_odata, int width, cudaTextureObject_t tex) { @@ -83,14 +83,14 @@ def main(): # Use command-line specified CUDA device, otherwise use device with highest Gflops/s - devID = findCudaDevice() + dev_id = find_cuda_device() # Get number of SMs on this GPU - deviceProps = checkCudaErrors(cudart.cudaGetDeviceProperties(devID)) + device_props = check_cuda_errors(cudart.cudaGetDeviceProperties(dev_id)) print( - f"CUDA device [{deviceProps.name}] has {deviceProps.multiProcessorCount} Multi-Processors SM {deviceProps.major}.{deviceProps.minor}" + f"CUDA device [{device_props.name}] has {device_props.multiProcessorCount} Multi-Processors SM {device_props.major}.{device_props.minor}" ) - if deviceProps.major < 2: + if device_props.major < 2: import pytest pytest.skip("Test requires SM 2.0 or higher for support of Texture Arrays.") @@ -107,15 +107,15 @@ def main(): h_data_ref = np.repeat(np.arange(num_layers, dtype=h_data.dtype), cubemap_size) - h_data # Allocate device memory for result - d_data = checkCudaErrors(cudart.cudaMalloc(size)) + d_data = check_cuda_errors(cudart.cudaMalloc(size)) # Allocate array and copy image data - channelDesc = checkCudaErrors( + channel_desc = check_cuda_errors( cudart.cudaCreateChannelDesc(32, 0, 0, 0, cudart.cudaChannelFormatKind.cudaChannelFormatKindFloat) ) - cu_3darray = checkCudaErrors( + cu_3darray = check_cuda_errors( cudart.cudaMalloc3DArray( - channelDesc, + channel_desc, cudart.make_cudaExtent(width, width, num_faces), cudart.cudaArrayCubemap, ) @@ -128,90 +128,90 @@ def main(): 
myparms.dstArray = cu_3darray myparms.extent = cudart.make_cudaExtent(width, width, num_faces) myparms.kind = cudart.cudaMemcpyKind.cudaMemcpyHostToDevice - checkCudaErrors(cudart.cudaMemcpy3D(myparms)) - - texRes = cudart.cudaResourceDesc() - texRes.resType = cudart.cudaResourceType.cudaResourceTypeArray - texRes.res.array.array = cu_3darray - - texDescr = cudart.cudaTextureDesc() - texDescr.normalizedCoords = True - texDescr.filterMode = cudart.cudaTextureFilterMode.cudaFilterModeLinear - texDescr.addressMode[0] = cudart.cudaTextureAddressMode.cudaAddressModeWrap - texDescr.addressMode[1] = cudart.cudaTextureAddressMode.cudaAddressModeWrap - texDescr.addressMode[2] = cudart.cudaTextureAddressMode.cudaAddressModeWrap - texDescr.readMode = cudart.cudaTextureReadMode.cudaReadModeElementType - - tex = checkCudaErrors(cudart.cudaCreateTextureObject(texRes, texDescr, None)) - dimBlock = cudart.dim3() - dimBlock.x = 8 - dimBlock.y = 8 - dimBlock.z = 1 - dimGrid = cudart.dim3() - dimGrid.x = width / dimBlock.x - dimGrid.y = width / dimBlock.y - dimGrid.z = 1 + check_cuda_errors(cudart.cudaMemcpy3D(myparms)) + + tex_res = cudart.cudaResourceDesc() + tex_res.resType = cudart.cudaResourceType.cudaResourceTypeArray + tex_res.res.array.array = cu_3darray + + tex_descr = cudart.cudaTextureDesc() + tex_descr.normalizedCoords = True + tex_descr.filterMode = cudart.cudaTextureFilterMode.cudaFilterModeLinear + tex_descr.addressMode[0] = cudart.cudaTextureAddressMode.cudaAddressModeWrap + tex_descr.addressMode[1] = cudart.cudaTextureAddressMode.cudaAddressModeWrap + tex_descr.addressMode[2] = cudart.cudaTextureAddressMode.cudaAddressModeWrap + tex_descr.readMode = cudart.cudaTextureReadMode.cudaReadModeElementType + + tex = check_cuda_errors(cudart.cudaCreateTextureObject(tex_res, tex_descr, None)) + dim_block = cudart.dim3() + dim_block.x = 8 + dim_block.y = 8 + dim_block.z = 1 + dim_grid = cudart.dim3() + dim_grid.x = width / dim_block.x + dim_grid.y = width / dim_block.y + 
dim_grid.z = 1 print( - f"Covering Cubemap data array of {width}~3 x {num_layers}: Grid size is {dimGrid.x} x {dimGrid.y}, each block has 8 x 8 threads" + f"Covering Cubemap data array of {width}~3 x {num_layers}: Grid size is {dim_grid.x} x {dim_grid.y}, each block has 8 x 8 threads" ) - with common.KernelHelper(simpleCubemapTexture, devID) as kernelHelper: - _transformKernel = kernelHelper.getFunction(b"transformKernel") - kernelArgs = ((d_data, width, tex), (ctypes.c_void_p, ctypes.c_int, None)) - checkCudaErrors( - cuda.cuLaunchKernel( - _transformKernel, - dimGrid.x, - dimGrid.y, - dimGrid.z, # grid dim - dimBlock.x, - dimBlock.y, - dimBlock.z, # block dim - 0, - 0, # shared mem and stream - kernelArgs, - 0, - ) - ) # arguments - - checkCudaErrors(cudart.cudaDeviceSynchronize()) - - start = time.time() - - # Execute the kernel - checkCudaErrors( - cuda.cuLaunchKernel( - _transformKernel, - dimGrid.x, - dimGrid.y, - dimGrid.z, # grid dim - dimBlock.x, - dimBlock.y, - dimBlock.z, # block dim - 0, - 0, # shared mem and stream - kernelArgs, - 0, - ) - ) # arguments - - checkCudaErrors(cudart.cudaDeviceSynchronize()) - stop = time.time() - print(f"Processing time: {stop - start:.3f} msec") - print(f"{cubemap_size / ((stop - start + 1) / 1000.0) / 1e6:.2f} Mtexlookups/sec") - - # Allocate mem for the result on host side - h_odata = np.empty_like(h_data) - # Copy result from device to host - checkCudaErrors(cudart.cudaMemcpy(h_odata, d_data, size, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost)) - - checkCudaErrors(cudart.cudaDestroyTextureObject(tex)) - checkCudaErrors(cudart.cudaFree(d_data)) - checkCudaErrors(cudart.cudaFreeArray(cu_3darray)) - - MIN_EPSILON_ERROR = 5.0e-3 - if np.max(np.abs(h_odata - h_data_ref)) > MIN_EPSILON_ERROR: + kernel_helper = common.KernelHelper(simple_cubemap_texture, dev_id) + _transform_kernel = kernel_helper.get_function(b"transformKernel") + kernel_args = ((d_data, width, tex), (ctypes.c_void_p, ctypes.c_int, None)) + 
check_cuda_errors( + cuda.cuLaunchKernel( + _transform_kernel, + dim_grid.x, + dim_grid.y, + dim_grid.z, # grid dim + dim_block.x, + dim_block.y, + dim_block.z, # block dim + 0, + 0, # shared mem and stream + kernel_args, + 0, + ) + ) # arguments + + check_cuda_errors(cudart.cudaDeviceSynchronize()) + + start = time.time() + + # Execute the kernel + check_cuda_errors( + cuda.cuLaunchKernel( + _transform_kernel, + dim_grid.x, + dim_grid.y, + dim_grid.z, # grid dim + dim_block.x, + dim_block.y, + dim_block.z, # block dim + 0, + 0, # shared mem and stream + kernel_args, + 0, + ) + ) # arguments + + check_cuda_errors(cudart.cudaDeviceSynchronize()) + stop = time.time() + print(f"Processing time: {stop - start:.3f} msec") + print(f"{cubemap_size / ((stop - start + 1) / 1000.0) / 1e6:.2f} Mtexlookups/sec") + + # Allocate mem for the result on host side + h_odata = np.empty_like(h_data) + # Copy result from device to host + check_cuda_errors(cudart.cudaMemcpy(h_odata, d_data, size, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost)) + + check_cuda_errors(cudart.cudaDestroyTextureObject(tex)) + check_cuda_errors(cudart.cudaFree(d_data)) + check_cuda_errors(cudart.cudaFreeArray(cu_3darray)) + + min_epsilon_error = 5.0e-3 + if np.max(np.abs(h_odata - h_data_ref)) > min_epsilon_error: print("Failed", file=sys.stderr) sys.exit(1) diff --git a/cuda_bindings/examples/0_Introduction/simpleP2P_test.py b/cuda_bindings/examples/0_Introduction/simpleP2P_test.py index a60dbac5bc..09dafa1be1 100644 --- a/cuda_bindings/examples/0_Introduction/simpleP2P_test.py +++ b/cuda_bindings/examples/0_Introduction/simpleP2P_test.py @@ -7,7 +7,7 @@ import numpy as np from common import common -from common.helper_cuda import checkCudaErrors +from common.helper_cuda import check_cuda_errors from cuda.bindings import driver as cuda from cuda.bindings import runtime as cudart @@ -41,24 +41,24 @@ def main(): # Number of GPUs print("Checking for multiple GPUs...") - gpu_n = 
checkCudaErrors(cudart.cudaGetDeviceCount()) + gpu_n = check_cuda_errors(cudart.cudaGetDeviceCount()) print(f"CUDA-capable device count: {gpu_n}") if gpu_n < 2: pytest.skip("Two or more GPUs with Peer-to-Peer access capability are required") - prop = [checkCudaErrors(cudart.cudaGetDeviceProperties(i)) for i in range(gpu_n)] + prop = [check_cuda_errors(cudart.cudaGetDeviceProperties(i)) for i in range(gpu_n)] # Check possibility for peer access print("\nChecking GPU(s) for support of peer to peer memory access...") - p2pCapableGPUs = [-1, -1] + p2p_capable_gpus = [-1, -1] for i in range(gpu_n): - p2pCapableGPUs[0] = i + p2p_capable_gpus[0] = i for j in range(gpu_n): if i == j: continue - i_access_j = checkCudaErrors(cudart.cudaDeviceCanAccessPeer(i, j)) - j_access_i = checkCudaErrors(cudart.cudaDeviceCanAccessPeer(j, i)) + i_access_j = check_cuda_errors(cudart.cudaDeviceCanAccessPeer(i, j)) + j_access_i = check_cuda_errors(cudart.cudaDeviceCanAccessPeer(j, i)) print( "> Peer access from {} (GPU{}) -> {} (GPU{}) : {}\n".format( prop[i].name, i, prop[j].name, j, "Yes" if i_access_j else "No" ) ) @@ -70,54 +70,54 @@ def main(): ) ) if i_access_j and j_access_i: - p2pCapableGPUs[1] = j + p2p_capable_gpus[1] = j break - if p2pCapableGPUs[1] != -1: + if p2p_capable_gpus[1] != -1: break - if p2pCapableGPUs[0] == -1 or p2pCapableGPUs[1] == -1: + if p2p_capable_gpus[0] == -1 or p2p_capable_gpus[1] == -1: pytest.skip("Peer to Peer access is not available amongst GPUs in the system") # Use first pair of p2p capable GPUs detected - gpuid = [p2pCapableGPUs[0], p2pCapableGPUs[1]] + gpuid = [p2p_capable_gpus[0], p2p_capable_gpus[1]] # Enable peer access print(f"Enabling peer access between GPU{gpuid[0]} and GPU{gpuid[1]}...") - checkCudaErrors(cudart.cudaSetDevice(gpuid[0])) - checkCudaErrors(cudart.cudaDeviceEnablePeerAccess(gpuid[1], 0)) - checkCudaErrors(cudart.cudaSetDevice(gpuid[1])) - checkCudaErrors(cudart.cudaDeviceEnablePeerAccess(gpuid[0], 0)) + 
check_cuda_errors(cudart.cudaSetDevice(gpuid[0])) + check_cuda_errors(cudart.cudaDeviceEnablePeerAccess(gpuid[1], 0)) + check_cuda_errors(cudart.cudaSetDevice(gpuid[1])) + check_cuda_errors(cudart.cudaDeviceEnablePeerAccess(gpuid[0], 0)) # Allocate buffers buf_size = 1024 * 1024 * 16 * np.dtype(np.float32).itemsize print(f"Allocating buffers ({int(buf_size / 1024 / 1024)}MB on GPU{gpuid[0]}, GPU{gpuid[1]} and CPU Host)...") - checkCudaErrors(cudart.cudaSetDevice(gpuid[0])) - g0 = checkCudaErrors(cudart.cudaMalloc(buf_size)) - checkCudaErrors(cudart.cudaSetDevice(gpuid[1])) - g1 = checkCudaErrors(cudart.cudaMalloc(buf_size)) - h0 = checkCudaErrors(cudart.cudaMallocHost(buf_size)) # Automatically portable with UVA + check_cuda_errors(cudart.cudaSetDevice(gpuid[0])) + g0 = check_cuda_errors(cudart.cudaMalloc(buf_size)) + check_cuda_errors(cudart.cudaSetDevice(gpuid[1])) + g1 = check_cuda_errors(cudart.cudaMalloc(buf_size)) + h0 = check_cuda_errors(cudart.cudaMallocHost(buf_size)) # Automatically portable with UVA # Create CUDA event handles print("Creating event handles...") eventflags = cudart.cudaEventBlockingSync - start_event = checkCudaErrors(cudart.cudaEventCreateWithFlags(eventflags)) - stop_event = checkCudaErrors(cudart.cudaEventCreateWithFlags(eventflags)) + start_event = check_cuda_errors(cudart.cudaEventCreateWithFlags(eventflags)) + stop_event = check_cuda_errors(cudart.cudaEventCreateWithFlags(eventflags)) # P2P memcopy() benchmark - checkCudaErrors(cudart.cudaEventRecord(start_event, cudart.cudaStream_t(0))) + check_cuda_errors(cudart.cudaEventRecord(start_event, cudart.cudaStream_t(0))) for i in range(100): # With UVA we don't need to specify source and target devices, the # runtime figures this out by itself from the pointers # Ping-pong copy between GPUs if i % 2 == 0: - checkCudaErrors(cudart.cudaMemcpy(g1, g0, buf_size, cudart.cudaMemcpyKind.cudaMemcpyDefault)) + check_cuda_errors(cudart.cudaMemcpy(g1, g0, buf_size, 
cudart.cudaMemcpyKind.cudaMemcpyDefault)) else: - checkCudaErrors(cudart.cudaMemcpy(g0, g1, buf_size, cudart.cudaMemcpyKind.cudaMemcpyDefault)) + check_cuda_errors(cudart.cudaMemcpy(g0, g1, buf_size, cudart.cudaMemcpyKind.cudaMemcpyDefault)) - checkCudaErrors(cudart.cudaEventRecord(stop_event, cudart.cudaStream_t(0))) - checkCudaErrors(cudart.cudaEventSynchronize(stop_event)) - time_memcpy = checkCudaErrors(cudart.cudaEventElapsedTime(start_event, stop_event)) + check_cuda_errors(cudart.cudaEventRecord(stop_event, cudart.cudaStream_t(0))) + check_cuda_errors(cudart.cudaEventSynchronize(stop_event)) + time_memcpy = check_cuda_errors(cudart.cudaEventElapsedTime(start_event, stop_event)) print( f"cudaMemcpyPeer / cudaMemcpy between GPU{gpuid[0]} and GPU{gpuid[1]}: {(1.0 / (time_memcpy / 1000.0)) * (100.0 * buf_size) / 1024.0 / 1024.0 / 1024.0:.2f}GB/s" ) @@ -129,8 +129,8 @@ def main(): for i in range(int(buf_size / np.dtype(np.float32).itemsize)): h0_local[i] = i % 4096 - checkCudaErrors(cudart.cudaSetDevice(gpuid[0])) - checkCudaErrors(cudart.cudaMemcpy(g0, h0, buf_size, cudart.cudaMemcpyKind.cudaMemcpyDefault)) + check_cuda_errors(cudart.cudaSetDevice(gpuid[0])) + check_cuda_errors(cudart.cudaMemcpy(g0, h0, buf_size, cudart.cudaMemcpyKind.cudaMemcpyDefault)) # Kernel launch configuration threads = cudart.dim3() @@ -145,57 +145,61 @@ def main(): # Run kernel on GPU 1, reading input from the GPU 0 buffer, writing # output to the GPU 1 buffer print(f"Run kernel on GPU{gpuid[1]}, taking source data from GPU{gpuid[0]} and writing to GPU{gpuid[1]}...") - checkCudaErrors(cudart.cudaSetDevice(gpuid[1])) - - with common.KernelHelper(simplep2p, gpuid[1]) as kernelHelper: - simple_kernel_1 = kernelHelper.getFunction(b"SimpleKernel") - kernel_args_1 = ((g0, g1), (ctypes.c_void_p, ctypes.c_void_p)) - checkCudaErrors( - cuda.cuLaunchKernel( - simple_kernel_1, - blocks.x, - blocks.y, - blocks.z, - threads.x, - threads.y, - threads.z, - 0, - 0, - kernel_args_1, - 0, - ) + 
check_cuda_errors(cudart.cudaSetDevice(gpuid[1])) + + kernel_helper = [None] * 2 + _simple_kernel = [None] * 2 + kernel_args = [None] * 2 + + kernel_helper[1] = common.KernelHelper(simplep2p, gpuid[1]) + _simple_kernel[1] = kernel_helper[1].get_function(b"SimpleKernel") + kernel_args[1] = ((g0, g1), (ctypes.c_void_p, ctypes.c_void_p)) + check_cuda_errors( + cuda.cuLaunchKernel( + _simple_kernel[1], + blocks.x, + blocks.y, + blocks.z, + threads.x, + threads.y, + threads.z, + 0, + 0, + kernel_args[1], + 0, ) + ) - checkCudaErrors(cudart.cudaDeviceSynchronize()) + check_cuda_errors(cudart.cudaDeviceSynchronize()) # Run kernel on GPU 0, reading input from the GPU 1 buffer, writing # output to the GPU 0 buffer print(f"Run kernel on GPU{gpuid[0]}, taking source data from GPU{gpuid[1]} and writing to GPU{gpuid[0]}...") - checkCudaErrors(cudart.cudaSetDevice(gpuid[0])) - with common.KernelHelper(simplep2p, gpuid[0]) as kernelHelper: - simple_kernel_0 = kernelHelper.getFunction(b"SimpleKernel") - kernel_args_0 = ((g1, g0), (ctypes.c_void_p, ctypes.c_void_p)) - checkCudaErrors( - cuda.cuLaunchKernel( - simple_kernel_0, - blocks.x, - blocks.y, - blocks.z, - threads.x, - threads.y, - threads.z, - 0, - 0, - kernel_args_0, - 0, - ) + check_cuda_errors(cudart.cudaSetDevice(gpuid[0])) + kernel_helper[0] = common.KernelHelper(simplep2p, gpuid[0]) + _simple_kernel[0] = kernel_helper[0].get_function(b"SimpleKernel") + kernel_args[0] = ((g1, g0), (ctypes.c_void_p, ctypes.c_void_p)) + check_cuda_errors( + cuda.cuLaunchKernel( + _simple_kernel[0], + blocks.x, + blocks.y, + blocks.z, + threads.x, + threads.y, + threads.z, + 0, + 0, + kernel_args[0], + 0, ) + ) - checkCudaErrors(cudart.cudaDeviceSynchronize()) + check_cuda_errors(cudart.cudaDeviceSynchronize()) # Copy data back to host and verify print(f"Copy data back to host from GPU{gpuid[0]} and verify results...") - checkCudaErrors(cudart.cudaMemcpy(h0, g0, buf_size, cudart.cudaMemcpyKind.cudaMemcpyDefault)) + 
check_cuda_errors(cudart.cudaMemcpy(h0, g0, buf_size, cudart.cudaMemcpyKind.cudaMemcpyDefault)) error_count = 0 @@ -210,23 +214,23 @@ def main(): # Disable peer access (also unregisters memory for non-UVA cases) print("Disabling peer access...") - checkCudaErrors(cudart.cudaSetDevice(gpuid[0])) - checkCudaErrors(cudart.cudaDeviceDisablePeerAccess(gpuid[1])) - checkCudaErrors(cudart.cudaSetDevice(gpuid[1])) - checkCudaErrors(cudart.cudaDeviceDisablePeerAccess(gpuid[0])) + check_cuda_errors(cudart.cudaSetDevice(gpuid[0])) + check_cuda_errors(cudart.cudaDeviceDisablePeerAccess(gpuid[1])) + check_cuda_errors(cudart.cudaSetDevice(gpuid[1])) + check_cuda_errors(cudart.cudaDeviceDisablePeerAccess(gpuid[0])) # Cleanup and shutdown print("Shutting down...") - checkCudaErrors(cudart.cudaEventDestroy(start_event)) - checkCudaErrors(cudart.cudaEventDestroy(stop_event)) - checkCudaErrors(cudart.cudaSetDevice(gpuid[0])) - checkCudaErrors(cudart.cudaFree(g0)) - checkCudaErrors(cudart.cudaSetDevice(gpuid[1])) - checkCudaErrors(cudart.cudaFree(g1)) - checkCudaErrors(cudart.cudaFreeHost(h0)) + check_cuda_errors(cudart.cudaEventDestroy(start_event)) + check_cuda_errors(cudart.cudaEventDestroy(stop_event)) + check_cuda_errors(cudart.cudaSetDevice(gpuid[0])) + check_cuda_errors(cudart.cudaFree(g0)) + check_cuda_errors(cudart.cudaSetDevice(gpuid[1])) + check_cuda_errors(cudart.cudaFree(g1)) + check_cuda_errors(cudart.cudaFreeHost(h0)) for i in range(gpu_n): - checkCudaErrors(cudart.cudaSetDevice(i)) + check_cuda_errors(cudart.cudaSetDevice(i)) if error_count != 0: print("Test failed!", file=sys.stderr) diff --git a/cuda_bindings/examples/0_Introduction/simpleZeroCopy_test.py b/cuda_bindings/examples/0_Introduction/simpleZeroCopy_test.py index ea64017b95..d4bf44e19a 100644 --- a/cuda_bindings/examples/0_Introduction/simpleZeroCopy_test.py +++ b/cuda_bindings/examples/0_Introduction/simpleZeroCopy_test.py @@ -9,13 +9,13 @@ import numpy as np from common import common -from 
common.helper_cuda import checkCudaErrors -from common.helper_string import checkCmdLineFlag, getCmdLineArgumentInt +from common.helper_cuda import check_cuda_errors +from common.helper_string import check_cmd_line_flag, get_cmd_line_argument_int from cuda.bindings import driver as cuda from cuda.bindings import runtime as cudart -simpleZeroCopy = """\ +simple_zero_copy = """\ extern "C" __global__ void vectorAddGPU(float *a, float *b, float *c, int N) { @@ -31,7 +31,7 @@ def main(): idev = 0 - bPinGenericMemory = False + b_pin_generic_memory = False import pytest @@ -47,7 +47,7 @@ def main(): if platform.machine() == "sbsa": pytest.skip("simpleZeroCopy is not supported on sbsa") - if checkCmdLineFlag("help"): + if check_cmd_line_flag("help"): print("Usage: simpleZeroCopy [OPTION]\n", file=sys.stderr) print("Options:", file=sys.stderr) print(" device=[device #] Specify the device to be used", file=sys.stderr) @@ -55,50 +55,50 @@ def main(): sys.exit(1) # Get the device selected by the user or default to 0, and then set it. - if checkCmdLineFlag("device="): - deviceCount = cudart.cudaGetDeviceCount() - idev = int(getCmdLineArgumentInt("device=")) + if check_cmd_line_flag("device="): + device_count = cudart.cudaGetDeviceCount() + idev = int(get_cmd_line_argument_int("device=")) - if idev >= deviceCount or idev < 0: + if idev >= device_count or idev < 0: print(f"Device number {idev} is invalid, will use default CUDA device 0.") idev = 0 - if checkCmdLineFlag("use_generic_memory"): - bPinGenericMemory = True + if check_cmd_line_flag("use_generic_memory"): + b_pin_generic_memory = True - if bPinGenericMemory: + if b_pin_generic_memory: print("> Using Generic System Paged Memory (malloc)") else: print("> Using CUDA Host Allocated (cudaHostAlloc)") - checkCudaErrors(cudart.cudaSetDevice(idev)) + check_cuda_errors(cudart.cudaSetDevice(idev)) # Verify the selected device supports mapped memory and set the device flags for mapping host memory. 
- deviceProp = checkCudaErrors(cudart.cudaGetDeviceProperties(idev)) + device_prop = check_cuda_errors(cudart.cudaGetDeviceProperties(idev)) - if not deviceProp.canMapHostMemory: + if not device_prop.canMapHostMemory: pytest.skip(f"Device {idev} does not support mapping CPU host memory!") - checkCudaErrors(cudart.cudaSetDeviceFlags(cudart.cudaDeviceMapHost)) + check_cuda_errors(cudart.cudaSetDeviceFlags(cudart.cudaDeviceMapHost)) # Allocate mapped CPU memory nelem = 1048576 num_bytes = nelem * np.dtype(np.float32).itemsize - if bPinGenericMemory: + if b_pin_generic_memory: a = np.empty(nelem, dtype=np.float32) b = np.empty(nelem, dtype=np.float32) c = np.empty(nelem, dtype=np.float32) - checkCudaErrors(cudart.cudaHostRegister(a, num_bytes, cudart.cudaHostRegisterMapped)) - checkCudaErrors(cudart.cudaHostRegister(b, num_bytes, cudart.cudaHostRegisterMapped)) - checkCudaErrors(cudart.cudaHostRegister(c, num_bytes, cudart.cudaHostRegisterMapped)) + check_cuda_errors(cudart.cudaHostRegister(a, num_bytes, cudart.cudaHostRegisterMapped)) + check_cuda_errors(cudart.cudaHostRegister(b, num_bytes, cudart.cudaHostRegisterMapped)) + check_cuda_errors(cudart.cudaHostRegister(c, num_bytes, cudart.cudaHostRegisterMapped)) else: flags = cudart.cudaHostAllocMapped - a_ptr = checkCudaErrors(cudart.cudaHostAlloc(num_bytes, flags)) - b_ptr = checkCudaErrors(cudart.cudaHostAlloc(num_bytes, flags)) - c_ptr = checkCudaErrors(cudart.cudaHostAlloc(num_bytes, flags)) + a_ptr = check_cuda_errors(cudart.cudaHostAlloc(num_bytes, flags)) + b_ptr = check_cuda_errors(cudart.cudaHostAlloc(num_bytes, flags)) + c_ptr = check_cuda_errors(cudart.cudaHostAlloc(num_bytes, flags)) a = (ctypes.c_float * nelem).from_address(a_ptr) b = (ctypes.c_float * nelem).from_address(b_ptr) @@ -110,9 +110,9 @@ def main(): b[n] = rnd.random() # Get the device pointers for the pinned CPU memory mapped into the GPU memory space - d_a = checkCudaErrors(cudart.cudaHostGetDevicePointer(a, 0)) - d_b = 
checkCudaErrors(cudart.cudaHostGetDevicePointer(b, 0)) - d_c = checkCudaErrors(cudart.cudaHostGetDevicePointer(c, 0)) + d_a = check_cuda_errors(cudart.cudaHostGetDevicePointer(a, 0)) + d_b = check_cuda_errors(cudart.cudaHostGetDevicePointer(b, 0)) + d_c = check_cuda_errors(cudart.cudaHostGetDevicePointer(c, 0)) # Call the GPU kernel using the CPU pointers residing in CPU mapped memory print("> vectorAddGPU kernel will add vectors using mapped CPU memory...") @@ -124,57 +124,57 @@ def main(): grid.x = math.ceil(nelem / float(block.x)) grid.y = 1 grid.z = 1 - with common.KernelHelper(simpleZeroCopy, idev) as kernelHelper: - _vectorAddGPU = kernelHelper.getFunction(b"vectorAddGPU") - kernelArgs = ( - (d_a, d_b, d_c, nelem), - (ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int), + kernel_helper = common.KernelHelper(simple_zero_copy, idev) + _vector_add_gpu = kernel_helper.get_function(b"vectorAddGPU") + kernel_args = ( + (d_a, d_b, d_c, nelem), + (ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int), + ) + check_cuda_errors( + cuda.cuLaunchKernel( + _vector_add_gpu, + grid.x, + grid.y, + grid.z, + block.x, + block.y, + block.z, + 0, + cuda.CU_STREAM_LEGACY, + kernel_args, + 0, ) - checkCudaErrors( - cuda.cuLaunchKernel( - _vectorAddGPU, - grid.x, - grid.y, - grid.z, - block.x, - block.y, - block.z, - 0, - cuda.CU_STREAM_LEGACY, - kernelArgs, - 0, - ) - ) - checkCudaErrors(cudart.cudaDeviceSynchronize()) + ) + check_cuda_errors(cudart.cudaDeviceSynchronize()) print("> Checking the results from vectorAddGPU() ...") # Compare the results - errorNorm = 0.0 - refNorm = 0.0 + error_norm = 0.0 + ref_norm = 0.0 for n in range(nelem): ref = a[n] + b[n] diff = c[n] - ref - errorNorm += diff * diff - refNorm += ref * ref + error_norm += diff * diff + ref_norm += ref * ref - errorNorm = math.sqrt(errorNorm) - refNorm = math.sqrt(refNorm) + error_norm = math.sqrt(error_norm) + ref_norm = math.sqrt(ref_norm) # Memory clean up print("Releasing CPU 
memory...") - if bPinGenericMemory: - checkCudaErrors(cudart.cudaHostUnregister(a)) - checkCudaErrors(cudart.cudaHostUnregister(b)) - checkCudaErrors(cudart.cudaHostUnregister(c)) + if b_pin_generic_memory: + check_cuda_errors(cudart.cudaHostUnregister(a)) + check_cuda_errors(cudart.cudaHostUnregister(b)) + check_cuda_errors(cudart.cudaHostUnregister(c)) else: - checkCudaErrors(cudart.cudaFreeHost(a)) - checkCudaErrors(cudart.cudaFreeHost(b)) - checkCudaErrors(cudart.cudaFreeHost(c)) + check_cuda_errors(cudart.cudaFreeHost(a)) + check_cuda_errors(cudart.cudaFreeHost(b)) + check_cuda_errors(cudart.cudaFreeHost(c)) - if errorNorm / refNorm >= 1.0e-7: + if error_norm / ref_norm >= 1.0e-7: print("FAILED", file=sys.stderr) sys.exit(1) diff --git a/cuda_bindings/examples/0_Introduction/systemWideAtomics_test.py b/cuda_bindings/examples/0_Introduction/systemWideAtomics_test.py index df52462854..94a356101f 100644 --- a/cuda_bindings/examples/0_Introduction/systemWideAtomics_test.py +++ b/cuda_bindings/examples/0_Introduction/systemWideAtomics_test.py @@ -7,12 +7,12 @@ import numpy as np from common import common -from common.helper_cuda import checkCudaErrors, findCudaDevice +from common.helper_cuda import check_cuda_errors, find_cuda_device from cuda.bindings import driver as cuda from cuda.bindings import runtime as cudart -systemWideAtomics = """\ +system_wide_atomics = """\ #define LOOP_NUM 50 extern "C" @@ -63,21 +63,21 @@ #! @param reference reference data, computed but preallocated #! @param idata input data as provided to device #! 
@param len number of elements in reference / idata -def verify(testData, length): +def verify(test_data, length): val = 0 for i in range(length * LOOP_NUM): val += 10 - if val != testData[0]: - print(f"atomicAdd failed val = {val} testData = {testData[0]}") + if val != test_data[0]: + print(f"atomicAdd failed val = {val} test_data = {test_data[0]}") return False val = 0 found = False for i in range(length): # second element should be a member of [0, len) - if i == testData[1]: + if i == test_data[1]: found = True break @@ -91,7 +91,7 @@ def verify(testData, length): # third element should be len-1 val = max(val, i) - if val != testData[2]: + if val != test_data[2]: print("atomicMax failed") return False @@ -100,7 +100,7 @@ def verify(testData, length): for i in range(length): val = min(val, i) - if val != testData[3]: + if val != test_data[3]: print("atomicMin failed") return False @@ -110,7 +110,7 @@ def verify(testData, length): for i in range(length * LOOP_NUM): val = 0 if val >= limit else val + 1 - if val != testData[4]: + if val != test_data[4]: print("atomicInc failed") return False @@ -120,7 +120,7 @@ def verify(testData, length): for i in range(length * LOOP_NUM): val = limit if (val == 0) or (val > limit) else val - 1 - if val != testData[5]: + if val != test_data[5]: print("atomicDec failed") return False @@ -128,7 +128,7 @@ def verify(testData, length): for i in range(length): # seventh element should be a member of [0, len) - if i == testData[6]: + if i == test_data[6]: found = True break @@ -142,13 +142,13 @@ def verify(testData, length): # 8th element should be 1 val &= 2 * i + 7 - if val != testData[7]: + if val != test_data[7]: print("atomicAnd failed") return False # 9th element should be 0xff val = -1 - if val != testData[8]: + if val != test_data[8]: print("atomicOr failed") return False @@ -158,7 +158,7 @@ def verify(testData, length): # 11th element should be 0xff val ^= i - if val != testData[9]: + if val != test_data[9]: print("atomicXor 
failed") return False @@ -172,72 +172,74 @@ def main(): pytest.skip("Atomics not supported on Windows") # set device - dev_id = findCudaDevice() - device_prop = checkCudaErrors(cudart.cudaGetDeviceProperties(dev_id)) + dev_id = find_cuda_device() + device_prop = check_cuda_errors(cudart.cudaGetDeviceProperties(dev_id)) if not device_prop.managedMemory: pytest.skip("Unified Memory not supported on this device") - computeMode = checkCudaErrors(cudart.cudaDeviceGetAttribute(cudart.cudaDeviceAttr.cudaDevAttrComputeMode, dev_id)) - if computeMode == cudart.cudaComputeMode.cudaComputeModeProhibited: + compute_mode = check_cuda_errors( + cudart.cudaDeviceGetAttribute(cudart.cudaDeviceAttr.cudaDevAttrComputeMode, dev_id) + ) + if compute_mode == cudart.cudaComputeMode.cudaComputeModeProhibited: pytest.skip("This sample requires a device in either default or process exclusive mode") if device_prop.major < 6: pytest.skip("Requires a minimum CUDA compute 6.0 capability") - numThreads = 256 - numBlocks = 64 - numData = 10 + num_threads = 256 + num_blocks = 64 + num_data = 10 if device_prop.pageableMemoryAccess: print("CAN access pageable memory") - atom_arr_h = (ctypes.c_int * numData)(0) + atom_arr_h = (ctypes.c_int * num_data)(0) atom_arr = ctypes.addressof(atom_arr_h) else: print("CANNOT access pageable memory") - atom_arr = checkCudaErrors( - cudart.cudaMallocManaged(np.dtype(np.int32).itemsize * numData, cudart.cudaMemAttachGlobal) + atom_arr = check_cuda_errors( + cudart.cudaMallocManaged(np.dtype(np.int32).itemsize * num_data, cudart.cudaMemAttachGlobal) ) - atom_arr_h = (ctypes.c_int * numData).from_address(atom_arr) + atom_arr_h = (ctypes.c_int * num_data).from_address(atom_arr) - for i in range(numData): + for i in range(num_data): atom_arr_h[i] = 0 # To make the AND and XOR tests generate something other than 0... 
atom_arr_h[7] = atom_arr_h[9] = 0xFF - with common.KernelHelper(systemWideAtomics, dev_id) as kernelHelper: - _atomicKernel = kernelHelper.getFunction(b"atomicKernel") - kernelArgs = ((atom_arr,), (ctypes.c_void_p,)) - checkCudaErrors( - cuda.cuLaunchKernel( - _atomicKernel, - numBlocks, - 1, - 1, # grid dim - numThreads, - 1, - 1, # block dim - 0, - cuda.CU_STREAM_LEGACY, # shared mem and stream - kernelArgs, - 0, - ) - ) # arguments + kernel_helper = common.KernelHelper(system_wide_atomics, dev_id) + _atomic_kernel = kernel_helper.get_function(b"atomicKernel") + kernel_args = ((atom_arr,), (ctypes.c_void_p,)) + check_cuda_errors( + cuda.cuLaunchKernel( + _atomic_kernel, + num_blocks, + 1, + 1, # grid dim + num_threads, + 1, + 1, # block dim + 0, + cuda.CU_STREAM_LEGACY, # shared mem and stream + kernel_args, + 0, + ) + ) # arguments # NOTE: Python doesn't have an equivalent system atomic operations # atomicKernel_CPU(atom_arr_h, numBlocks * numThreads) - checkCudaErrors(cudart.cudaDeviceSynchronize()) + check_cuda_errors(cudart.cudaDeviceSynchronize()) # Compute & verify reference solution - testResult = verify(atom_arr_h, numThreads * numBlocks) + test_result = verify(atom_arr_h, num_threads * num_blocks) if device_prop.pageableMemoryAccess: pass else: - checkCudaErrors(cudart.cudaFree(atom_arr)) + check_cuda_errors(cudart.cudaFree(atom_arr)) - if not testResult: + if not test_result: print("systemWideAtomics completed with errors", file=sys.stderr) sys.exit(1) diff --git a/cuda_bindings/examples/0_Introduction/vectorAddDrv_test.py b/cuda_bindings/examples/0_Introduction/vectorAddDrv_test.py index 8ee238e36b..8c70aadd3a 100644 --- a/cuda_bindings/examples/0_Introduction/vectorAddDrv_test.py +++ b/cuda_bindings/examples/0_Introduction/vectorAddDrv_test.py @@ -7,11 +7,11 @@ import numpy as np from common import common -from common.helper_cuda import checkCudaErrors, findCudaDeviceDRV +from common.helper_cuda import check_cuda_errors, find_cuda_device_drv from 
cuda.bindings import driver as cuda -vectorAddDrv = """\ +vector_add_drv = """\ /* Vector addition: C = A + B. * * This sample is a very basic sample that implements element by element @@ -32,82 +32,82 @@ def main(): - N = 50000 - nbytes = N * np.dtype(np.float32).itemsize + n = 50000 + nbytes = n * np.dtype(np.float32).itemsize # Initialize - checkCudaErrors(cuda.cuInit(0)) - cuDevice = findCudaDeviceDRV() + check_cuda_errors(cuda.cuInit(0)) + cu_device = find_cuda_device_drv() # Create context - cuContext = checkCudaErrors(cuda.cuCtxCreate(None, 0, cuDevice)) + cu_context = check_cuda_errors(cuda.cuCtxCreate(None, 0, cu_device)) - uvaSupported = checkCudaErrors( - cuda.cuDeviceGetAttribute(cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, cuDevice) + uva_supported = check_cuda_errors( + cuda.cuDeviceGetAttribute(cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, cu_device) ) - if not uvaSupported: + if not uva_supported: import pytest pytest.skip("Accessing pageable memory directly requires UVA") - with common.KernelHelper(vectorAddDrv, int(cuDevice)) as kernelHelper: - _VecAdd_kernel = kernelHelper.getFunction(b"VecAdd_kernel") - - # Allocate input vectors h_A and h_B in host memory - h_A = np.random.rand(N).astype(dtype=np.float32) - h_B = np.random.rand(N).astype(dtype=np.float32) - h_C = np.random.rand(N).astype(dtype=np.float32) - - # Allocate vectors in device memory - d_A = checkCudaErrors(cuda.cuMemAlloc(nbytes)) - d_B = checkCudaErrors(cuda.cuMemAlloc(nbytes)) - d_C = checkCudaErrors(cuda.cuMemAlloc(nbytes)) - - # Copy vectors from host memory to device memory - checkCudaErrors(cuda.cuMemcpyHtoD(d_A, h_A, nbytes)) - checkCudaErrors(cuda.cuMemcpyHtoD(d_B, h_B, nbytes)) - - if True: - # Grid/Block configuration - threadsPerBlock = 256 - blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock - - kernelArgs = ((d_A, d_B, d_C, N), (None, None, None, ctypes.c_int)) - - # Launch the CUDA kernel - checkCudaErrors( - 
cuda.cuLaunchKernel( - _VecAdd_kernel, - blocksPerGrid, - 1, - 1, - threadsPerBlock, - 1, - 1, - 0, - 0, - kernelArgs, - 0, - ) + kernel_helper = common.KernelHelper(vector_add_drv, int(cu_device)) + _vec_add_kernel = kernel_helper.get_function(b"VecAdd_kernel") + + # Allocate input vectors h_A and h_B in host memory + h_a = np.random.rand(n).astype(dtype=np.float32) + h_b = np.random.rand(n).astype(dtype=np.float32) + h_c = np.random.rand(n).astype(dtype=np.float32) + + # Allocate vectors in device memory + d_a = check_cuda_errors(cuda.cuMemAlloc(nbytes)) + d_b = check_cuda_errors(cuda.cuMemAlloc(nbytes)) + d_c = check_cuda_errors(cuda.cuMemAlloc(nbytes)) + + # Copy vectors from host memory to device memory + check_cuda_errors(cuda.cuMemcpyHtoD(d_a, h_a, nbytes)) + check_cuda_errors(cuda.cuMemcpyHtoD(d_b, h_b, nbytes)) + + if True: + # Grid/Block configuration + threads_per_block = 256 + blocks_per_grid = (n + threads_per_block - 1) / threads_per_block + + kernel_args = ((d_a, d_b, d_c, n), (None, None, None, ctypes.c_int)) + + # Launch the CUDA kernel + check_cuda_errors( + cuda.cuLaunchKernel( + _vec_add_kernel, + blocks_per_grid, + 1, + 1, + threads_per_block, + 1, + 1, + 0, + 0, + kernel_args, + 0, ) - else: - pass - - # Copy result from device memory to host memory - # h_C contains the result in host memory - checkCudaErrors(cuda.cuMemcpyDtoH(h_C, d_C, nbytes)) - - for i in range(N): - sum_all = h_A[i] + h_B[i] - if math.fabs(h_C[i] - sum_all) > 1e-7: - break - - # Free device memory - checkCudaErrors(cuda.cuMemFree(d_A)) - checkCudaErrors(cuda.cuMemFree(d_B)) - checkCudaErrors(cuda.cuMemFree(d_C)) - - checkCudaErrors(cuda.cuCtxDestroy(cuContext)) - if i + 1 != N: + ) + else: + pass + + # Copy result from device memory to host memory + # h_C contains the result in host memory + check_cuda_errors(cuda.cuMemcpyDtoH(h_c, d_c, nbytes)) + + for i in range(n): + sum_all = h_a[i] + h_b[i] + if math.fabs(h_c[i] - sum_all) > 1e-7: + break + + # Free device memory + 
check_cuda_errors(cuda.cuMemFree(d_a)) + check_cuda_errors(cuda.cuMemFree(d_b)) + check_cuda_errors(cuda.cuMemFree(d_c)) + + check_cuda_errors(cuda.cuCtxDestroy(cu_context)) + if i + 1 != n: print("Result = FAIL", file=sys.stderr) sys.exit(1) diff --git a/cuda_bindings/examples/0_Introduction/vectorAddMMAP_test.py b/cuda_bindings/examples/0_Introduction/vectorAddMMAP_test.py index c7f9e6275b..d5e2e3d26f 100644 --- a/cuda_bindings/examples/0_Introduction/vectorAddMMAP_test.py +++ b/cuda_bindings/examples/0_Introduction/vectorAddMMAP_test.py @@ -8,11 +8,11 @@ import numpy as np from common import common -from common.helper_cuda import checkCudaErrors, findCudaDeviceDRV +from common.helper_cuda import check_cuda_errors, find_cuda_device_drv from cuda.bindings import driver as cuda -vectorAddMMAP = """\ +vector_add_mmap = """\ /* Vector addition: C = A + B. * * This sample is a very basic sample that implements element by element @@ -36,35 +36,35 @@ def round_up(x, y): return int((x - 1) / y + 1) * y -def getBackingDevices(cuDevice): - num_devices = checkCudaErrors(cuda.cuDeviceGetCount()) +def get_backing_devices(cu_device): + num_devices = check_cuda_errors(cuda.cuDeviceGetCount()) - backingDevices = [cuDevice] + backing_devices = [cu_device] for dev in range(num_devices): # The mapping device is already in the backingDevices vector - if int(dev) == int(cuDevice): + if int(dev) == int(cu_device): continue # Only peer capable devices can map each others memory - capable = checkCudaErrors(cuda.cuDeviceCanAccessPeer(cuDevice, dev)) + capable = check_cuda_errors(cuda.cuDeviceCanAccessPeer(cu_device, dev)) if not capable: continue # The device needs to support virtual address management for the required apis to work - attributeVal = checkCudaErrors( + attribute_val = check_cuda_errors( cuda.cuDeviceGetAttribute( cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED, - cuDevice, + cu_device, ) ) - if attributeVal == 0: + if attribute_val == 0: 
continue - backingDevices.append(cuda.CUdevice(dev)) - return backingDevices + backing_devices.append(cuda.CUdevice(dev)) + return backing_devices -def simpleMallocMultiDeviceMmap(size, residentDevices, mappingDevices, align=0): +def simple_malloc_multi_device_mmap(size, resident_devices, mapping_devices, align=0): min_granularity = 0 # Setup the properties common for all the chunks @@ -77,7 +77,7 @@ def simpleMallocMultiDeviceMmap(size, residentDevices, mappingDevices, align=0): # Get the minimum granularity needed for the resident devices # (the max of the minimum granularity of each participating device) - for device in residentDevices: + for device in resident_devices: prop.location.id = device status, granularity = cuda.cuMemGetAllocationGranularity( prop, cuda.CUmemAllocationGranularity_flags.CU_MEM_ALLOC_GRANULARITY_MINIMUM @@ -89,7 +89,7 @@ def simpleMallocMultiDeviceMmap(size, residentDevices, mappingDevices, align=0): # Get the minimum granularity needed for the accessing devices # (the max of the minimum granularity of each participating device) - for device in mappingDevices: + for device in mapping_devices: prop.location.id = device status, granularity = cuda.cuMemGetAllocationGranularity( prop, cuda.CUmemAllocationGranularity_flags.CU_MEM_ALLOC_GRANULARITY_MINIMUM @@ -103,28 +103,28 @@ def simpleMallocMultiDeviceMmap(size, residentDevices, mappingDevices, align=0): # Essentially size = N * residentDevices.size() * min_granularity is the requirement, # since each piece of the allocation will be stripeSize = N * min_granularity # and the min_granularity requirement applies to each stripeSize piece of the allocation. 
- size = round_up(size, len(residentDevices) * min_granularity) - stripeSize = size / len(residentDevices) + size = round_up(size, len(resident_devices) * min_granularity) + stripe_size = size / len(resident_devices) # Return the rounded up size to the caller for use in the free - allocationSize = size + allocation_size = size # Reserve the required contiguous VA space for the allocations status, dptr = cuda.cuMemAddressReserve(size, align, cuda.CUdeviceptr(0), 0) if status != cuda.CUresult.CUDA_SUCCESS: - simpleFreeMultiDeviceMmap(dptr, size) + simple_free_multi_device_mmap(dptr, size) return status, None, None # Create and map the backings on each gpu # note: reusing CUmemAllocationProp prop from earlier with prop.type & prop.location.type already specified. - for idx in range(len(residentDevices)): + for idx in range(len(resident_devices)): # Set the location for this chunk to this device - prop.location.id = residentDevices[idx] + prop.location.id = resident_devices[idx] # Create the allocation as a pinned allocation on this device - status, allocationHandle = cuda.cuMemCreate(stripeSize, prop, 0) + status, allocation_handle = cuda.cuMemCreate(stripe_size, prop, 0) if status != cuda.CUresult.CUDA_SUCCESS: - simpleFreeMultiDeviceMmap(dptr, size) + simple_free_multi_device_mmap(dptr, size) return status, None, None # Assign the chunk to the appropriate VA range and release the handle. @@ -132,10 +132,10 @@ def simpleMallocMultiDeviceMmap(size, residentDevices, mappingDevices, align=0): # Since we do not need to make any other mappings of this memory or export it, # we no longer need and can release the allocationHandle. # The allocation will be kept live until it is unmapped. - (status,) = cuda.cuMemMap(int(dptr) + (stripeSize * idx), stripeSize, 0, allocationHandle, 0) + (status,) = cuda.cuMemMap(int(dptr) + (stripe_size * idx), stripe_size, 0, allocation_handle, 0) # the handle needs to be released even if the mapping failed. 
- (status2,) = cuda.cuMemRelease(allocationHandle) + (status2,) = cuda.cuMemRelease(allocation_handle) if status != cuda.CUresult.CUDA_SUCCESS: # cuMemRelease should not have failed here # as the handle was just allocated successfully @@ -144,31 +144,31 @@ def simpleMallocMultiDeviceMmap(size, residentDevices, mappingDevices, align=0): # Cleanup in case of any mapping failures. if status != cuda.CUresult.CUDA_SUCCESS: - simpleFreeMultiDeviceMmap(dptr, size) + simple_free_multi_device_mmap(dptr, size) return status, None, None # Each accessDescriptor will describe the mapping requirement for a single device - accessDescriptors = [cuda.CUmemAccessDesc()] * len(mappingDevices) + access_descriptors = [cuda.CUmemAccessDesc()] * len(mapping_devices) # Prepare the access descriptor array indicating where and how the backings should be visible. - for idx in range(len(mappingDevices)): + for idx in range(len(mapping_devices)): # Specify which device we are adding mappings for. - accessDescriptors[idx].location.type = cuda.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE - accessDescriptors[idx].location.id = mappingDevices[idx] + access_descriptors[idx].location.type = cuda.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE + access_descriptors[idx].location.id = mapping_devices[idx] # Specify both read and write access. - accessDescriptors[idx].flags = cuda.CUmemAccess_flags.CU_MEM_ACCESS_FLAGS_PROT_READWRITE + access_descriptors[idx].flags = cuda.CUmemAccess_flags.CU_MEM_ACCESS_FLAGS_PROT_READWRITE # Apply the access descriptors to the whole VA range. 
- (status,) = cuda.cuMemSetAccess(dptr, size, accessDescriptors, len(accessDescriptors)) + (status,) = cuda.cuMemSetAccess(dptr, size, access_descriptors, len(access_descriptors)) if status != cuda.CUresult.CUDA_SUCCESS: - simpleFreeMultiDeviceMmap(dptr, size) + simple_free_multi_device_mmap(dptr, size) return status, None, None - return (status, dptr, allocationSize) + return (status, dptr, allocation_size) -def simpleFreeMultiDeviceMmap(dptr, size): +def simple_free_multi_device_mmap(dptr, size): # Unmap the mapped virtual memory region # Since the handles to the mapped backing stores have already been released # by cuMemRelease, and these are the only/last mappings referencing them, @@ -204,97 +204,97 @@ def main(): if platform.machine() == "sbsa": pytest.skip("vectorAddMMAP is not supported on sbsa") - N = 50000 - size = N * np.dtype(np.float32).itemsize + n = 50000 + size = n * np.dtype(np.float32).itemsize # Initialize - checkCudaErrors(cuda.cuInit(0)) + check_cuda_errors(cuda.cuInit(0)) - cuDevice = findCudaDeviceDRV() + cu_device = find_cuda_device_drv() # Check that the selected device supports virtual address management - attributeVal = checkCudaErrors( + attribute_val = check_cuda_errors( cuda.cuDeviceGetAttribute( cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED, - cuDevice, + cu_device, ) ) - print(f"Device {cuDevice} VIRTUAL ADDRESS MANAGEMENT SUPPORTED = {attributeVal}.") - if not attributeVal: - pytest.skip(f"Device {cuDevice} doesn't support VIRTUAL ADDRESS MANAGEMENT.") + print(f"Device {cu_device} VIRTUAL ADDRESS MANAGEMENT SUPPORTED = {attribute_val}.") + if not attribute_val: + pytest.skip(f"Device {cu_device} doesn't support VIRTUAL ADDRESS MANAGEMENT.") # The vector addition happens on cuDevice, so the allocations need to be mapped there. - mappingDevices = [cuDevice] + mapping_devices = [cu_device] # Collect devices accessible by the mapping device (cuDevice) into the backingDevices vector. 
- backingDevices = getBackingDevices(cuDevice) + backing_devices = get_backing_devices(cu_device) # Create context - cuContext = checkCudaErrors(cuda.cuCtxCreate(None, 0, cuDevice)) - - with common.KernelHelper(vectorAddMMAP, int(cuDevice)) as kernelHelper: - _VecAdd_kernel = kernelHelper.getFunction(b"VecAdd_kernel") - - # Allocate input vectors h_A and h_B in host memory - h_A = np.random.rand(size).astype(dtype=np.float32) - h_B = np.random.rand(size).astype(dtype=np.float32) - h_C = np.random.rand(size).astype(dtype=np.float32) - - # Allocate vectors in device memory - # note that a call to cuCtxEnablePeerAccess is not needed even though - # the backing devices and mapping device are not the same. - # This is because the cuMemSetAccess call explicitly specifies - # the cross device mapping. - # cuMemSetAccess is still subject to the constraints of cuDeviceCanAccessPeer - # for cross device mappings (hence why we checked cuDeviceCanAccessPeer earlier). - d_A, allocationSize = checkCudaErrors(simpleMallocMultiDeviceMmap(size, backingDevices, mappingDevices)) - d_B, _ = checkCudaErrors(simpleMallocMultiDeviceMmap(size, backingDevices, mappingDevices)) - d_C, _ = checkCudaErrors(simpleMallocMultiDeviceMmap(size, backingDevices, mappingDevices)) - - # Copy vectors from host memory to device memory - checkCudaErrors(cuda.cuMemcpyHtoD(d_A, h_A, size)) - checkCudaErrors(cuda.cuMemcpyHtoD(d_B, h_B, size)) - - # Grid/Block configuration - threadsPerBlock = 256 - blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock - - kernelArgs = ((d_A, d_B, d_C, N), (None, None, None, ctypes.c_int)) - - # Launch the CUDA kernel - checkCudaErrors( - cuda.cuLaunchKernel( - _VecAdd_kernel, - blocksPerGrid, - 1, - 1, - threadsPerBlock, - 1, - 1, - 0, - 0, - kernelArgs, - 0, - ) + cu_context = check_cuda_errors(cuda.cuCtxCreate(None, 0, cu_device)) + + kernel_helper = common.KernelHelper(vector_add_mmap, int(cu_device)) + _vec_add_kernel = kernel_helper.get_function(b"VecAdd_kernel") 
+ + # Allocate input vectors h_A and h_B in host memory + h_a = np.random.rand(size).astype(dtype=np.float32) + h_b = np.random.rand(size).astype(dtype=np.float32) + h_c = np.random.rand(size).astype(dtype=np.float32) + + # Allocate vectors in device memory + # note that a call to cuCtxEnablePeerAccess is not needed even though + # the backing devices and mapping device are not the same. + # This is because the cuMemSetAccess call explicitly specifies + # the cross device mapping. + # cuMemSetAccess is still subject to the constraints of cuDeviceCanAccessPeer + # for cross device mappings (hence why we checked cuDeviceCanAccessPeer earlier). + d_a, allocation_size = check_cuda_errors(simple_malloc_multi_device_mmap(size, backing_devices, mapping_devices)) + d_b, _ = check_cuda_errors(simple_malloc_multi_device_mmap(size, backing_devices, mapping_devices)) + d_c, _ = check_cuda_errors(simple_malloc_multi_device_mmap(size, backing_devices, mapping_devices)) + + # Copy vectors from host memory to device memory + check_cuda_errors(cuda.cuMemcpyHtoD(d_a, h_a, size)) + check_cuda_errors(cuda.cuMemcpyHtoD(d_b, h_b, size)) + + # Grid/Block configuration + threads_per_block = 256 + blocks_per_grid = (n + threads_per_block - 1) / threads_per_block + + kernel_args = ((d_a, d_b, d_c, n), (None, None, None, ctypes.c_int)) + + # Launch the CUDA kernel + check_cuda_errors( + cuda.cuLaunchKernel( + _vec_add_kernel, + blocks_per_grid, + 1, + 1, + threads_per_block, + 1, + 1, + 0, + 0, + kernel_args, + 0, ) + ) - # Copy result from device memory to host memory - # h_C contains the result in host memory - checkCudaErrors(cuda.cuMemcpyDtoH(h_C, d_C, size)) + # Copy result from device memory to host memory + # h_C contains the result in host memory + check_cuda_errors(cuda.cuMemcpyDtoH(h_c, d_c, size)) - # Verify result - for i in range(N): - sum_all = h_A[i] + h_B[i] - if math.fabs(h_C[i] - sum_all) > 1e-7: - break + # Verify result + for i in range(n): + sum_all = h_a[i] + h_b[i] + 
if math.fabs(h_c[i] - sum_all) > 1e-7: + break - checkCudaErrors(simpleFreeMultiDeviceMmap(d_A, allocationSize)) - checkCudaErrors(simpleFreeMultiDeviceMmap(d_B, allocationSize)) - checkCudaErrors(simpleFreeMultiDeviceMmap(d_C, allocationSize)) + check_cuda_errors(simple_free_multi_device_mmap(d_a, allocation_size)) + check_cuda_errors(simple_free_multi_device_mmap(d_b, allocation_size)) + check_cuda_errors(simple_free_multi_device_mmap(d_c, allocation_size)) - checkCudaErrors(cuda.cuCtxDestroy(cuContext)) + check_cuda_errors(cuda.cuCtxDestroy(cu_context)) - if i + 1 != N: + if i + 1 != n: print("Result = FAIL", file=sys.stderr) sys.exit(1) diff --git a/cuda_bindings/examples/2_Concepts_and_Techniques/streamOrderedAllocation_test.py b/cuda_bindings/examples/2_Concepts_and_Techniques/streamOrderedAllocation_test.py index afe769ca15..f26dd2dabe 100644 --- a/cuda_bindings/examples/2_Concepts_and_Techniques/streamOrderedAllocation_test.py +++ b/cuda_bindings/examples/2_Concepts_and_Techniques/streamOrderedAllocation_test.py @@ -9,13 +9,13 @@ import numpy as np from common import common -from common.helper_cuda import checkCudaErrors, findCudaDevice -from common.helper_string import checkCmdLineFlag +from common.helper_cuda import check_cuda_errors, find_cuda_device +from common.helper_string import check_cmd_line_flag from cuda.bindings import driver as cuda from cuda.bindings import runtime as cudart -streamOrderedAllocation = """\ +stream_ordered_allocation = """\ /* Add two vectors on the GPU */ extern "C" __global__ void vectorAddGPU(const float *a, const float *b, float *c, int N) @@ -31,18 +31,18 @@ MAX_ITER = 20 -def basicStreamOrderedAllocation(dev, nelem, a, b, c): +def basic_stream_ordered_allocation(dev, nelem, a, b, c): num_bytes = nelem * np.dtype(np.float32).itemsize print("Starting basicStreamOrderedAllocation()") - checkCudaErrors(cudart.cudaSetDevice(dev)) - stream = checkCudaErrors(cudart.cudaStreamCreateWithFlags(cudart.cudaStreamNonBlocking)) + 
check_cuda_errors(cudart.cudaSetDevice(dev)) + stream = check_cuda_errors(cudart.cudaStreamCreateWithFlags(cudart.cudaStreamNonBlocking)) - d_a = checkCudaErrors(cudart.cudaMallocAsync(num_bytes, stream)) - d_b = checkCudaErrors(cudart.cudaMallocAsync(num_bytes, stream)) - d_c = checkCudaErrors(cudart.cudaMallocAsync(num_bytes, stream)) - checkCudaErrors(cudart.cudaMemcpyAsync(d_a, a, num_bytes, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice, stream)) - checkCudaErrors(cudart.cudaMemcpyAsync(d_b, b, num_bytes, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice, stream)) + d_a = check_cuda_errors(cudart.cudaMallocAsync(num_bytes, stream)) + d_b = check_cuda_errors(cudart.cudaMallocAsync(num_bytes, stream)) + d_c = check_cuda_errors(cudart.cudaMallocAsync(num_bytes, stream)) + check_cuda_errors(cudart.cudaMemcpyAsync(d_a, a, num_bytes, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice, stream)) + check_cuda_errors(cudart.cudaMemcpyAsync(d_b, b, num_bytes, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice, stream)) block = cudart.dim3() block.x = 256 @@ -53,13 +53,13 @@ def basicStreamOrderedAllocation(dev, nelem, a, b, c): grid.y = 1 grid.z = 1 - kernelArgs = ( + kernel_args = ( (d_a, d_b, d_c, nelem), (ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int), ) - checkCudaErrors( + check_cuda_errors( cuda.cuLaunchKernel( - _vectorAddGPU, + _vector_add_gpu, grid.x, grid.y, grid.z, # grid dim @@ -68,68 +68,72 @@ def basicStreamOrderedAllocation(dev, nelem, a, b, c): block.z, # block dim 0, stream, # shared mem and stream - kernelArgs, + kernel_args, 0, ) ) # arguments - checkCudaErrors(cudart.cudaFreeAsync(d_a, stream)) - checkCudaErrors(cudart.cudaFreeAsync(d_b, stream)) - checkCudaErrors(cudart.cudaMemcpyAsync(c, d_c, num_bytes, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost, stream)) - checkCudaErrors(cudart.cudaFreeAsync(d_c, stream)) - checkCudaErrors(cudart.cudaStreamSynchronize(stream)) + check_cuda_errors(cudart.cudaFreeAsync(d_a, stream)) + 
check_cuda_errors(cudart.cudaFreeAsync(d_b, stream)) + check_cuda_errors(cudart.cudaMemcpyAsync(c, d_c, num_bytes, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost, stream)) + check_cuda_errors(cudart.cudaFreeAsync(d_c, stream)) + check_cuda_errors(cudart.cudaStreamSynchronize(stream)) # Compare the results print("> Checking the results from vectorAddGPU() ...") - errorNorm = 0.0 - refNorm = 0.0 + error_norm = 0.0 + ref_norm = 0.0 for n in range(nelem): ref = a[n] + b[n] diff = c[n] - ref - errorNorm += diff * diff - refNorm += ref * ref + error_norm += diff * diff + ref_norm += ref * ref - errorNorm = math.sqrt(errorNorm) - refNorm = math.sqrt(refNorm) + error_norm = math.sqrt(error_norm) + ref_norm = math.sqrt(ref_norm) - checkCudaErrors(cudart.cudaStreamDestroy(stream)) + check_cuda_errors(cudart.cudaStreamDestroy(stream)) - return errorNorm / refNorm < 1.0e-6 + return error_norm / ref_norm < 1.0e-6 # streamOrderedAllocationPostSync(): demonstrates If the application wants the memory to persist in the pool beyond # synchronization, then it sets the release threshold on the pool. This way, when the application reaches the "steady state", # it is no longer allocating/freeing memory from the OS. 
-def streamOrderedAllocationPostSync(dev, nelem, a, b, c): +def stream_ordered_allocation_post_sync(dev, nelem, a, b, c): num_bytes = nelem * np.dtype(np.float32).itemsize print("Starting streamOrderedAllocationPostSync()") - checkCudaErrors(cudart.cudaSetDevice(dev)) - stream = checkCudaErrors(cudart.cudaStreamCreateWithFlags(cudart.cudaStreamNonBlocking)) - start = checkCudaErrors(cudart.cudaEventCreate()) - end = checkCudaErrors(cudart.cudaEventCreate()) + check_cuda_errors(cudart.cudaSetDevice(dev)) + stream = check_cuda_errors(cudart.cudaStreamCreateWithFlags(cudart.cudaStreamNonBlocking)) + start = check_cuda_errors(cudart.cudaEventCreate()) + end = check_cuda_errors(cudart.cudaEventCreate()) - memPool = checkCudaErrors(cudart.cudaDeviceGetDefaultMemPool(dev)) - thresholdVal = cuda.cuuint64_t(ctypes.c_uint64(-1).value) + mem_pool = check_cuda_errors(cudart.cudaDeviceGetDefaultMemPool(dev)) + threshold_val = cuda.cuuint64_t(ctypes.c_uint64(-1).value) # Set high release threshold on the default pool so that cudaFreeAsync will not actually release memory to the system. # By default, the release threshold for a memory pool is set to zero. This implies that the CUDA driver is # allowed to release a memory chunk back to the system as long as it does not contain any active suballocations. 
- checkCudaErrors( + check_cuda_errors( cudart.cudaMemPoolSetAttribute( - memPool, + mem_pool, cudart.cudaMemPoolAttr.cudaMemPoolAttrReleaseThreshold, - thresholdVal, + threshold_val, ) ) # Record teh start event - checkCudaErrors(cudart.cudaEventRecord(start, stream)) + check_cuda_errors(cudart.cudaEventRecord(start, stream)) for _i in range(MAX_ITER): - d_a = checkCudaErrors(cudart.cudaMallocAsync(num_bytes, stream)) - d_b = checkCudaErrors(cudart.cudaMallocAsync(num_bytes, stream)) - d_c = checkCudaErrors(cudart.cudaMallocAsync(num_bytes, stream)) - checkCudaErrors(cudart.cudaMemcpyAsync(d_a, a, num_bytes, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice, stream)) - checkCudaErrors(cudart.cudaMemcpyAsync(d_b, b, num_bytes, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice, stream)) + d_a = check_cuda_errors(cudart.cudaMallocAsync(num_bytes, stream)) + d_b = check_cuda_errors(cudart.cudaMallocAsync(num_bytes, stream)) + d_c = check_cuda_errors(cudart.cudaMallocAsync(num_bytes, stream)) + check_cuda_errors( + cudart.cudaMemcpyAsync(d_a, a, num_bytes, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice, stream) + ) + check_cuda_errors( + cudart.cudaMemcpyAsync(d_b, b, num_bytes, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice, stream) + ) block = cudart.dim3() block.x = 256 @@ -140,13 +144,13 @@ def streamOrderedAllocationPostSync(dev, nelem, a, b, c): grid.y = 1 grid.z = 1 - kernelArgs = ( + kernel_args = ( (d_a, d_b, d_c, nelem), (ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int), ) - checkCudaErrors( + check_cuda_errors( cuda.cuLaunchKernel( - _vectorAddGPU, + _vector_add_gpu, grid.x, grid.y, grid.z, # grid dim @@ -155,40 +159,42 @@ def streamOrderedAllocationPostSync(dev, nelem, a, b, c): block.z, # block dim 0, stream, # shared mem and stream - kernelArgs, + kernel_args, 0, ) ) # arguments - checkCudaErrors(cudart.cudaFreeAsync(d_a, stream)) - checkCudaErrors(cudart.cudaFreeAsync(d_b, stream)) - checkCudaErrors(cudart.cudaMemcpyAsync(c, d_c, num_bytes, 
cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost, stream)) - checkCudaErrors(cudart.cudaFreeAsync(d_c, stream)) - checkCudaErrors(cudart.cudaStreamSynchronize(stream)) - checkCudaErrors(cudart.cudaEventRecord(end, stream)) + check_cuda_errors(cudart.cudaFreeAsync(d_a, stream)) + check_cuda_errors(cudart.cudaFreeAsync(d_b, stream)) + check_cuda_errors( + cudart.cudaMemcpyAsync(c, d_c, num_bytes, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost, stream) + ) + check_cuda_errors(cudart.cudaFreeAsync(d_c, stream)) + check_cuda_errors(cudart.cudaStreamSynchronize(stream)) + check_cuda_errors(cudart.cudaEventRecord(end, stream)) # Wait for the end event to complete - checkCudaErrors(cudart.cudaEventSynchronize(end)) + check_cuda_errors(cudart.cudaEventSynchronize(end)) - msecTotal = checkCudaErrors(cudart.cudaEventElapsedTime(start, end)) - print(f"Total elapsed time = {msecTotal} ms over {MAX_ITER} iterations") + msec_total = check_cuda_errors(cudart.cudaEventElapsedTime(start, end)) + print(f"Total elapsed time = {msec_total} ms over {MAX_ITER} iterations") # Compare the results print("> Checking the results from vectorAddGPU() ...") - errorNorm = 0.0 - refNorm = 0.0 + error_norm = 0.0 + ref_norm = 0.0 for n in range(nelem): ref = a[n] + b[n] diff = c[n] - ref - errorNorm += diff * diff - refNorm += ref * ref + error_norm += diff * diff + ref_norm += ref * ref - errorNorm = math.sqrt(errorNorm) - refNorm = math.sqrt(refNorm) + error_norm = math.sqrt(error_norm) + ref_norm = math.sqrt(ref_norm) - checkCudaErrors(cudart.cudaStreamDestroy(stream)) + check_cuda_errors(cudart.cudaStreamDestroy(stream)) - return errorNorm / refNorm < 1.0e-6 + return error_norm / ref_norm < 1.0e-6 def main(): @@ -198,42 +204,42 @@ def main(): pytest.skip("streamOrderedAllocation is not supported on Mac OSX") cuda.cuInit(0) - if checkCmdLineFlag("help"): + if check_cmd_line_flag("help"): print("Usage: streamOrderedAllocation [OPTION]\n", file=sys.stderr) print("Options:", file=sys.stderr) print(" 
device=[device #] Specify the device to be used", file=sys.stderr) sys.exit(1) - dev = findCudaDevice() + dev = find_cuda_device() - version = checkCudaErrors(cudart.cudaDriverGetVersion()) + version = check_cuda_errors(cudart.cudaDriverGetVersion()) if version < 11030: - isMemPoolSupported = False + is_mem_pool_supported = False else: - isMemPoolSupported = checkCudaErrors( + is_mem_pool_supported = check_cuda_errors( cudart.cudaDeviceGetAttribute(cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED, dev) ) - if not isMemPoolSupported: + if not is_mem_pool_supported: pytest.skip("Waiving execution as device does not support Memory Pools") - global _vectorAddGPU - with common.KernelHelper(streamOrderedAllocation, dev) as kernelHelper: - _vectorAddGPU = kernelHelper.getFunction(b"vectorAddGPU") + global _vector_add_gpu + kernel_helper = common.KernelHelper(stream_ordered_allocation, dev) + _vector_add_gpu = kernel_helper.get_function(b"vectorAddGPU") - # Allocate CPU memory - nelem = 1048576 - nelem * np.dtype(np.float32).itemsize + # Allocate CPU memory + nelem = 1048576 + nelem * np.dtype(np.float32).itemsize - a = np.zeros(nelem, dtype="float32") - b = np.zeros(nelem, dtype="float32") - c = np.zeros(nelem, dtype="float32") - # Initialize the vectors - for i in range(nelem): - a[i] = rnd.random() - b[i] = rnd.random() + a = np.zeros(nelem, dtype="float32") + b = np.zeros(nelem, dtype="float32") + c = np.zeros(nelem, dtype="float32") + # Initialize the vectors + for i in range(nelem): + a[i] = rnd.random() + b[i] = rnd.random() - ret1 = basicStreamOrderedAllocation(dev, nelem, a, b, c) - ret2 = streamOrderedAllocationPostSync(dev, nelem, a, b, c) + ret1 = basic_stream_ordered_allocation(dev, nelem, a, b, c) + ret2 = stream_ordered_allocation_post_sync(dev, nelem, a, b, c) if not ret1 or not ret2: sys.exit(1) diff --git a/cuda_bindings/examples/3_CUDA_Features/globalToShmemAsyncCopy_test.py 
b/cuda_bindings/examples/3_CUDA_Features/globalToShmemAsyncCopy_test.py index aaa03e446a..722d19dcb5 100644 --- a/cuda_bindings/examples/3_CUDA_Features/globalToShmemAsyncCopy_test.py +++ b/cuda_bindings/examples/3_CUDA_Features/globalToShmemAsyncCopy_test.py @@ -9,16 +9,16 @@ import numpy as np from common import common -from common.helper_cuda import checkCudaErrors, findCudaDevice -from common.helper_string import checkCmdLineFlag, getCmdLineArgumentInt +from common.helper_cuda import check_cuda_errors, find_cuda_device +from common.helper_string import check_cmd_line_flag, get_cmd_line_argument_int from cuda.bindings import driver as cuda from cuda.bindings import runtime as cudart -blockSize = 16 +block_size = 16 -class kernels(Enum): +class Kernels(Enum): AsyncCopyMultiStageLargeChunk = 0 AsyncCopyLargeChunk = 1 AsyncCopyLargeChunkAWBarrier = 2 @@ -29,7 +29,7 @@ class kernels(Enum): NaiveLargeChunk = 7 -kernelNames = [ +kernel_names = [ "AsyncCopyMultiStageLargeChunk", "AsyncCopyLargeChunk", "AsyncCopyLargeChunkAWBarrier", @@ -40,7 +40,7 @@ class kernels(Enum): "NaiveLargeChunk", ] -globalToShmemAsyncCopy = """\ +global_to_shmem_async_copy = """\ #line __LINE__ #if __CUDA_ARCH__ >= 700 #include @@ -709,7 +709,7 @@ class kernels(Enum): """ -def ConstantInit(data, size, val): +def constant_init(data, size, val): p_data = (ctypes.c_float * size).from_address(data) for i in range(size): p_data[i] = val @@ -718,78 +718,82 @@ def ConstantInit(data, size, val): # # Run matrix multiplication using CUDA # -def MatrixMultiply(dimsA, dimsB, kernel_number): +def matrix_multiply(dims_a, dims_b, kernel_number): # Allocate host memory for matricies A and B - size_A = dimsA.x * dimsA.y - mem_size_A = np.dtype(np.float32).itemsize * size_A - h_A = checkCudaErrors(cudart.cudaMallocHost(mem_size_A)) - size_B = dimsB.x * dimsB.y - mem_size_B = np.dtype(np.float32).itemsize * size_B - h_B = checkCudaErrors(cudart.cudaMallocHost(mem_size_B)) + size_a = dims_a.x * dims_a.y + 
mem_size_a = np.dtype(np.float32).itemsize * size_a + h_a = check_cuda_errors(cudart.cudaMallocHost(mem_size_a)) + size_b = dims_b.x * dims_b.y + mem_size_b = np.dtype(np.float32).itemsize * size_b + h_b = check_cuda_errors(cudart.cudaMallocHost(mem_size_b)) # Initialize host memory - valB = 2.10 - ConstantInit(h_A, size_A, 1.0) - ConstantInit(h_B, size_B, valB) + val_b = 2.10 + constant_init(h_a, size_a, 1.0) + constant_init(h_b, size_b, val_b) # Allocate Device Memory # Allocate host matrix C - dimsC = cudart.dim3() - dimsC.x = dimsB.x - dimsC.y = dimsA.y - dimsC.z = 1 - mem_size_C = dimsC.x * dimsC.y * np.dtype(np.float32).itemsize - h_C = checkCudaErrors(cudart.cudaMallocHost(mem_size_C)) - - if h_C == 0: + dims_c = cudart.dim3() + dims_c.x = dims_b.x + dims_c.y = dims_a.y + dims_c.z = 1 + mem_size_c = dims_c.x * dims_c.y * np.dtype(np.float32).itemsize + h_c = check_cuda_errors(cudart.cudaMallocHost(mem_size_c)) + + if h_c == 0: print("Failed to allocate host matrix C!", file=sys.stderr) sys.exit(1) - d_A = checkCudaErrors(cudart.cudaMalloc(mem_size_A)) - d_B = checkCudaErrors(cudart.cudaMalloc(mem_size_B)) - d_C = checkCudaErrors(cudart.cudaMalloc(mem_size_C)) + d_a = check_cuda_errors(cudart.cudaMalloc(mem_size_a)) + d_b = check_cuda_errors(cudart.cudaMalloc(mem_size_b)) + d_c = check_cuda_errors(cudart.cudaMalloc(mem_size_c)) # Allocate CUDA events that we'll use for timing - start = checkCudaErrors(cudart.cudaEventCreate()) - stop = checkCudaErrors(cudart.cudaEventCreate()) + start = check_cuda_errors(cudart.cudaEventCreate()) + stop = check_cuda_errors(cudart.cudaEventCreate()) - stream = checkCudaErrors(cudart.cudaStreamCreateWithFlags(cudart.cudaStreamNonBlocking)) + stream = check_cuda_errors(cudart.cudaStreamCreateWithFlags(cudart.cudaStreamNonBlocking)) # Copy host memory to device - checkCudaErrors(cudart.cudaMemcpyAsync(d_A, h_A, mem_size_A, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice, stream)) - checkCudaErrors(cudart.cudaMemcpyAsync(d_B, h_B, 
mem_size_B, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice, stream)) - checkCudaErrors(cudart.cudaMemsetAsync(d_C, 0, mem_size_C, stream)) + check_cuda_errors( + cudart.cudaMemcpyAsync(d_a, h_a, mem_size_a, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice, stream) + ) + check_cuda_errors( + cudart.cudaMemcpyAsync(d_b, h_b, mem_size_b, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice, stream) + ) + check_cuda_errors(cudart.cudaMemsetAsync(d_c, 0, mem_size_c, stream)) # Setup execution parameters threads = cudart.dim3() - threads.x = threads.y = blockSize + threads.x = threads.y = block_size threads.z = 1 grid = cudart.dim3() - grid.x = dimsB.x / threads.x - grid.y = dimsA.y / threads.y + grid.x = dims_b.x / threads.x + grid.y = dims_a.y / threads.y grid.z = 1 # Here the block size is 16x18, where first 16 rows are consumer thread group # and last 2 rows (1 warp) is producer thread group - threadsSharedStateKernel = cudart.dim3() - threadsSharedStateKernel.x = blockSize - threadsSharedStateKernel.y = blockSize + 2 - threadsSharedStateKernel.z = 1 - gridSharedStateKernel = cudart.dim3() - gridSharedStateKernel.x = dimsB.x / threadsSharedStateKernel.x - gridSharedStateKernel.y = dimsA.y / threadsSharedStateKernel.x - - print(f"Running kernel = {kernel_number} - {kernelNames[kernel_number.value]}") + threads_shared_state_kernel = cudart.dim3() + threads_shared_state_kernel.x = block_size + threads_shared_state_kernel.y = block_size + 2 + threads_shared_state_kernel.z = 1 + grid_shared_state_kernel = cudart.dim3() + grid_shared_state_kernel.x = dims_b.x / threads_shared_state_kernel.x + grid_shared_state_kernel.y = dims_a.y / threads_shared_state_kernel.x + + print(f"Running kernel = {kernel_number} - {kernel_names[kernel_number.value]}") # Create and start timer print("Computing result using CUDA Kernel...") # Performs warmup operation using matrixMul CUDA kernel - kernelArguments = ( - (d_C, d_A, d_B, dimsA.x, dimsB.x), + kernel_arguments = ( + (d_c, d_a, d_b, dims_a.x, 
dims_b.x), (ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_int), ) - if kernel_number == kernels.AsyncCopyMultiStageLargeChunk: - checkCudaErrors( + if kernel_number == Kernels.AsyncCopyMultiStageLargeChunk: + check_cuda_errors( cuda.cuLaunchKernel( _MatrixMulAsyncCopyMultiStageLargeChunk, grid.x, @@ -800,12 +804,12 @@ def MatrixMultiply(dimsA, dimsB, kernel_number): threads.z, # block dim 0, # shared mem stream, # stream - kernelArguments, + kernel_arguments, 0, ) ) # arguments - elif kernel_number == kernels.AsyncCopyLargeChunk: - checkCudaErrors( + elif kernel_number == Kernels.AsyncCopyLargeChunk: + check_cuda_errors( cuda.cuLaunchKernel( _MatrixMulAsyncCopyLargeChunk, grid.x, @@ -816,12 +820,12 @@ def MatrixMultiply(dimsA, dimsB, kernel_number): threads.z, # block dim 0, # shared mem stream, # stream - kernelArguments, + kernel_arguments, 0, ) ) # arguments - elif kernel_number == kernels.AsyncCopyLargeChunkAWBarrier: - checkCudaErrors( + elif kernel_number == Kernels.AsyncCopyLargeChunkAWBarrier: + check_cuda_errors( cuda.cuLaunchKernel( _MatrixMulAsyncCopyLargeChunkAWBarrier, grid.x, @@ -832,28 +836,28 @@ def MatrixMultiply(dimsA, dimsB, kernel_number): threads.z, # block dim 0, # shared mem stream, # stream - kernelArguments, + kernel_arguments, 0, ) ) # arguments - elif kernel_number == kernels.AsyncCopyMultiStageSharedState: - checkCudaErrors( + elif kernel_number == Kernels.AsyncCopyMultiStageSharedState: + check_cuda_errors( cuda.cuLaunchKernel( _MatrixMulAsyncCopyMultiStageSharedState, - gridSharedStateKernel.x, - gridSharedStateKernel.y, - gridSharedStateKernel.z, # grid dim - threadsSharedStateKernel.x, - threadsSharedStateKernel.y, - threadsSharedStateKernel.z, # block dim + grid_shared_state_kernel.x, + grid_shared_state_kernel.y, + grid_shared_state_kernel.z, # grid dim + threads_shared_state_kernel.x, + threads_shared_state_kernel.y, + threads_shared_state_kernel.z, # block dim 0, # shared mem stream, # stream - 
kernelArguments, + kernel_arguments, 0, ) ) # arguments - elif kernel_number == kernels.AsyncCopyMultiStage: - checkCudaErrors( + elif kernel_number == Kernels.AsyncCopyMultiStage: + check_cuda_errors( cuda.cuLaunchKernel( _MatrixMulAsyncCopyMultiStage, grid.x, @@ -864,12 +868,12 @@ def MatrixMultiply(dimsA, dimsB, kernel_number): threads.z, # block dim 0, # shared mem stream, # stream - kernelArguments, + kernel_arguments, 0, ) ) # arguments - elif kernel_number == kernels.AsyncCopySingleStage: - checkCudaErrors( + elif kernel_number == Kernels.AsyncCopySingleStage: + check_cuda_errors( cuda.cuLaunchKernel( _MatrixMulAsyncCopySingleStage, grid.x, @@ -880,12 +884,12 @@ def MatrixMultiply(dimsA, dimsB, kernel_number): threads.z, # block dim 0, # shared mem stream, # stream - kernelArguments, + kernel_arguments, 0, ) ) # arguments - elif kernel_number == kernels.Naive: - checkCudaErrors( + elif kernel_number == Kernels.Naive: + check_cuda_errors( cuda.cuLaunchKernel( _MatrixMulNaive, grid.x, @@ -896,12 +900,12 @@ def MatrixMultiply(dimsA, dimsB, kernel_number): threads.z, # block dim 0, # shared mem stream, # stream - kernelArguments, + kernel_arguments, 0, ) ) # arguments - elif kernel_number == kernels.NaiveLargeChunk: - checkCudaErrors( + elif kernel_number == Kernels.NaiveLargeChunk: + check_cuda_errors( cuda.cuLaunchKernel( _MatrixMulNaiveLargeChunk, grid.x, @@ -912,21 +916,21 @@ def MatrixMultiply(dimsA, dimsB, kernel_number): threads.z, # block dim 0, # shared mem stream, # stream - kernelArguments, + kernel_arguments, 0, ) ) # arguments - checkCudaErrors(cudart.cudaStreamSynchronize(stream)) + check_cuda_errors(cudart.cudaStreamSynchronize(stream)) # Execute the kernel - nIter = 100 + n_iter = 100 # Record the start event - checkCudaErrors(cudart.cudaEventRecord(start, stream)) + check_cuda_errors(cudart.cudaEventRecord(start, stream)) - if kernel_number == kernels.AsyncCopyMultiStageLargeChunk: - checkCudaErrors( + if kernel_number == 
Kernels.AsyncCopyMultiStageLargeChunk: + check_cuda_errors( cuda.cuLaunchKernel( _MatrixMulAsyncCopyMultiStageLargeChunk, grid.x, @@ -937,12 +941,12 @@ def MatrixMultiply(dimsA, dimsB, kernel_number): threads.z, # block dim 0, # shared mem stream, # stream - kernelArguments, + kernel_arguments, 0, ) ) # arguments - elif kernel_number == kernels.AsyncCopyLargeChunk: - checkCudaErrors( + elif kernel_number == Kernels.AsyncCopyLargeChunk: + check_cuda_errors( cuda.cuLaunchKernel( _MatrixMulAsyncCopyLargeChunk, grid.x, @@ -953,12 +957,12 @@ def MatrixMultiply(dimsA, dimsB, kernel_number): threads.z, # block dim 0, # shared mem stream, # stream - kernelArguments, + kernel_arguments, 0, ) ) # arguments - elif kernel_number == kernels.AsyncCopyLargeChunkAWBarrier: - checkCudaErrors( + elif kernel_number == Kernels.AsyncCopyLargeChunkAWBarrier: + check_cuda_errors( cuda.cuLaunchKernel( _MatrixMulAsyncCopyLargeChunkAWBarrier, grid.x, @@ -969,28 +973,28 @@ def MatrixMultiply(dimsA, dimsB, kernel_number): threads.z, # block dim 0, # shared mem stream, # stream - kernelArguments, + kernel_arguments, 0, ) ) # arguments - elif kernel_number == kernels.AsyncCopyMultiStageSharedState: - checkCudaErrors( + elif kernel_number == Kernels.AsyncCopyMultiStageSharedState: + check_cuda_errors( cuda.cuLaunchKernel( _MatrixMulAsyncCopyMultiStageSharedState, - gridSharedStateKernel.x, - gridSharedStateKernel.y, - gridSharedStateKernel.z, # grid dim - threadsSharedStateKernel.x, - threadsSharedStateKernel.y, - threadsSharedStateKernel.z, # block dim + grid_shared_state_kernel.x, + grid_shared_state_kernel.y, + grid_shared_state_kernel.z, # grid dim + threads_shared_state_kernel.x, + threads_shared_state_kernel.y, + threads_shared_state_kernel.z, # block dim 0, # shared mem stream, # stream - kernelArguments, + kernel_arguments, 0, ) ) # arguments - elif kernel_number == kernels.AsyncCopyMultiStage: - checkCudaErrors( + elif kernel_number == Kernels.AsyncCopyMultiStage: + check_cuda_errors( 
cuda.cuLaunchKernel( _MatrixMulAsyncCopyMultiStage, grid.x, @@ -1001,12 +1005,12 @@ def MatrixMultiply(dimsA, dimsB, kernel_number): threads.z, # block dim 0, # shared mem stream, # stream - kernelArguments, + kernel_arguments, 0, ) ) # arguments - elif kernel_number == kernels.AsyncCopySingleStage: - checkCudaErrors( + elif kernel_number == Kernels.AsyncCopySingleStage: + check_cuda_errors( cuda.cuLaunchKernel( _MatrixMulAsyncCopySingleStage, grid.x, @@ -1017,12 +1021,12 @@ def MatrixMultiply(dimsA, dimsB, kernel_number): threads.z, # block dim 0, # shared mem stream, # stream - kernelArguments, + kernel_arguments, 0, ) ) # arguments - elif kernel_number == kernels.Naive: - checkCudaErrors( + elif kernel_number == Kernels.Naive: + check_cuda_errors( cuda.cuLaunchKernel( _MatrixMulNaive, grid.x, @@ -1033,12 +1037,12 @@ def MatrixMultiply(dimsA, dimsB, kernel_number): threads.z, # block dim 0, # shared mem stream, # stream - kernelArguments, + kernel_arguments, 0, ) ) # arguments - elif kernel_number == kernels.NaiveLargeChunk: - checkCudaErrors( + elif kernel_number == Kernels.NaiveLargeChunk: + check_cuda_errors( cuda.cuLaunchKernel( _MatrixMulNaiveLargeChunk, grid.x, @@ -1049,31 +1053,33 @@ def MatrixMultiply(dimsA, dimsB, kernel_number): threads.z, # block dim 0, # shared mem stream, # stream - kernelArguments, + kernel_arguments, 0, ) ) # arguments # Record the stop event - checkCudaErrors(cudart.cudaEventRecord(stop, stream)) + check_cuda_errors(cudart.cudaEventRecord(stop, stream)) # Wait for the stop event to complete - checkCudaErrors(cudart.cudaEventSynchronize(stop)) + check_cuda_errors(cudart.cudaEventSynchronize(stop)) - msecTotal = checkCudaErrors(cudart.cudaEventElapsedTime(start, stop)) + msec_total = check_cuda_errors(cudart.cudaEventElapsedTime(start, stop)) # Compute and print the performance - msecPerMatrixMul = msecTotal / nIter - flopsPerMatrixMul = 2.0 * dimsA.x * dimsA.y * dimsB.x - gigaFlops = (flopsPerMatrixMul * 1.0e-9) / (msecPerMatrixMul 
/ 1000.0) + msec_per_matrix_mul = msec_total / n_iter + flops_per_matrix_mul = 2.0 * dims_a.x * dims_a.y * dims_b.x + giga_flops = (flops_per_matrix_mul * 1.0e-9) / (msec_per_matrix_mul / 1000.0) print( - f"Performance= {gigaFlops:.2f} GFlop/s, Time= {msecPerMatrixMul:.2f} msec, Size= {flopsPerMatrixMul:.0f} Ops, WorkgroupSize= {threads.x * threads.y} threads/block" + f"Performance= {giga_flops:.2f} GFlop/s, Time= {msec_per_matrix_mul:.2f} msec, Size= {flops_per_matrix_mul:.0f} Ops, WorkgroupSize= {threads.x * threads.y} threads/block" ) # Copy result from device to host - checkCudaErrors(cudart.cudaMemcpyAsync(h_C, d_C, mem_size_C, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost, stream)) - checkCudaErrors(cudart.cudaStreamSynchronize(stream)) + check_cuda_errors( + cudart.cudaMemcpyAsync(h_c, d_c, mem_size_c, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost, stream) + ) + check_cuda_errors(cudart.cudaStreamSynchronize(stream)) correct = True @@ -1081,16 +1087,16 @@ def MatrixMultiply(dimsA, dimsB, kernel_number): # |_cpu - _gpu|/<|x|, |y|> < eps eps = 1.0e-6 - h_C_local = (ctypes.c_float * (dimsC.x * dimsC.y)).from_address(h_C) - for i in range(dimsC.x * dimsC.y): - abs_err = math.fabs(h_C_local[i] - (dimsA.x * valB)) - dot_length = dimsA.x - abs_val = math.fabs(h_C_local[i]) + h_c_local = (ctypes.c_float * (dims_c.x * dims_c.y)).from_address(h_c) + for i in range(dims_c.x * dims_c.y): + abs_err = math.fabs(h_c_local[i] - (dims_a.x * val_b)) + dot_length = dims_a.x + abs_val = math.fabs(h_c_local[i]) rel_err = abs_err / abs_val / dot_length if rel_err > eps: print( - f"Error! Matrix[{i:.5f}]={h_C_local[i]:.8f} ref={dimsA.x * valB:.8f} err term is > {rel_err}", + f"Error! 
Matrix[{i:.5f}]={h_c_local[i]:.8f} ref={dims_a.x * val_b:.8f} err term is > {rel_err}", file=sys.stderr, ) correct = False @@ -1099,14 +1105,14 @@ def MatrixMultiply(dimsA, dimsB, kernel_number): print("Result = FAIL", file=sys.stderr) # Clean up memory - checkCudaErrors(cudart.cudaFreeHost(h_A)) - checkCudaErrors(cudart.cudaFreeHost(h_B)) - checkCudaErrors(cudart.cudaFreeHost(h_C)) - checkCudaErrors(cudart.cudaFree(d_A)) - checkCudaErrors(cudart.cudaFree(d_B)) - checkCudaErrors(cudart.cudaFree(d_C)) - checkCudaErrors(cudart.cudaEventDestroy(start)) - checkCudaErrors(cudart.cudaEventDestroy(stop)) + check_cuda_errors(cudart.cudaFreeHost(h_a)) + check_cuda_errors(cudart.cudaFreeHost(h_b)) + check_cuda_errors(cudart.cudaFreeHost(h_c)) + check_cuda_errors(cudart.cudaFree(d_a)) + check_cuda_errors(cudart.cudaFree(d_b)) + check_cuda_errors(cudart.cudaFree(d_c)) + check_cuda_errors(cudart.cudaEventDestroy(start)) + check_cuda_errors(cudart.cudaEventDestroy(stop)) print( "\nNOTE: The CUDA Samples are not meant for performance " "measurements. Results may vary when GPU Boost is enabled." 
@@ -1119,16 +1125,16 @@ def MatrixMultiply(dimsA, dimsB, kernel_number): def main(): import pytest - common.pytest_skipif_compute_capability_too_low(findCudaDevice(), (7, 0)) + common.pytest_skipif_compute_capability_too_low(find_cuda_device(), (7, 0)) if platform.machine() == "qnx": pytest.skip("globalToShmemAsyncCopy is not supported on QNX") - version = checkCudaErrors(cuda.cuDriverGetVersion()) + version = check_cuda_errors(cuda.cuDriverGetVersion()) if version < 11010: pytest.skip("CUDA Toolkit 11.1 or greater is required") - if checkCmdLineFlag("help") or checkCmdLineFlag("?"): + if check_cmd_line_flag("help") or check_cmd_line_flag("?"): print("Usage device=n (n >= 0 for deviceID)", file=sys.stderr) print(" wA=WidthA hA=HeightA (Width x Height of Matrix A)", file=sys.stderr) print(" wB=WidthB hB=HeightB (Width x Height of Matrix B)", file=sys.stderr) @@ -1149,54 +1155,54 @@ def main(): # This will pick the best possible CUDA capable device, otherwise # override the device ID based on input provided at the command line - devID = findCudaDevice() + dev_id = find_cuda_device() - matrixBlock = 32 - dimsA = cudart.dim3() - dimsA.x = dimsA.y = 10 * 4 * matrixBlock - dimsA.z = 1 - dimsB = cudart.dim3() - dimsB.x = dimsB.y = 10 * 4 * matrixBlock - dimsB.z = 1 + matrix_block = 32 + dims_a = cudart.dim3() + dims_a.x = dims_a.y = 10 * 4 * matrix_block + dims_a.z = 1 + dims_b = cudart.dim3() + dims_b.x = dims_b.y = 10 * 4 * matrix_block + dims_b.z = 1 # width of Matrix A - if checkCmdLineFlag("wA="): - dimsA.x = int(getCmdLineArgumentInt("wA=")) + if check_cmd_line_flag("wA="): + dims_a.x = int(get_cmd_line_argument_int("wA=")) # height of Matrix A - if checkCmdLineFlag("hA="): - dimsA.y = int(getCmdLineArgumentInt("hA=")) + if check_cmd_line_flag("hA="): + dims_a.y = int(get_cmd_line_argument_int("hA=")) # width of Matrix B - if checkCmdLineFlag("wB="): - dimsB.x = int(getCmdLineArgumentInt("wB=")) + if check_cmd_line_flag("wB="): + dims_b.x = 
int(get_cmd_line_argument_int("wB=")) # height of Matrix B - if checkCmdLineFlag("hB="): - dimsB.y = int(getCmdLineArgumentInt("hB=")) + if check_cmd_line_flag("hB="): + dims_b.y = int(get_cmd_line_argument_int("hB=")) - if dimsA.x != dimsB.y: - print(f"Error: outer matrix dimensions must be equal. ({dimsA.x} != {dimsB.y})", file=sys.stderr) + if dims_a.x != dims_b.y: + print(f"Error: outer matrix dimensions must be equal. ({dims_a.x} != {dims_b.y})", file=sys.stderr) sys.exit(1) - selected_kernel = kernels.AsyncCopyMultiStageLargeChunk + selected_kernel = Kernels.AsyncCopyMultiStageLargeChunk # kernel to run - default (AsyncCopyMultiStageLargeChunk == 0) - if checkCmdLineFlag("kernel="): - kernel_number = int(getCmdLineArgumentInt("kernel=")) + if check_cmd_line_flag("kernel="): + kernel_number = int(get_cmd_line_argument_int("kernel=")) if kernel_number < 8: - selected_kernel = kernels(kernel_number) + selected_kernel = Kernels(kernel_number) else: print("Error: kernel number should be between 0 to 7", file=sys.stderr) sys.exit(1) - major = checkCudaErrors( - cudart.cudaDeviceGetAttribute(cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMajor, devID) + major = check_cuda_errors( + cudart.cudaDeviceGetAttribute(cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMajor, dev_id) ) if major < 7: pytest.skip("globalToShmemAsyncCopy requires SM 7.0 or higher.") - print(f"MatrixA({dimsA.x},{dimsA.y}), MatrixB({dimsB.x},{dimsB.y})") + print(f"MatrixA({dims_a.x},{dims_a.y}), MatrixB({dims_b.x},{dims_b.y})") global _MatrixMulAsyncCopyMultiStageLargeChunk global _MatrixMulAsyncCopyLargeChunk @@ -1206,17 +1212,17 @@ def main(): global _MatrixMulAsyncCopySingleStage global _MatrixMulNaive global _MatrixMulNaiveLargeChunk - with common.KernelHelper(globalToShmemAsyncCopy, devID) as kernelHelper: - _MatrixMulAsyncCopyMultiStageLargeChunk = kernelHelper.getFunction(b"MatrixMulAsyncCopyMultiStageLargeChunk") - _MatrixMulAsyncCopyLargeChunk = 
kernelHelper.getFunction(b"MatrixMulAsyncCopyLargeChunk") - _MatrixMulAsyncCopyLargeChunkAWBarrier = kernelHelper.getFunction(b"MatrixMulAsyncCopyLargeChunkAWBarrier") - _MatrixMulAsyncCopyMultiStageSharedState = kernelHelper.getFunction(b"MatrixMulAsyncCopyMultiStageSharedState") - _MatrixMulAsyncCopyMultiStage = kernelHelper.getFunction(b"MatrixMulAsyncCopyMultiStage") - _MatrixMulAsyncCopySingleStage = kernelHelper.getFunction(b"MatrixMulAsyncCopySingleStage") - _MatrixMulNaive = kernelHelper.getFunction(b"MatrixMulNaive") - _MatrixMulNaiveLargeChunk = kernelHelper.getFunction(b"MatrixMulNaiveLargeChunk") - - matrix_result = MatrixMultiply(dimsA, dimsB, selected_kernel) + kernel_helper = common.KernelHelper(global_to_shmem_async_copy, dev_id) + _MatrixMulAsyncCopyMultiStageLargeChunk = kernel_helper.get_function(b"MatrixMulAsyncCopyMultiStageLargeChunk") + _MatrixMulAsyncCopyLargeChunk = kernel_helper.get_function(b"MatrixMulAsyncCopyLargeChunk") + _MatrixMulAsyncCopyLargeChunkAWBarrier = kernel_helper.get_function(b"MatrixMulAsyncCopyLargeChunkAWBarrier") + _MatrixMulAsyncCopyMultiStageSharedState = kernel_helper.get_function(b"MatrixMulAsyncCopyMultiStageSharedState") + _MatrixMulAsyncCopyMultiStage = kernel_helper.get_function(b"MatrixMulAsyncCopyMultiStage") + _MatrixMulAsyncCopySingleStage = kernel_helper.get_function(b"MatrixMulAsyncCopySingleStage") + _MatrixMulNaive = kernel_helper.get_function(b"MatrixMulNaive") + _MatrixMulNaiveLargeChunk = kernel_helper.get_function(b"MatrixMulNaiveLargeChunk") + + matrix_result = matrix_multiply(dims_a, dims_b, selected_kernel) if matrix_result != 0: sys.exit(1) diff --git a/cuda_bindings/examples/3_CUDA_Features/simpleCudaGraphs_test.py b/cuda_bindings/examples/3_CUDA_Features/simpleCudaGraphs_test.py index 7746bd08e3..b08da3edc0 100644 --- a/cuda_bindings/examples/3_CUDA_Features/simpleCudaGraphs_test.py +++ b/cuda_bindings/examples/3_CUDA_Features/simpleCudaGraphs_test.py @@ -6,7 +6,7 @@ import numpy as np from 
common import common -from common.helper_cuda import checkCudaErrors, findCudaDevice +from common.helper_cuda import check_cuda_errors, find_cuda_device from cuda.bindings import driver as cuda from cuda.bindings import runtime as cudart @@ -14,7 +14,7 @@ THREADS_PER_BLOCK = 512 GRAPH_LAUNCH_ITERATIONS = 3 -simpleCudaGraphs = """\ +simple_cuda_graphs = """\ #include #include @@ -121,185 +121,185 @@ def init_input(a, size): a_list[i] = rnd.random() -def cudaGraphsManual(inputVec_h, inputVec_d, outputVec_d, result_d, inputSize, numOfBlocks): +def cuda_graphs_manual(input_vec_h, input_vec_d, output_vec_d, result_d, input_size, num_of_blocks): result_h = ctypes.c_double(0.0) - nodeDependencies = [] + node_dependencies = [] - streamForGraph = checkCudaErrors(cudart.cudaStreamCreate()) + stream_for_graph = check_cuda_errors(cudart.cudaStreamCreate()) - kernelNodeParams = cuda.CUDA_KERNEL_NODE_PARAMS() - memcpyParams = cudart.cudaMemcpy3DParms() - memsetParams = cudart.cudaMemsetParams() + kernel_node_params = cuda.CUDA_KERNEL_NODE_PARAMS() + memcpy_params = cudart.cudaMemcpy3DParms() + memset_params = cudart.cudaMemsetParams() - memcpyParams.srcArray = None - memcpyParams.srcPos = cudart.make_cudaPos(0, 0, 0) - memcpyParams.srcPtr = cudart.make_cudaPitchedPtr( - inputVec_h, np.dtype(np.float32).itemsize * inputSize, inputSize, 1 + memcpy_params.srcArray = None + memcpy_params.srcPos = cudart.make_cudaPos(0, 0, 0) + memcpy_params.srcPtr = cudart.make_cudaPitchedPtr( + input_vec_h, np.dtype(np.float32).itemsize * input_size, input_size, 1 ) - memcpyParams.dstArray = None - memcpyParams.dstPos = cudart.make_cudaPos(0, 0, 0) - memcpyParams.dstPtr = cudart.make_cudaPitchedPtr( - inputVec_d, np.dtype(np.float32).itemsize * inputSize, inputSize, 1 + memcpy_params.dstArray = None + memcpy_params.dstPos = cudart.make_cudaPos(0, 0, 0) + memcpy_params.dstPtr = cudart.make_cudaPitchedPtr( + input_vec_d, np.dtype(np.float32).itemsize * input_size, input_size, 1 ) - 
memcpyParams.extent = cudart.make_cudaExtent(np.dtype(np.float32).itemsize * inputSize, 1, 1) - memcpyParams.kind = cudart.cudaMemcpyKind.cudaMemcpyHostToDevice + memcpy_params.extent = cudart.make_cudaExtent(np.dtype(np.float32).itemsize * input_size, 1, 1) + memcpy_params.kind = cudart.cudaMemcpyKind.cudaMemcpyHostToDevice - memsetParams.dst = outputVec_d - memsetParams.value = 0 - memsetParams.pitch = 0 - memsetParams.elementSize = np.dtype(np.float32).itemsize # elementSize can be max 4 bytes - memsetParams.width = numOfBlocks * 2 - memsetParams.height = 1 + memset_params.dst = output_vec_d + memset_params.value = 0 + memset_params.pitch = 0 + memset_params.elementSize = np.dtype(np.float32).itemsize # elementSize can be max 4 bytes + memset_params.width = num_of_blocks * 2 + memset_params.height = 1 - graph = checkCudaErrors(cudart.cudaGraphCreate(0)) + graph = check_cuda_errors(cudart.cudaGraphCreate(0)) - memcpyNode = checkCudaErrors(cudart.cudaGraphAddMemcpyNode(graph, None, 0, memcpyParams)) - memsetNode = checkCudaErrors(cudart.cudaGraphAddMemsetNode(graph, None, 0, memsetParams)) + memcpy_node = check_cuda_errors(cudart.cudaGraphAddMemcpyNode(graph, None, 0, memcpy_params)) + memset_node = check_cuda_errors(cudart.cudaGraphAddMemsetNode(graph, None, 0, memset_params)) - nodeDependencies.append(memsetNode) - nodeDependencies.append(memcpyNode) + node_dependencies.append(memset_node) + node_dependencies.append(memcpy_node) - kernelArgs = ( - (inputVec_d, outputVec_d, inputSize, numOfBlocks), + kernel_args = ( + (input_vec_d, output_vec_d, input_size, num_of_blocks), (ctypes.c_void_p, ctypes.c_void_p, ctypes.c_size_t, ctypes.c_uint), ) - kernelNodeParams.func = _reduce - kernelNodeParams.gridDimX = numOfBlocks - kernelNodeParams.gridDimY = kernelNodeParams.gridDimZ = 1 - kernelNodeParams.blockDimX = THREADS_PER_BLOCK - kernelNodeParams.blockDimY = kernelNodeParams.blockDimZ = 1 - kernelNodeParams.sharedMemBytes = 0 - kernelNodeParams.kernelParams = 
kernelArgs + kernel_node_params.func = _reduce + kernel_node_params.gridDimX = num_of_blocks + kernel_node_params.gridDimY = kernel_node_params.gridDimZ = 1 + kernel_node_params.blockDimX = THREADS_PER_BLOCK + kernel_node_params.blockDimY = kernel_node_params.blockDimZ = 1 + kernel_node_params.sharedMemBytes = 0 + kernel_node_params.kernelParams = kernel_args # kernelNodeParams.extra = None - kernelNode = checkCudaErrors( - cuda.cuGraphAddKernelNode(graph, nodeDependencies, len(nodeDependencies), kernelNodeParams) + kernel_node = check_cuda_errors( + cuda.cuGraphAddKernelNode(graph, node_dependencies, len(node_dependencies), kernel_node_params) ) - nodeDependencies.clear() - nodeDependencies.append(kernelNode) - - memsetParams = cudart.cudaMemsetParams() - memsetParams.dst = result_d - memsetParams.value = 0 - memsetParams.elementSize = np.dtype(np.float32).itemsize - memsetParams.width = 2 - memsetParams.height = 1 - memsetNode = checkCudaErrors(cudart.cudaGraphAddMemsetNode(graph, None, 0, memsetParams)) - - nodeDependencies.append(memsetNode) - - kernelNodeParams = cuda.CUDA_KERNEL_NODE_PARAMS() - kernelNodeParams.func = _reduceFinal - kernelNodeParams.gridDimX = kernelNodeParams.gridDimY = kernelNodeParams.gridDimZ = 1 - kernelNodeParams.blockDimX = THREADS_PER_BLOCK - kernelNodeParams.blockDimY = kernelNodeParams.blockDimZ = 1 - kernelNodeParams.sharedMemBytes = 0 - kernelArgs2 = ( - (outputVec_d, result_d, numOfBlocks), + node_dependencies.clear() + node_dependencies.append(kernel_node) + + memset_params = cudart.cudaMemsetParams() + memset_params.dst = result_d + memset_params.value = 0 + memset_params.elementSize = np.dtype(np.float32).itemsize + memset_params.width = 2 + memset_params.height = 1 + memset_node = check_cuda_errors(cudart.cudaGraphAddMemsetNode(graph, None, 0, memset_params)) + + node_dependencies.append(memset_node) + + kernel_node_params = cuda.CUDA_KERNEL_NODE_PARAMS() + kernel_node_params.func = _reduceFinal + kernel_node_params.gridDimX 
= kernel_node_params.gridDimY = kernel_node_params.gridDimZ = 1 + kernel_node_params.blockDimX = THREADS_PER_BLOCK + kernel_node_params.blockDimY = kernel_node_params.blockDimZ = 1 + kernel_node_params.sharedMemBytes = 0 + kernel_args2 = ( + (output_vec_d, result_d, num_of_blocks), (ctypes.c_void_p, ctypes.c_void_p, ctypes.c_uint), ) - kernelNodeParams.kernelParams = kernelArgs2 + kernel_node_params.kernelParams = kernel_args2 # kernelNodeParams.extra = None - kernelNode = checkCudaErrors( - cuda.cuGraphAddKernelNode(graph, nodeDependencies, len(nodeDependencies), kernelNodeParams) + kernel_node = check_cuda_errors( + cuda.cuGraphAddKernelNode(graph, node_dependencies, len(node_dependencies), kernel_node_params) ) - nodeDependencies.clear() - nodeDependencies.append(kernelNode) - - memcpyParams = cudart.cudaMemcpy3DParms() - - memcpyParams.srcArray = None - memcpyParams.srcPos = cudart.make_cudaPos(0, 0, 0) - memcpyParams.srcPtr = cudart.make_cudaPitchedPtr(result_d, np.dtype(np.float64).itemsize, 1, 1) - memcpyParams.dstArray = None - memcpyParams.dstPos = cudart.make_cudaPos(0, 0, 0) - memcpyParams.dstPtr = cudart.make_cudaPitchedPtr(result_h, np.dtype(np.float64).itemsize, 1, 1) - memcpyParams.extent = cudart.make_cudaExtent(np.dtype(np.float64).itemsize, 1, 1) - memcpyParams.kind = cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost - memcpyNode = checkCudaErrors( - cudart.cudaGraphAddMemcpyNode(graph, nodeDependencies, len(nodeDependencies), memcpyParams) + node_dependencies.clear() + node_dependencies.append(kernel_node) + + memcpy_params = cudart.cudaMemcpy3DParms() + + memcpy_params.srcArray = None + memcpy_params.srcPos = cudart.make_cudaPos(0, 0, 0) + memcpy_params.srcPtr = cudart.make_cudaPitchedPtr(result_d, np.dtype(np.float64).itemsize, 1, 1) + memcpy_params.dstArray = None + memcpy_params.dstPos = cudart.make_cudaPos(0, 0, 0) + memcpy_params.dstPtr = cudart.make_cudaPitchedPtr(result_h, np.dtype(np.float64).itemsize, 1, 1) + memcpy_params.extent = 
cudart.make_cudaExtent(np.dtype(np.float64).itemsize, 1, 1) + memcpy_params.kind = cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost + memcpy_node = check_cuda_errors( + cudart.cudaGraphAddMemcpyNode(graph, node_dependencies, len(node_dependencies), memcpy_params) ) - nodeDependencies.clear() - nodeDependencies.append(memcpyNode) + node_dependencies.clear() + node_dependencies.append(memcpy_node) # WIP: Host nodes - nodes, numNodes = checkCudaErrors(cudart.cudaGraphGetNodes(graph)) - print(f"\nNum of nodes in the graph created manually = {numNodes}") + nodes, num_nodes = check_cuda_errors(cudart.cudaGraphGetNodes(graph)) + print(f"\nNum of nodes in the graph created manually = {num_nodes}") - graphExec = checkCudaErrors(cudart.cudaGraphInstantiate(graph, 0)) + graph_exec = check_cuda_errors(cudart.cudaGraphInstantiate(graph, 0)) - clonedGraph = checkCudaErrors(cudart.cudaGraphClone(graph)) - clonedGraphExec = checkCudaErrors(cudart.cudaGraphInstantiate(clonedGraph, 0)) + cloned_graph = check_cuda_errors(cudart.cudaGraphClone(graph)) + cloned_graph_exec = check_cuda_errors(cudart.cudaGraphInstantiate(cloned_graph, 0)) for _i in range(GRAPH_LAUNCH_ITERATIONS): - checkCudaErrors(cudart.cudaGraphLaunch(graphExec, streamForGraph)) + check_cuda_errors(cudart.cudaGraphLaunch(graph_exec, stream_for_graph)) - checkCudaErrors(cudart.cudaStreamSynchronize(streamForGraph)) + check_cuda_errors(cudart.cudaStreamSynchronize(stream_for_graph)) print("Cloned Graph Output..") for _i in range(GRAPH_LAUNCH_ITERATIONS): - checkCudaErrors(cudart.cudaGraphLaunch(clonedGraphExec, streamForGraph)) + check_cuda_errors(cudart.cudaGraphLaunch(cloned_graph_exec, stream_for_graph)) - checkCudaErrors(cudart.cudaStreamSynchronize(streamForGraph)) + check_cuda_errors(cudart.cudaStreamSynchronize(stream_for_graph)) - checkCudaErrors(cudart.cudaGraphExecDestroy(graphExec)) - checkCudaErrors(cudart.cudaGraphExecDestroy(clonedGraphExec)) - checkCudaErrors(cudart.cudaGraphDestroy(graph)) - 
checkCudaErrors(cudart.cudaGraphDestroy(clonedGraph)) - checkCudaErrors(cudart.cudaStreamDestroy(streamForGraph)) + check_cuda_errors(cudart.cudaGraphExecDestroy(graph_exec)) + check_cuda_errors(cudart.cudaGraphExecDestroy(cloned_graph_exec)) + check_cuda_errors(cudart.cudaGraphDestroy(graph)) + check_cuda_errors(cudart.cudaGraphDestroy(cloned_graph)) + check_cuda_errors(cudart.cudaStreamDestroy(stream_for_graph)) -def cudaGraphsUsingStreamCapture(inputVec_h, inputVec_d, outputVec_d, result_d, inputSize, numOfBlocks): +def cuda_graphs_using_stream_capture(input_vec_h, input_vec_d, output_vec_d, result_d, input_size, num_of_blocks): result_h = ctypes.c_double(0.0) - stream1 = checkCudaErrors(cudart.cudaStreamCreate()) - stream2 = checkCudaErrors(cudart.cudaStreamCreate()) - stream3 = checkCudaErrors(cudart.cudaStreamCreate()) - streamForGraph = checkCudaErrors(cudart.cudaStreamCreate()) + stream1 = check_cuda_errors(cudart.cudaStreamCreate()) + stream2 = check_cuda_errors(cudart.cudaStreamCreate()) + stream3 = check_cuda_errors(cudart.cudaStreamCreate()) + stream_for_graph = check_cuda_errors(cudart.cudaStreamCreate()) - forkStreamEvent = checkCudaErrors(cudart.cudaEventCreate()) - memsetEvent1 = checkCudaErrors(cudart.cudaEventCreate()) - memsetEvent2 = checkCudaErrors(cudart.cudaEventCreate()) + fork_stream_event = check_cuda_errors(cudart.cudaEventCreate()) + memset_event1 = check_cuda_errors(cudart.cudaEventCreate()) + memset_event2 = check_cuda_errors(cudart.cudaEventCreate()) - checkCudaErrors(cudart.cudaStreamBeginCapture(stream1, cudart.cudaStreamCaptureMode.cudaStreamCaptureModeGlobal)) + check_cuda_errors(cudart.cudaStreamBeginCapture(stream1, cudart.cudaStreamCaptureMode.cudaStreamCaptureModeGlobal)) - checkCudaErrors(cudart.cudaEventRecord(forkStreamEvent, stream1)) - checkCudaErrors(cudart.cudaStreamWaitEvent(stream2, forkStreamEvent, 0)) - checkCudaErrors(cudart.cudaStreamWaitEvent(stream3, forkStreamEvent, 0)) + 
check_cuda_errors(cudart.cudaEventRecord(fork_stream_event, stream1)) + check_cuda_errors(cudart.cudaStreamWaitEvent(stream2, fork_stream_event, 0)) + check_cuda_errors(cudart.cudaStreamWaitEvent(stream3, fork_stream_event, 0)) - checkCudaErrors( + check_cuda_errors( cudart.cudaMemcpyAsync( - inputVec_d, - inputVec_h, - np.dtype(np.float32).itemsize * inputSize, + input_vec_d, + input_vec_h, + np.dtype(np.float32).itemsize * input_size, cudart.cudaMemcpyKind.cudaMemcpyDefault, stream1, ) ) - checkCudaErrors(cudart.cudaMemsetAsync(outputVec_d, 0, np.dtype(np.float64).itemsize * numOfBlocks, stream2)) + check_cuda_errors(cudart.cudaMemsetAsync(output_vec_d, 0, np.dtype(np.float64).itemsize * num_of_blocks, stream2)) - checkCudaErrors(cudart.cudaEventRecord(memsetEvent1, stream2)) + check_cuda_errors(cudart.cudaEventRecord(memset_event1, stream2)) - checkCudaErrors(cudart.cudaMemsetAsync(result_d, 0, np.dtype(np.float64).itemsize, stream3)) - checkCudaErrors(cudart.cudaEventRecord(memsetEvent2, stream3)) + check_cuda_errors(cudart.cudaMemsetAsync(result_d, 0, np.dtype(np.float64).itemsize, stream3)) + check_cuda_errors(cudart.cudaEventRecord(memset_event2, stream3)) - checkCudaErrors(cudart.cudaStreamWaitEvent(stream1, memsetEvent1, 0)) + check_cuda_errors(cudart.cudaStreamWaitEvent(stream1, memset_event1, 0)) - kernelArgs = ( - (inputVec_d, outputVec_d, inputSize, numOfBlocks), + kernel_args = ( + (input_vec_d, output_vec_d, input_size, num_of_blocks), (ctypes.c_void_p, ctypes.c_void_p, ctypes.c_size_t, ctypes.c_uint), ) - checkCudaErrors( + check_cuda_errors( cuda.cuLaunchKernel( _reduce, - numOfBlocks, + num_of_blocks, 1, 1, THREADS_PER_BLOCK, @@ -307,20 +307,20 @@ def cudaGraphsUsingStreamCapture(inputVec_h, inputVec_d, outputVec_d, result_d, 1, 0, stream1, - kernelArgs, + kernel_args, 0, ) ) - checkCudaErrors(cudart.cudaStreamWaitEvent(stream1, memsetEvent2, 0)) + check_cuda_errors(cudart.cudaStreamWaitEvent(stream1, memset_event2, 0)) - kernelArgs2 = ( - 
(outputVec_d, result_d, numOfBlocks), + kernel_args2 = ( + (output_vec_d, result_d, num_of_blocks), (ctypes.c_void_p, ctypes.c_void_p, ctypes.c_uint), ) - checkCudaErrors(cuda.cuLaunchKernel(_reduceFinal, 1, 1, 1, THREADS_PER_BLOCK, 1, 1, 0, stream1, kernelArgs2, 0)) + check_cuda_errors(cuda.cuLaunchKernel(_reduceFinal, 1, 1, 1, THREADS_PER_BLOCK, 1, 1, 0, stream1, kernel_args2, 0)) - checkCudaErrors( + check_cuda_errors( cudart.cudaMemcpyAsync( result_h, result_d, @@ -332,71 +332,67 @@ def cudaGraphsUsingStreamCapture(inputVec_h, inputVec_d, outputVec_d, result_d, # WIP: Host nodes - graph = checkCudaErrors(cudart.cudaStreamEndCapture(stream1)) + graph = check_cuda_errors(cudart.cudaStreamEndCapture(stream1)) - nodes, numNodes = checkCudaErrors(cudart.cudaGraphGetNodes(graph)) - print(f"\nNum of nodes in the graph created using stream capture API = {numNodes}") + nodes, num_nodes = check_cuda_errors(cudart.cudaGraphGetNodes(graph)) + print(f"\nNum of nodes in the graph created using stream capture API = {num_nodes}") - graphExec = checkCudaErrors(cudart.cudaGraphInstantiate(graph, 0)) + graph_exec = check_cuda_errors(cudart.cudaGraphInstantiate(graph, 0)) - clonedGraph = checkCudaErrors(cudart.cudaGraphClone(graph)) - clonedGraphExec = checkCudaErrors(cudart.cudaGraphInstantiate(clonedGraph, 0)) + cloned_graph = check_cuda_errors(cudart.cudaGraphClone(graph)) + cloned_graph_exec = check_cuda_errors(cudart.cudaGraphInstantiate(cloned_graph, 0)) for _i in range(GRAPH_LAUNCH_ITERATIONS): - checkCudaErrors(cudart.cudaGraphLaunch(graphExec, streamForGraph)) + check_cuda_errors(cudart.cudaGraphLaunch(graph_exec, stream_for_graph)) - checkCudaErrors(cudart.cudaStreamSynchronize(streamForGraph)) + check_cuda_errors(cudart.cudaStreamSynchronize(stream_for_graph)) print("Cloned Graph Output..") for _i in range(GRAPH_LAUNCH_ITERATIONS): - checkCudaErrors(cudart.cudaGraphLaunch(clonedGraphExec, streamForGraph)) + check_cuda_errors(cudart.cudaGraphLaunch(cloned_graph_exec, 
stream_for_graph)) - checkCudaErrors(cudart.cudaStreamSynchronize(streamForGraph)) + check_cuda_errors(cudart.cudaStreamSynchronize(stream_for_graph)) - checkCudaErrors(cudart.cudaGraphExecDestroy(graphExec)) - checkCudaErrors(cudart.cudaGraphExecDestroy(clonedGraphExec)) - checkCudaErrors(cudart.cudaGraphDestroy(graph)) - checkCudaErrors(cudart.cudaGraphDestroy(clonedGraph)) - checkCudaErrors(cudart.cudaEventDestroy(memsetEvent2)) - checkCudaErrors(cudart.cudaEventDestroy(memsetEvent1)) - checkCudaErrors(cudart.cudaEventDestroy(forkStreamEvent)) - checkCudaErrors(cudart.cudaStreamDestroy(stream3)) - checkCudaErrors(cudart.cudaStreamDestroy(stream1)) - checkCudaErrors(cudart.cudaStreamDestroy(stream2)) - checkCudaErrors(cudart.cudaStreamDestroy(streamForGraph)) + check_cuda_errors(cudart.cudaGraphExecDestroy(graph_exec)) + check_cuda_errors(cudart.cudaGraphExecDestroy(cloned_graph_exec)) + check_cuda_errors(cudart.cudaGraphDestroy(graph)) + check_cuda_errors(cudart.cudaGraphDestroy(cloned_graph)) + check_cuda_errors(cudart.cudaStreamDestroy(stream1)) + check_cuda_errors(cudart.cudaStreamDestroy(stream2)) + check_cuda_errors(cudart.cudaStreamDestroy(stream_for_graph)) def main(): size = 1 << 24 # number of elements to reduce - maxBlocks = 512 + max_blocks = 512 # This will pick the best possible CUDA capable device - devID = findCudaDevice() + dev_id = find_cuda_device() global _reduce global _reduceFinal - with common.KernelHelper(simpleCudaGraphs, devID) as kernelHelper: - _reduce = kernelHelper.getFunction(b"reduce") - _reduceFinal = kernelHelper.getFunction(b"reduceFinal") + kernel_helper = common.KernelHelper(simple_cuda_graphs, dev_id) + _reduce = kernel_helper.get_function(b"reduce") + _reduceFinal = kernel_helper.get_function(b"reduceFinal") - print(f"{size} elements") - print(f"threads per block = {THREADS_PER_BLOCK}") - print(f"Graph Launch iterations = {GRAPH_LAUNCH_ITERATIONS}") + print(f"{size} elements") + print(f"threads per block = 
{THREADS_PER_BLOCK}") + print(f"Graph Launch iterations = {GRAPH_LAUNCH_ITERATIONS}") - inputVec_h = checkCudaErrors(cudart.cudaMallocHost(size * np.dtype(np.float32).itemsize)) - inputVec_d = checkCudaErrors(cudart.cudaMalloc(size * np.dtype(np.float32).itemsize)) - outputVec_d = checkCudaErrors(cudart.cudaMalloc(maxBlocks * np.dtype(np.float64).itemsize)) - result_d = checkCudaErrors(cudart.cudaMalloc(np.dtype(np.float64).itemsize)) + input_vec_h = check_cuda_errors(cudart.cudaMallocHost(size * np.dtype(np.float32).itemsize)) + input_vec_d = check_cuda_errors(cudart.cudaMalloc(size * np.dtype(np.float32).itemsize)) + output_vec_d = check_cuda_errors(cudart.cudaMalloc(max_blocks * np.dtype(np.float64).itemsize)) + result_d = check_cuda_errors(cudart.cudaMalloc(np.dtype(np.float64).itemsize)) - init_input(inputVec_h, size) + init_input(input_vec_h, size) - cudaGraphsManual(inputVec_h, inputVec_d, outputVec_d, result_d, size, maxBlocks) - cudaGraphsUsingStreamCapture(inputVec_h, inputVec_d, outputVec_d, result_d, size, maxBlocks) + cuda_graphs_manual(input_vec_h, input_vec_d, output_vec_d, result_d, size, max_blocks) + cuda_graphs_using_stream_capture(input_vec_h, input_vec_d, output_vec_d, result_d, size, max_blocks) - checkCudaErrors(cudart.cudaFree(inputVec_d)) - checkCudaErrors(cudart.cudaFree(outputVec_d)) - checkCudaErrors(cudart.cudaFree(result_d)) - checkCudaErrors(cudart.cudaFreeHost(inputVec_h)) + check_cuda_errors(cudart.cudaFree(input_vec_d)) + check_cuda_errors(cudart.cudaFree(output_vec_d)) + check_cuda_errors(cudart.cudaFree(result_d)) + check_cuda_errors(cudart.cudaFreeHost(input_vec_h)) if __name__ == "__main__": diff --git a/cuda_bindings/examples/4_CUDA_Libraries/conjugateGradientMultiBlockCG_test.py b/cuda_bindings/examples/4_CUDA_Libraries/conjugateGradientMultiBlockCG_test.py index 257a7afa14..8ef5506257 100644 --- a/cuda_bindings/examples/4_CUDA_Libraries/conjugateGradientMultiBlockCG_test.py +++ 
b/cuda_bindings/examples/4_CUDA_Libraries/conjugateGradientMultiBlockCG_test.py @@ -9,12 +9,12 @@ import numpy as np from common import common -from common.helper_cuda import checkCudaErrors, findCudaDevice +from common.helper_cuda import check_cuda_errors, find_cuda_device from cuda.bindings import driver as cuda from cuda.bindings import runtime as cudart -conjugateGradientMultiBlockCG = """\ +conjugate_gradient_multi_block_cg = """\ #line __LINE__ #include #include @@ -163,37 +163,37 @@ """ -def genTridiag(I, J, val, N, nz): - I[0] = 0 - J[0] = 0 - J[1] = 0 +def gen_tridiag(i, j, val, n, nz): + i[0] = 0 + j[0] = 0 + j[1] = 0 val[0] = float(random()) + 10.0 val[1] = float(random()) - for i in range(1, N): + for i in range(1, n): if i > 1: - I[i] = I[i - 1] + 3 + i[i] = i[i - 1] + 3 else: - I[1] = 2 + i[1] = 2 start = (i - 1) * 3 + 2 - J[start] = i - 1 - J[start + 1] = i + j[start] = i - 1 + j[start + 1] = i - if i < N - 1: - J[start + 2] = i + 1 + if i < n - 1: + j[start + 2] = i + 1 val[start] = val[start - 1] val[start + 1] = float(random()) + 10.0 - if i < N - 1: + if i < n - 1: val[start + 2] = float(random()) - I[N] = nz + i[n] = nz THREADS_PER_BLOCK = 512 -sSDKname = "conjugateGradientMultiBlockCG" +s_sd_kname = "conjugateGradientMultiBlockCG" def main(): @@ -214,139 +214,137 @@ def main(): pytest.skip("conjugateGradientMultiBlockCG is not supported on QNX") # This will pick the best possible CUDA capable device - devID = findCudaDevice() - deviceProp = checkCudaErrors(cudart.cudaGetDeviceProperties(devID)) + dev_id = find_cuda_device() + device_prop = check_cuda_errors(cudart.cudaGetDeviceProperties(dev_id)) - if not deviceProp.managedMemory: + if not device_prop.managedMemory: pytest.skip("Unified Memory not supported on this device") # This sample requires being run on a device that supports Cooperative Kernel # Launch - if not deviceProp.cooperativeLaunch: - pytest.skip(f"Selected GPU {devID} does not support Cooperative Kernel Launch") + if not 
device_prop.cooperativeLaunch: + pytest.skip(f"Selected GPU {dev_id} does not support Cooperative Kernel Launch") # Statistics about the GPU device print( - f"> GPU device has {deviceProp.multiProcessorCount:%d} Multi-Processors, SM {deviceProp.major:%d}.{deviceProp.minor:%d} compute capabilities\n" + f"> GPU device has {device_prop.multiProcessorCount:%d} Multi-Processors, SM {device_prop.major:%d}.{device_prop.minor:%d} compute capabilities\n" ) # Get kernel - with common.KernelHelper(conjugateGradientMultiBlockCG, devID) as kernelHelper: - _gpuConjugateGradient = kernelHelper.getFunction(b"gpuConjugateGradient") - - # Generate a random tridiagonal symmetric matrix in CSR format - N = 1048576 - nz = (N - 2) * 3 + 4 - - I = checkCudaErrors(cudart.cudaMallocManaged(np.dtype(np.int32).itemsize * (N + 1), cudart.cudaMemAttachGlobal)) - J = checkCudaErrors(cudart.cudaMallocManaged(np.dtype(np.int32).itemsize * nz, cudart.cudaMemAttachGlobal)) - val = checkCudaErrors(cudart.cudaMallocManaged(np.dtype(np.float32).itemsize * nz, cudart.cudaMemAttachGlobal)) - I_local = (ctypes.c_int * (N + 1)).from_address(I) - J_local = (ctypes.c_int * nz).from_address(J) - val_local = (ctypes.c_float * nz).from_address(val) - - genTridiag(I_local, J_local, val_local, N, nz) - - x = checkCudaErrors(cudart.cudaMallocManaged(np.dtype(np.float32).itemsize * N, cudart.cudaMemAttachGlobal)) - rhs = checkCudaErrors(cudart.cudaMallocManaged(np.dtype(np.float32).itemsize * N, cudart.cudaMemAttachGlobal)) - dot_result = checkCudaErrors( - cudart.cudaMallocManaged(np.dtype(np.float64).itemsize, cudart.cudaMemAttachGlobal) - ) - x_local = (ctypes.c_float * N).from_address(x) - rhs_local = (ctypes.c_float * N).from_address(rhs) - dot_result_local = (ctypes.c_double).from_address(dot_result) - dot_result_local = 0 - - # temp memory for CG - r = checkCudaErrors(cudart.cudaMallocManaged(np.dtype(np.float32).itemsize * N, cudart.cudaMemAttachGlobal)) - p = 
checkCudaErrors(cudart.cudaMallocManaged(np.dtype(np.float32).itemsize * N, cudart.cudaMemAttachGlobal)) - Ax = checkCudaErrors(cudart.cudaMallocManaged(np.dtype(np.float32).itemsize * N, cudart.cudaMemAttachGlobal)) - r_local = (ctypes.c_float * N).from_address(r) - - checkCudaErrors(cudart.cudaDeviceSynchronize()) - - start = checkCudaErrors(cudart.cudaEventCreate()) - stop = checkCudaErrors(cudart.cudaEventCreate()) - - for i in range(N): - r_local[i] = rhs_local[i] = 1.0 - x_local[i] = 0.0 - - kernelArgs_value = (I, J, val, x, Ax, p, r, dot_result, nz, N, tol) - kernelArgs_types = ( - ctypes.c_void_p, - ctypes.c_void_p, - ctypes.c_void_p, - ctypes.c_void_p, - ctypes.c_void_p, - ctypes.c_void_p, - ctypes.c_void_p, - ctypes.c_void_p, - ctypes.c_int, - ctypes.c_int, - ctypes.c_float, - ) - kernelArgs = (kernelArgs_value, kernelArgs_types) + kernel_helper = common.KernelHelper(conjugate_gradient_multi_block_cg, dev_id) + _gpu_conjugate_gradient = kernel_helper.get_function(b"gpuConjugateGradient") + + # Generate a random tridiagonal symmetric matrix in CSR format + n = 1048576 + nz = (n - 2) * 3 + 4 + + i = check_cuda_errors(cudart.cudaMallocManaged(np.dtype(np.int32).itemsize * (n + 1), cudart.cudaMemAttachGlobal)) + j = check_cuda_errors(cudart.cudaMallocManaged(np.dtype(np.int32).itemsize * nz, cudart.cudaMemAttachGlobal)) + val = check_cuda_errors(cudart.cudaMallocManaged(np.dtype(np.float32).itemsize * nz, cudart.cudaMemAttachGlobal)) + i_local = (ctypes.c_int * (n + 1)).from_address(i) + j_local = (ctypes.c_int * nz).from_address(j) + val_local = (ctypes.c_float * nz).from_address(val) + + gen_tridiag(i_local, j_local, val_local, n, nz) + + x = check_cuda_errors(cudart.cudaMallocManaged(np.dtype(np.float32).itemsize * n, cudart.cudaMemAttachGlobal)) + rhs = check_cuda_errors(cudart.cudaMallocManaged(np.dtype(np.float32).itemsize * n, cudart.cudaMemAttachGlobal)) + dot_result = check_cuda_errors(cudart.cudaMallocManaged(np.dtype(np.float64).itemsize, 
cudart.cudaMemAttachGlobal)) + x_local = (ctypes.c_float * n).from_address(x) + rhs_local = (ctypes.c_float * n).from_address(rhs) + dot_result_local = (ctypes.c_double).from_address(dot_result) + dot_result_local = 0 + + # temp memory for CG + r = check_cuda_errors(cudart.cudaMallocManaged(np.dtype(np.float32).itemsize * n, cudart.cudaMemAttachGlobal)) + p = check_cuda_errors(cudart.cudaMallocManaged(np.dtype(np.float32).itemsize * n, cudart.cudaMemAttachGlobal)) + ax = check_cuda_errors(cudart.cudaMallocManaged(np.dtype(np.float32).itemsize * n, cudart.cudaMemAttachGlobal)) + r_local = (ctypes.c_float * n).from_address(r) + + check_cuda_errors(cudart.cudaDeviceSynchronize()) + + start = check_cuda_errors(cudart.cudaEventCreate()) + stop = check_cuda_errors(cudart.cudaEventCreate()) + + for i in range(n): + r_local[i] = rhs_local[i] = 1.0 + x_local[i] = 0.0 + + kernel_args_value = (i, j, val, x, ax, p, r, dot_result, nz, n, tol) + kernel_args_types = ( + ctypes.c_void_p, + ctypes.c_void_p, + ctypes.c_void_p, + ctypes.c_void_p, + ctypes.c_void_p, + ctypes.c_void_p, + ctypes.c_void_p, + ctypes.c_void_p, + ctypes.c_int, + ctypes.c_int, + ctypes.c_float, + ) + kernel_args = (kernel_args_value, kernel_args_types) - sMemSize = np.dtype(np.float64).itemsize * ((THREADS_PER_BLOCK / 32) + 1) - numThreads = THREADS_PER_BLOCK - numBlocksPerSm = checkCudaErrors( - cuda.cuOccupancyMaxActiveBlocksPerMultiprocessor(_gpuConjugateGradient, numThreads, sMemSize) - ) - numSms = deviceProp.multiProcessorCount - dimGrid = cudart.dim3() - dimGrid.x = numSms * numBlocksPerSm - dimGrid.y = 1 - dimGrid.z = 1 - dimBlock = cudart.dim3() - dimBlock.x = THREADS_PER_BLOCK - dimBlock.y = 1 - dimBlock.z = 1 - - checkCudaErrors(cudart.cudaEventRecord(start, 0)) - checkCudaErrors( - cuda.cuLaunchCooperativeKernel( - _gpuConjugateGradient, - dimGrid.x, - dimGrid.y, - dimGrid.z, - dimBlock.x, - dimBlock.y, - dimBlock.z, - 0, - 0, - kernelArgs, - ) + s_mem_size = np.dtype(np.float64).itemsize * 
((THREADS_PER_BLOCK / 32) + 1) + num_threads = THREADS_PER_BLOCK + num_blocks_per_sm = check_cuda_errors( + cuda.cuOccupancyMaxActiveBlocksPerMultiprocessor(_gpu_conjugate_gradient, num_threads, s_mem_size) + ) + num_sms = device_prop.multiProcessorCount + dim_grid = cudart.dim3() + dim_grid.x = num_sms * num_blocks_per_sm + dim_grid.y = 1 + dim_grid.z = 1 + dim_block = cudart.dim3() + dim_block.x = THREADS_PER_BLOCK + dim_block.y = 1 + dim_block.z = 1 + + check_cuda_errors(cudart.cudaEventRecord(start, 0)) + check_cuda_errors( + cuda.cuLaunchCooperativeKernel( + _gpu_conjugate_gradient, + dim_grid.x, + dim_grid.y, + dim_grid.z, + dim_block.x, + dim_block.y, + dim_block.z, + 0, + 0, + kernel_args, ) - checkCudaErrors(cudart.cudaEventRecord(stop, 0)) - checkCudaErrors(cudart.cudaDeviceSynchronize()) - - time = checkCudaErrors(cudart.cudaEventElapsedTime(start, stop)) - print(f"GPU Final, residual = {math.sqrt(dot_result_local):e}, kernel execution time = {time:f} ms") - - err = 0.0 - for i in range(N): - rsum = 0.0 - - for j in range(I_local[i], I_local[i + 1]): - rsum += val_local[j] * x_local[J_local[j]] - - diff = math.fabs(rsum - rhs_local[i]) - - if diff > err: - err = diff - - checkCudaErrors(cudart.cudaFree(I)) - checkCudaErrors(cudart.cudaFree(J)) - checkCudaErrors(cudart.cudaFree(val)) - checkCudaErrors(cudart.cudaFree(x)) - checkCudaErrors(cudart.cudaFree(rhs)) - checkCudaErrors(cudart.cudaFree(r)) - checkCudaErrors(cudart.cudaFree(p)) - checkCudaErrors(cudart.cudaFree(Ax)) - checkCudaErrors(cudart.cudaFree(dot_result)) - checkCudaErrors(cudart.cudaEventDestroy(start)) - checkCudaErrors(cudart.cudaEventDestroy(stop)) + ) + check_cuda_errors(cudart.cudaEventRecord(stop, 0)) + check_cuda_errors(cudart.cudaDeviceSynchronize()) + + time = check_cuda_errors(cudart.cudaEventElapsedTime(start, stop)) + print(f"GPU Final, residual = {math.sqrt(dot_result_local):e}, kernel execution time = {time:f} ms") + + err = 0.0 + for i in range(n): + rsum = 0.0 + + for j in 
range(i_local[i], i_local[i + 1]): + rsum += val_local[j] * x_local[j_local[j]] + + diff = math.fabs(rsum - rhs_local[i]) + + if diff > err: + err = diff + + check_cuda_errors(cudart.cudaFree(i)) + check_cuda_errors(cudart.cudaFree(j)) + check_cuda_errors(cudart.cudaFree(val)) + check_cuda_errors(cudart.cudaFree(x)) + check_cuda_errors(cudart.cudaFree(rhs)) + check_cuda_errors(cudart.cudaFree(r)) + check_cuda_errors(cudart.cudaFree(p)) + check_cuda_errors(cudart.cudaFree(ax)) + check_cuda_errors(cudart.cudaFree(dot_result)) + check_cuda_errors(cudart.cudaEventDestroy(start)) + check_cuda_errors(cudart.cudaEventDestroy(stop)) print(f"Test Summary: Error amount = {err:f}") if math.sqrt(dot_result_local) >= tol: diff --git a/cuda_bindings/examples/common/common.py b/cuda_bindings/examples/common/common.py index 8723abe26a..5b5151ef24 100644 --- a/cuda_bindings/examples/common/common.py +++ b/cuda_bindings/examples/common/common.py @@ -2,10 +2,8 @@ # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE -from contextlib import suppress - import numpy as np -from common.helper_cuda import checkCudaErrors +from common.helper_cuda import check_cuda_errors from cuda import pathfinder from cuda.bindings import driver as cuda @@ -13,14 +11,14 @@ from cuda.bindings import runtime as cudart -def pytest_skipif_compute_capability_too_low(devID, required_cc_major_minor): +def pytest_skipif_compute_capability_too_low(dev_id, required_cc_major_minor): import pytest - cc_major = checkCudaErrors( - cudart.cudaDeviceGetAttribute(cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMajor, devID) + cc_major = check_cuda_errors( + cudart.cudaDeviceGetAttribute(cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMajor, dev_id) ) - cc_minor = checkCudaErrors( - cudart.cudaDeviceGetAttribute(cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMinor, devID) + cc_minor = check_cuda_errors( + cudart.cudaDeviceGetAttribute(cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMinor, dev_id) ) 
have_cc_major_minor = (cc_major, cc_minor) if have_cc_major_minor < required_cc_major_minor: @@ -28,8 +26,7 @@ def pytest_skipif_compute_capability_too_low(devID, required_cc_major_minor): class KernelHelper: - def __init__(self, code, devID): - self.module = None + def __init__(self, code, dev_id): include_dirs = [] for libname in ("cudart", "cccl"): hdr_dir = pathfinder.find_nvidia_header_directory(libname) @@ -39,18 +36,18 @@ def __init__(self, code, devID): pytest.skip(f'pathfinder.find_nvidia_header_directory("{libname}") returned None') include_dirs.append(hdr_dir) - prog = checkCudaErrors(nvrtc.nvrtcCreateProgram(str.encode(code), b"sourceCode.cu", 0, None, None)) + prog = check_cuda_errors(nvrtc.nvrtcCreateProgram(str.encode(code), b"sourceCode.cu", 0, None, None)) # Initialize CUDA - checkCudaErrors(cudart.cudaFree(0)) + check_cuda_errors(cudart.cudaFree(0)) - major = checkCudaErrors( - cudart.cudaDeviceGetAttribute(cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMajor, devID) + major = check_cuda_errors( + cudart.cudaDeviceGetAttribute(cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMajor, dev_id) ) - minor = checkCudaErrors( - cudart.cudaDeviceGetAttribute(cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMinor, devID) + minor = check_cuda_errors( + cudart.cudaDeviceGetAttribute(cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMinor, dev_id) ) - _, nvrtc_minor = checkCudaErrors(nvrtc.nvrtcVersion()) + _, nvrtc_minor = check_cuda_errors(nvrtc.nvrtcVersion()) use_cubin = nvrtc_minor >= 1 prefix = "sm" if use_cubin else "compute" arch_arg = bytes(f"--gpu-architecture={prefix}_{major}{minor}", "ascii") @@ -65,44 +62,27 @@ def __init__(self, code, devID): opts.append(f"--include-path={inc_dir}".encode()) try: - checkCudaErrors(nvrtc.nvrtcCompileProgram(prog, len(opts), opts)) - - if use_cubin: - dataSize = checkCudaErrors(nvrtc.nvrtcGetCUBINSize(prog)) - data = b" " * dataSize - checkCudaErrors(nvrtc.nvrtcGetCUBIN(prog, data)) - else: - dataSize = 
checkCudaErrors(nvrtc.nvrtcGetPTXSize(prog)) - data = b" " * dataSize - checkCudaErrors(nvrtc.nvrtcGetPTX(prog, data)) + check_cuda_errors(nvrtc.nvrtcCompileProgram(prog, len(opts), opts)) except RuntimeError as err: - logSize = checkCudaErrors(nvrtc.nvrtcGetProgramLogSize(prog)) - log = b" " * logSize - checkCudaErrors(nvrtc.nvrtcGetProgramLog(prog, log)) + log_size = check_cuda_errors(nvrtc.nvrtcGetProgramLogSize(prog)) + log = b" " * log_size + check_cuda_errors(nvrtc.nvrtcGetProgramLog(prog, log)) import sys print(log.decode(), file=sys.stderr) print(err, file=sys.stderr) sys.exit(1) - finally: - checkCudaErrors(nvrtc.nvrtcDestroyProgram(prog)) - - self.module = checkCudaErrors(cuda.cuModuleLoadData(np.char.array(data))) - - def getFunction(self, name): - return checkCudaErrors(cuda.cuModuleGetFunction(self.module, name)) - - def close(self): - if self.module is not None: - checkCudaErrors(cuda.cuModuleUnload(self.module)) - self.module = None - def __enter__(self): - return self + if use_cubin: + data_size = check_cuda_errors(nvrtc.nvrtcGetCUBINSize(prog)) + data = b" " * data_size + check_cuda_errors(nvrtc.nvrtcGetCUBIN(prog, data)) + else: + data_size = check_cuda_errors(nvrtc.nvrtcGetPTXSize(prog)) + data = b" " * data_size + check_cuda_errors(nvrtc.nvrtcGetPTX(prog, data)) - def __exit__(self, exc_type, exc, tb): - self.close() + self.module = check_cuda_errors(cuda.cuModuleLoadData(np.char.array(data))) - def __del__(self): - with suppress(Exception): - self.close() + def get_function(self, name): + return check_cuda_errors(cuda.cuModuleGetFunction(self.module, name)) diff --git a/cuda_bindings/examples/common/helper_cuda.py b/cuda_bindings/examples/common/helper_cuda.py index d741eb54d9..9fbfe8c82f 100644 --- a/cuda_bindings/examples/common/helper_cuda.py +++ b/cuda_bindings/examples/common/helper_cuda.py @@ -1,14 +1,14 @@ # Copyright 2021-2025 NVIDIA Corporation. All rights reserved. 
# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE -from common.helper_string import checkCmdLineFlag, getCmdLineArgumentInt +from common.helper_string import check_cmd_line_flag, get_cmd_line_argument_int from cuda.bindings import driver as cuda from cuda.bindings import nvrtc from cuda.bindings import runtime as cudart -def _cudaGetErrorEnum(error): +def _cuda_get_error_enum(error): if isinstance(error, cuda.CUresult): err, name = cuda.cuGetErrorName(error) return name if err == cuda.CUresult.CUDA_SUCCESS else "" @@ -20,9 +20,9 @@ def _cudaGetErrorEnum(error): raise RuntimeError(f"Unknown error type: {error}") -def checkCudaErrors(result): +def check_cuda_errors(result): if result[0].value: - raise RuntimeError(f"CUDA error code={result[0].value}({_cudaGetErrorEnum(result[0])})") + raise RuntimeError(f"CUDA error code={result[0].value}({_cuda_get_error_enum(result[0])})") if len(result) == 1: return None elif len(result) == 2: @@ -31,18 +31,18 @@ def checkCudaErrors(result): return result[1:] -def findCudaDevice(): - devID = 0 - if checkCmdLineFlag("device="): - devID = getCmdLineArgumentInt("device=") - checkCudaErrors(cudart.cudaSetDevice(devID)) - return devID +def find_cuda_device(): + dev_id = 0 + if check_cmd_line_flag("device="): + dev_id = get_cmd_line_argument_int("device=") + check_cuda_errors(cudart.cudaSetDevice(dev_id)) + return dev_id -def findCudaDeviceDRV(): - devID = 0 - if checkCmdLineFlag("device="): - devID = getCmdLineArgumentInt("device=") - checkCudaErrors(cuda.cuInit(0)) - cuDevice = checkCudaErrors(cuda.cuDeviceGet(devID)) - return cuDevice +def find_cuda_device_drv(): + dev_id = 0 + if check_cmd_line_flag("device="): + dev_id = get_cmd_line_argument_int("device=") + check_cuda_errors(cuda.cuInit(0)) + cu_device = check_cuda_errors(cuda.cuDeviceGet(dev_id)) + return cu_device diff --git a/cuda_bindings/examples/common/helper_string.py b/cuda_bindings/examples/common/helper_string.py index 9f8e70a6c4..47d9d36569 100644 --- 
a/cuda_bindings/examples/common/helper_string.py +++ b/cuda_bindings/examples/common/helper_string.py @@ -4,12 +4,12 @@ import sys -def checkCmdLineFlag(stringRef): - return any(stringRef == i and k < len(sys.argv) - 1 for i, k in enumerate(sys.argv)) +def check_cmd_line_flag(string_ref): + return any(string_ref == i and k < len(sys.argv) - 1 for i, k in enumerate(sys.argv)) -def getCmdLineArgumentInt(stringRef): +def get_cmd_line_argument_int(string_ref): for i, k in enumerate(sys.argv): - if stringRef == i and k < len(sys.argv) - 1: + if string_ref == i and k < len(sys.argv) - 1: return sys.argv[k + 1] return 0 diff --git a/cuda_bindings/examples/extra/isoFDModelling_test.py b/cuda_bindings/examples/extra/isoFDModelling_test.py index 148d836adf..21303664ac 100644 --- a/cuda_bindings/examples/extra/isoFDModelling_test.py +++ b/cuda_bindings/examples/extra/isoFDModelling_test.py @@ -5,12 +5,12 @@ import numpy as np from common import common -from common.helper_cuda import checkCudaErrors +from common.helper_cuda import check_cuda_errors from cuda.bindings import driver as cuda from cuda.bindings import runtime as cudart -isoPropagator = """\ +iso_propagator = """\ extern "C" __global__ void injectSource(float *__restrict__ in, float *__restrict__ src, int it) { @@ -177,7 +177,7 @@ def align_ny(ny, blk, nops): # # this class contains the input params # -class params: +class Params: def __init__(self): self.BDIMX = 32 # tiles x y for fd operators self.BDIMY = 16 @@ -209,53 +209,53 @@ def __init__(self): # # this class contains all the kernels to be used bu propagator # -class cudaKernels: +class CudaKernels: def __init__(self, cntx): - checkCudaErrors(cuda.cuInit(0)) - checkCudaErrors(cuda.cuCtxSetCurrent(cntx)) - dev = checkCudaErrors(cuda.cuCtxGetDevice()) + check_cuda_errors(cuda.cuInit(0)) + check_cuda_errors(cuda.cuCtxSetCurrent(cntx)) + dev = check_cuda_errors(cuda.cuCtxGetDevice()) - self.kernelHelper = common.KernelHelper(isoPropagator, int(dev)) + 
self.kernel_helper = common.KernelHelper(iso_propagator, int(dev)) # kernel to create a source fnction with some max frequency - self.creatSource = self.kernelHelper.getFunction(b"createSource") + self.creatSource = self.kernel_helper.get_function(b"createSource") # create a velocity to try things: just a sphere on the middle 4500 m/s and 2500 m/s all around - self.createVelocity = self.kernelHelper.getFunction(b"createVelocity") + self.create_velocity = self.kernel_helper.get_function(b"createVelocity") # kernel to propagate the wavefield by 1 step in time - self.fdPropag = self.kernelHelper.getFunction(b"fwd_3D_orderX2k") + self.fdPropag = self.kernel_helper.get_function(b"fwd_3D_orderX2k") # kernel to propagate the wavefield by 1 step in time - self.injectSource = self.kernelHelper.getFunction(b"injectSource") + self.inject_source = self.kernel_helper.get_function(b"injectSource") # # this class contains: propagator, source creation, velocity creation # injection of data and domain exchange # -class propagator: +class Propagator: def __init__(self, params, _dev): print("init object for device ", _dev) self.dev = _dev - checkCudaErrors(cuda.cuInit(0)) - self.cuDevice = checkCudaErrors(cuda.cuDeviceGet(_dev)) - self.context = checkCudaErrors(cuda.cuCtxCreate(None, 0, self.cuDevice)) + check_cuda_errors(cuda.cuInit(0)) + self.cu_device = check_cuda_errors(cuda.cuDeviceGet(_dev)) + self.context = check_cuda_errors(cuda.cuCtxCreate(None, 0, self.cu_device)) self.waveOut = 0 self.waveIn = 0 - self.streamCenter = checkCudaErrors(cuda.cuStreamCreate(0)) - self.streamHalo = checkCudaErrors(cuda.cuStreamCreate(0)) - self.params = params + self.streamCenter = check_cuda_errors(cuda.cuStreamCreate(0)) + self.streamHalo = check_cuda_errors(cuda.cuStreamCreate(0)) + self.Params = params def __del__(self): - checkCudaErrors(cuda.cuCtxSetCurrent(self.context)) - checkCudaErrors(cuda.cuStreamDestroy(self.streamHalo)) - checkCudaErrors(cuda.cuStreamDestroy(self.streamCenter)) + 
check_cuda_errors(cuda.cuCtxSetCurrent(self.context)) + check_cuda_errors(cuda.cuStreamDestroy(self.streamHalo)) + check_cuda_errors(cuda.cuStreamDestroy(self.streamCenter)) if self.waveIn != 0: - checkCudaErrors(cuda.cuMemFree(self.waveIn)) + check_cuda_errors(cuda.cuMemFree(self.waveIn)) if self.waveOut != 0: - checkCudaErrors(cuda.cuMemFree(self.waveOut)) - checkCudaErrors(cuda.cuCtxDestroy(self.context)) + check_cuda_errors(cuda.cuMemFree(self.waveOut)) + check_cuda_errors(cuda.cuCtxDestroy(self.context)) # # swap waveIn with waveOut @@ -275,45 +275,45 @@ def swap(self): # allocate the device memory # def allocate(self): - nel = self.params.nx * self.params.ny * self.params.nz + nel = self.Params.nx * self.Params.ny * self.Params.nz n = np.array(nel, dtype=np.uint32) - bufferSize = n * np.dtype(np.float32).itemsize - checkCudaErrors(cuda.cuCtxSetCurrent(self.context)) + buffer_size = n * np.dtype(np.float32).itemsize + check_cuda_errors(cuda.cuCtxSetCurrent(self.context)) - self.velocity = checkCudaErrors(cuda.cuMemAlloc(bufferSize)) - checkCudaErrors(cuda.cuMemsetD32(self.velocity, 0, n)) + self.velocity = check_cuda_errors(cuda.cuMemAlloc(buffer_size)) + check_cuda_errors(cuda.cuMemsetD32(self.velocity, 0, n)) - nel += self.params.lead + nel += self.Params.lead n = np.array(nel, dtype=np.uint32) ## we need to align at the beginning of the tile - bufferSize = n * np.dtype(np.float32).itemsize - self.waveIn = checkCudaErrors(cuda.cuMemAlloc(bufferSize)) - checkCudaErrors(cuda.cuMemsetD32(self.waveIn, 0, n)) + buffer_size = n * np.dtype(np.float32).itemsize + self.waveIn = check_cuda_errors(cuda.cuMemAlloc(buffer_size)) + check_cuda_errors(cuda.cuMemsetD32(self.waveIn, 0, n)) - self.waveOut = checkCudaErrors(cuda.cuMemAlloc(bufferSize)) - checkCudaErrors(cuda.cuMemsetD32(self.waveOut, 0, n)) + self.waveOut = check_cuda_errors(cuda.cuMemAlloc(buffer_size)) + check_cuda_errors(cuda.cuMemsetD32(self.waveOut, 0, n)) - n = np.array(self.params.nt, dtype=np.uint32) - 
bufferSize = n * np.dtype(np.float32).itemsize - self.source = checkCudaErrors(cuda.cuMemAlloc(bufferSize)) - checkCudaErrors(cuda.cuMemsetD32(self.source, 0, n)) + n = np.array(self.Params.nt, dtype=np.uint32) + buffer_size = n * np.dtype(np.float32).itemsize + self.source = check_cuda_errors(cuda.cuMemAlloc(buffer_size)) + check_cuda_errors(cuda.cuMemsetD32(self.source, 0, n)) # # create source data # - def createSource(self, kernel): + def create_source(self, kernel): print("creating source on device ", self.dev) buf = np.array([int(self.source)], dtype=np.uint64) - nt = np.array(self.params.nt, dtype=np.uint32) - dt = np.array(self.params.dt, dtype=np.float32) - freq = np.array(self.params.freqMax, dtype=np.float32) + nt = np.array(self.Params.nt, dtype=np.uint32) + dt = np.array(self.Params.dt, dtype=np.float32) + freq = np.array(self.Params.freqMax, dtype=np.float32) args = [buf, dt, freq, nt] argsp = np.array([arg.ctypes.data for arg in args], dtype=np.uint64) - checkCudaErrors(cuda.cuCtxSetCurrent(self.context)) - checkCudaErrors( + check_cuda_errors(cuda.cuCtxSetCurrent(self.context)) + check_cuda_errors( cuda.cuLaunchKernel( kernel.creatSource, 1, @@ -328,34 +328,34 @@ def createSource(self, kernel): 0, ) ) # arguments - checkCudaErrors(cuda.cuStreamSynchronize(self.streamHalo)) + check_cuda_errors(cuda.cuStreamSynchronize(self.streamHalo)) # # inject source function: ony on the domain 0 # - def injectSource(self, kernel, iter): - checkCudaErrors(cuda.cuCtxSetCurrent(self.context)) + def inject_source(self, kernel, iter): + check_cuda_errors(cuda.cuCtxSetCurrent(self.context)) if self.dev != 0: return wavein = np.array([int(self.waveIn)], dtype=np.uint64) src = np.array([int(self.source)], dtype=np.uint64) - offset_sourceInject = ( - self.params.lead - + (int)(self.params.nz / 2) * self.params.nx * self.params.ny - + (int)(self.params.ny / 2) * self.params.nx - + (int)(self.params.nx / 2) + offset_source_inject = ( + self.Params.lead + + 
(int)(self.Params.nz / 2) * self.Params.nx * self.Params.ny + + (int)(self.Params.ny / 2) * self.Params.nx + + (int)(self.Params.nx / 2) ) - offset_sourceInject *= np.dtype(np.float32).itemsize + offset_source_inject *= np.dtype(np.float32).itemsize np_it = np.array(iter, dtype=np.uint32) - args = [wavein + offset_sourceInject, src, np_it] + args = [wavein + offset_source_inject, src, np_it] argsp = np.array([arg.ctypes.data for arg in args], dtype=np.uint64) - checkCudaErrors( + check_cuda_errors( cuda.cuLaunchKernel( - kernel.injectSource, + kernel.inject_source, 1, 1, 1, # grid dim @@ -372,39 +372,39 @@ def injectSource(self, kernel, iter): # # create velocity # - def createVelocity(self, kernel): + def create_velocity(self, kernel): print("running create velocity on device ", self.dev) offset_velocity = ( - self.params.FD_ORDER * self.params.nx * self.params.ny - + self.params.FD_ORDER * self.params.nx - + self.params.FD_ORDER + self.Params.FD_ORDER * self.Params.nx * self.Params.ny + + self.Params.FD_ORDER * self.Params.nx + + self.Params.FD_ORDER ) offset_velocity *= np.dtype(np.float32).itemsize vel = np.array([int(self.velocity)], dtype=np.uint64) - dx_dt2 = (self.params.dt * self.params.dt) / (self.params.delta * self.params.delta) + dx_dt2 = (self.Params.dt * self.Params.dt) / (self.Params.delta * self.Params.delta) - stride = self.params.nx * self.params.ny + stride = self.Params.nx * self.Params.ny np_dx_dt2 = np.array(dx_dt2, dtype=np.float32) - np_nz = np.array((self.params.nz - 2 * self.params.FD_ORDER), dtype=np.uint32) - np_nx = np.array(self.params.nx, dtype=np.uint32) + np_nz = np.array((self.Params.nz - 2 * self.Params.FD_ORDER), dtype=np.uint32) + np_nx = np.array(self.Params.nx, dtype=np.uint32) np_stride = np.array(stride, dtype=np.uint32) args = [vel + offset_velocity, np_dx_dt2, np_nz, np_nx, np_stride] argsp = np.array([arg.ctypes.data for arg in args], dtype=np.uint64) - checkCudaErrors(cuda.cuCtxSetCurrent(self.context)) + 
check_cuda_errors(cuda.cuCtxSetCurrent(self.context)) # do halo up - checkCudaErrors( + check_cuda_errors( cuda.cuLaunchKernel( - kernel.createVelocity, - self.params.blkx, - self.params.blky, + kernel.create_velocity, + self.Params.blkx, + self.Params.blky, 1, # grid dim - 2 * self.params.BDIMX, - self.params.BDIMY, + 2 * self.Params.BDIMX, + self.Params.BDIMY, 1, # block dim 0, self.streamHalo, # shared mem and stream @@ -412,22 +412,22 @@ def createVelocity(self, kernel): 0, ) ) # arguments - checkCudaErrors(cuda.cuStreamSynchronize(self.streamHalo)) + check_cuda_errors(cuda.cuStreamSynchronize(self.streamHalo)) # # execute the center part of propagation # - def executeCenter(self, kernel): + def execute_center(self, kernel): if verbose_prints: print("running center on device ", self.dev) - checkCudaErrors(cuda.cuCtxSetCurrent(self.context)) + check_cuda_errors(cuda.cuCtxSetCurrent(self.context)) offset_velocity = ( - 2 * self.params.FD_ORDER * self.params.nx * self.params.ny - + self.params.FD_ORDER * self.params.nx - + self.params.FD_ORDER + 2 * self.Params.FD_ORDER * self.Params.nx * self.Params.ny + + self.Params.FD_ORDER * self.Params.nx + + self.Params.FD_ORDER ) - offset_wave = self.params.lead + offset_velocity + offset_wave = self.Params.lead + offset_velocity offset_wave *= np.dtype(np.float32).itemsize offset_velocity *= np.dtype(np.float32).itemsize @@ -436,9 +436,9 @@ def executeCenter(self, kernel): waveout = np.array([int(self.waveOut)], dtype=np.uint64) vel = np.array([int(self.velocity)], dtype=np.uint64) - stride = self.params.nx * self.params.ny - np_nz = np.array(self.params.nz - 4 * self.params.FD_ORDER, dtype=np.uint32) - np_nx = np.array(self.params.nx, dtype=np.uint32) + stride = self.Params.nx * self.Params.ny + np_nz = np.array(self.Params.nz - 4 * self.Params.FD_ORDER, dtype=np.uint32) + np_nx = np.array(self.Params.nx, dtype=np.uint32) np_stride = np.array(stride, dtype=np.uint32) args = [ @@ -452,14 +452,14 @@ def executeCenter(self, 
kernel): argsp = np.array([arg.ctypes.data for arg in args], dtype=np.uint64) # do center propagation from 2 * fd_order to nz - 2 * fd_order - checkCudaErrors( + check_cuda_errors( cuda.cuLaunchKernel( kernel.fdPropag, - self.params.blkx, - self.params.blky, + self.Params.blkx, + self.Params.blky, 1, # grid dim - self.params.BDIMX, - self.params.BDIMY, + self.Params.BDIMX, + self.Params.BDIMY, 1, # block dim 0, self.streamCenter, # shared mem and stream @@ -471,18 +471,18 @@ def executeCenter(self, kernel): # # execute the halo part of propagation # - def executeHalo(self, kernel): + def execute_halo(self, kernel): if verbose_prints: print("running halos on device ", self.dev) - checkCudaErrors(cuda.cuCtxSetCurrent(self.context)) + check_cuda_errors(cuda.cuCtxSetCurrent(self.context)) offset_velocity = ( - self.params.FD_ORDER * self.params.nx * self.params.ny - + self.params.FD_ORDER * self.params.nx - + self.params.FD_ORDER + self.Params.FD_ORDER * self.Params.nx * self.Params.ny + + self.Params.FD_ORDER * self.Params.nx + + self.Params.FD_ORDER ) - offset_wave = self.params.lead + offset_velocity + offset_wave = self.Params.lead + offset_velocity offset_wave *= np.dtype(np.float32).itemsize offset_velocity *= np.dtype(np.float32).itemsize @@ -491,9 +491,9 @@ def executeHalo(self, kernel): waveout = np.array([int(self.waveOut)], dtype=np.uint64) vel = np.array([int(self.velocity)], dtype=np.uint64) - stride = self.params.nx * self.params.ny - np_nz = np.array(self.params.FD_ORDER, dtype=np.uint32) - np_nx = np.array(self.params.nx, dtype=np.uint32) + stride = self.Params.nx * self.Params.ny + np_nz = np.array(self.Params.FD_ORDER, dtype=np.uint32) + np_nx = np.array(self.Params.nx, dtype=np.uint32) np_stride = np.array(stride, dtype=np.uint32) args = [ @@ -507,14 +507,14 @@ def executeHalo(self, kernel): argsp = np.array([arg.ctypes.data for arg in args], dtype=np.uint64) # do halo up - checkCudaErrors( + check_cuda_errors( cuda.cuLaunchKernel( kernel.fdPropag, - 
self.params.blkx, - self.params.blky, + self.Params.blkx, + self.Params.blky, 1, # grid dim - self.params.BDIMX, - self.params.BDIMY, + self.Params.BDIMX, + self.Params.BDIMY, 1, # block dim 0, self.streamHalo, # shared mem and stream @@ -525,11 +525,11 @@ def executeHalo(self, kernel): # do halo down offset_velocity = ( - (self.params.nz - 2 * self.params.FD_ORDER) * self.params.nx * self.params.ny - + self.params.FD_ORDER * self.params.nx - + self.params.FD_ORDER + (self.Params.nz - 2 * self.Params.FD_ORDER) * self.Params.nx * self.Params.ny + + self.Params.FD_ORDER * self.Params.nx + + self.Params.FD_ORDER ) - offset_wave = self.params.lead + offset_velocity + offset_wave = self.Params.lead + offset_velocity offset_wave *= np.dtype(np.float32).itemsize offset_velocity *= np.dtype(np.float32).itemsize @@ -543,14 +543,14 @@ def executeHalo(self, kernel): np_stride, ] argsp = np.array([arg.ctypes.data for arg in args], dtype=np.uint64) - checkCudaErrors( + check_cuda_errors( cuda.cuLaunchKernel( kernel.fdPropag, - self.params.blkx, - self.params.blky, + self.Params.blkx, + self.Params.blky, 1, # grid dim - self.params.BDIMX, - self.params.BDIMY, + self.Params.BDIMX, + self.Params.BDIMY, 1, # block dim 0, self.streamHalo, # shared mem and stream @@ -562,79 +562,79 @@ def executeHalo(self, kernel): # # exchange the halos # - def exchangeHalo(self, propag): + def exchange_halo(self, propag): if verbose_prints: print("exchange halos on device ", self.dev, "with dev ", propag.dev) - checkCudaErrors(cuda.cuCtxSetCurrent(self.context)) + check_cuda_errors(cuda.cuCtxSetCurrent(self.context)) # # the following variables don't change # - nstride = self.params.nx * self.params.ny + nstride = self.Params.nx * self.Params.ny - devS = self.context - devD = propag.context + dev_s = self.context + dev_d = propag.context - n_exch = self.params.FD_ORDER * nstride + n_exch = self.Params.FD_ORDER * nstride n_exch *= np.dtype(np.float32).itemsize if self.dev < propag.dev: # exchange up 
- offsetS = self.params.lead + (self.params.nz - 2 * self.params.FD_ORDER) * nstride - offsetD = propag.params.lead + offset_s = self.Params.lead + (self.Params.nz - 2 * self.Params.FD_ORDER) * nstride + offset_d = propag.Params.lead - offsetS *= np.dtype(np.float32).itemsize - offsetD *= np.dtype(np.float32).itemsize + offset_s *= np.dtype(np.float32).itemsize + offset_d *= np.dtype(np.float32).itemsize - waveD = cuda.CUdeviceptr(int(propag.waveOut) + offsetD) - waveS = cuda.CUdeviceptr(int(self.waveOut) + offsetS) + wave_d = cuda.CUdeviceptr(int(propag.waveOut) + offset_d) + wave_s = cuda.CUdeviceptr(int(self.waveOut) + offset_s) - checkCudaErrors(cuda.cuMemcpyPeerAsync(waveD, devD, waveS, devS, n_exch, self.streamHalo)) + check_cuda_errors(cuda.cuMemcpyPeerAsync(wave_d, dev_d, wave_s, dev_s, n_exch, self.streamHalo)) else: # exchange down - offsetS = self.params.lead + self.params.FD_ORDER * nstride - offsetD = propag.params.lead + (propag.params.nz - propag.params.FD_ORDER) * nstride + offset_s = self.Params.lead + self.Params.FD_ORDER * nstride + offset_d = propag.Params.lead + (propag.Params.nz - propag.Params.FD_ORDER) * nstride - offsetS *= np.dtype(np.float32).itemsize - offsetD *= np.dtype(np.float32).itemsize + offset_s *= np.dtype(np.float32).itemsize + offset_d *= np.dtype(np.float32).itemsize - waveD = cuda.CUdeviceptr(int(propag.waveOut) + offsetD) - waveS = cuda.CUdeviceptr(int(self.waveOut) + offsetS) + wave_d = cuda.CUdeviceptr(int(propag.waveOut) + offset_d) + wave_s = cuda.CUdeviceptr(int(self.waveOut) + offset_s) - checkCudaErrors(cuda.cuMemcpyPeerAsync(waveD, devD, waveS, devS, n_exch, self.streamHalo)) + check_cuda_errors(cuda.cuMemcpyPeerAsync(wave_d, dev_d, wave_s, dev_s, n_exch, self.streamHalo)) # # sync stream # - def syncStream(self, stream): - checkCudaErrors(cuda.cuCtxSetCurrent(self.context)) - checkCudaErrors(cuda.cuStreamSynchronize(stream)) + def sync_stream(self, stream): + check_cuda_errors(cuda.cuCtxSetCurrent(self.context)) + 
check_cuda_errors(cuda.cuStreamSynchronize(stream)) def main(): - checkCudaErrors(cuda.cuInit(0)) + check_cuda_errors(cuda.cuInit(0)) # Number of GPUs print("Checking for multiple GPUs...") - gpu_n = checkCudaErrors(cuda.cuDeviceGetCount()) + gpu_n = check_cuda_errors(cuda.cuDeviceGetCount()) print(f"CUDA-capable device count: {gpu_n}") if gpu_n < 2: print("Two or more GPUs with Peer-to-Peer access capability are required") return - prop = [checkCudaErrors(cudart.cudaGetDeviceProperties(i)) for i in range(gpu_n)] + prop = [check_cuda_errors(cudart.cudaGetDeviceProperties(i)) for i in range(gpu_n)] # Check possibility for peer access print("\nChecking GPU(s) for support of peer to peer memory access...") - p2pCapableGPUs = [-1, -1] + p2p_capable_gp_us = [-1, -1] for i in range(gpu_n): - p2pCapableGPUs[0] = i + p2p_capable_gp_us[0] = i for j in range(gpu_n): if i == j: continue - i_access_j = checkCudaErrors(cudart.cudaDeviceCanAccessPeer(i, j)) - j_access_i = checkCudaErrors(cudart.cudaDeviceCanAccessPeer(j, i)) + i_access_j = check_cuda_errors(cudart.cudaDeviceCanAccessPeer(i, j)) + j_access_i = check_cuda_errors(cudart.cudaDeviceCanAccessPeer(j, i)) print( "> Peer access from {} (GPU{}) -> {} (GPU{}) : {}\n".format( prop[i].name, i, prop[j].name, j, "Yes" if i_access_j else "No" @@ -646,23 +646,23 @@ def main(): ) ) if i_access_j and j_access_i: - p2pCapableGPUs[1] = j + p2p_capable_gp_us[1] = j break - if p2pCapableGPUs[1] != -1: + if p2p_capable_gp_us[1] != -1: break - if p2pCapableGPUs[0] == -1 or p2pCapableGPUs[1] == -1: + if p2p_capable_gp_us[0] == -1 or p2p_capable_gp_us[1] == -1: print("Two or more GPUs with Peer-to-Peer access capability are required.") print("Peer to Peer access is not available amongst GPUs in the system, waiving test.") return # Use first pair of p2p capable GPUs detected - gpuid = [p2pCapableGPUs[0], p2pCapableGPUs[1]] + gpuid = [p2p_capable_gp_us[0], p2p_capable_gp_us[1]] # # init device # - pars = params() + pars = Params() # # 
create propagators @@ -674,16 +674,16 @@ def main(): # create kernels and propagators that are going to be used on device # for i in gpuid: - p = propagator(pars, i) - k = cudaKernels(p.context) + p = Propagator(pars, i) + k = CudaKernels(p.context) propags.append(p) kerns.append(k) # allocate resources in device for propag, kern in zip(propags, kerns): propag.allocate() - propag.createSource(kern) - propag.createVelocity(kern) + propag.create_source(kern) + propag.create_velocity(kern) # # loop over time iterations @@ -691,26 +691,26 @@ def main(): start = time.time() for it in range(pars.nt): for propag in propags: - propag.syncStream(propag.streamHalo) + propag.sync_stream(propag.streamHalo) for propag, kern in zip(propags, kerns): - propag.injectSource(kern, it) + propag.inject_source(kern, it) for propag, kern in zip(propags, kerns): - propag.executeHalo(kern) + propag.execute_halo(kern) for propag in propags: - propag.syncStream(propag.streamHalo) + propag.sync_stream(propag.streamHalo) - propags[1].exchangeHalo(propags[0]) + propags[1].exchange_halo(propags[0]) - propags[0].exchangeHalo(propags[1]) + propags[0].exchange_halo(propags[1]) for propag, kern in zip(propags, kerns): - propag.executeCenter(kern) + propag.execute_center(kern) for propag in propags: - propag.syncStream(propag.streamCenter) + propag.sync_stream(propag.streamCenter) for propag in propags: propag.swap() @@ -727,19 +727,19 @@ def main(): # nz = 2 * (int)(pars.nz - 2 * pars.FD_ORDER) print(" nz= ", nz, " nx= ", pars.nx) - hOut = np.zeros((nz, pars.nx), dtype="float32") + h_out = np.zeros((nz, pars.nx), dtype="float32") istart = 0 for propag in propags: - checkCudaErrors(cuda.cuCtxSetCurrent(propag.context)) + check_cuda_errors(cuda.cuCtxSetCurrent(propag.context)) offset = pars.lead + pars.FD_ORDER * pars.nx * pars.ny + (int)(pars.ny / 2) * pars.nx for j in range(pars.nz - 2 * pars.FD_ORDER): ptr = cuda.CUdeviceptr(int(propag.waveOut) + offset * 4) - checkCudaErrors( + check_cuda_errors( 
cuda.cuMemcpyDtoH( - hOut[istart].ctypes.data, + h_out[istart].ctypes.data, ptr, pars.nx * np.dtype(np.float32).itemsize, ) @@ -756,7 +756,7 @@ def main(): if display_graph: nrows = nz ncols = pars.nx - dbz = hOut + dbz = h_out dbz = np.reshape(dbz, (nrows, ncols)) ## diff --git a/cuda_bindings/examples/extra/jit_program_test.py b/cuda_bindings/examples/extra/jit_program_test.py index be78deafc1..80e7e73376 100644 --- a/cuda_bindings/examples/extra/jit_program_test.py +++ b/cuda_bindings/examples/extra/jit_program_test.py @@ -9,7 +9,7 @@ from cuda.bindings import nvrtc -def ASSERT_DRV(err): +def assert_drv(err): if isinstance(err, cuda.CUresult): if err != cuda.CUresult.CUDA_SUCCESS: raise RuntimeError(f"Cuda Error: {err}") @@ -35,31 +35,31 @@ def ASSERT_DRV(err): def main(): # Init (err,) = cuda.cuInit(0) - ASSERT_DRV(err) + assert_drv(err) # Device - err, cuDevice = cuda.cuDeviceGet(0) - ASSERT_DRV(err) + err, cu_device = cuda.cuDeviceGet(0) + assert_drv(err) # Ctx - err, context = cuda.cuCtxCreate(None, 0, cuDevice) - ASSERT_DRV(err) + err, context = cuda.cuCtxCreate(None, 0, cu_device) + assert_drv(err) # Create program err, prog = nvrtc.nvrtcCreateProgram(str.encode(saxpy), b"saxpy.cu", 0, None, None) - ASSERT_DRV(err) + assert_drv(err) # Get target architecture err, major = cuda.cuDeviceGetAttribute( - cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice + cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cu_device ) - ASSERT_DRV(err) + assert_drv(err) err, minor = cuda.cuDeviceGetAttribute( - cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice + cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cu_device ) - ASSERT_DRV(err) + assert_drv(err) err, nvrtc_major, nvrtc_minor = nvrtc.nvrtcVersion() - ASSERT_DRV(err) + assert_drv(err) use_cubin = nvrtc_minor >= 1 prefix = "sm" if use_cubin else "compute" arch_arg = bytes(f"--gpu-architecture={prefix}_{major}{minor}", 
"ascii") @@ -67,82 +67,80 @@ def main(): # Compile program opts = [b"--fmad=false", arch_arg] (err,) = nvrtc.nvrtcCompileProgram(prog, len(opts), opts) - ASSERT_DRV(err) + assert_drv(err) # Get log from compilation - err, logSize = nvrtc.nvrtcGetProgramLogSize(prog) - ASSERT_DRV(err) - log = b" " * logSize + err, log_size = nvrtc.nvrtcGetProgramLogSize(prog) + assert_drv(err) + log = b" " * log_size (err,) = nvrtc.nvrtcGetProgramLog(prog, log) - ASSERT_DRV(err) + assert_drv(err) print(log.decode()) # Get data from compilation if use_cubin: - err, dataSize = nvrtc.nvrtcGetCUBINSize(prog) - ASSERT_DRV(err) - data = b" " * dataSize + err, data_size = nvrtc.nvrtcGetCUBINSize(prog) + assert_drv(err) + data = b" " * data_size (err,) = nvrtc.nvrtcGetCUBIN(prog, data) - ASSERT_DRV(err) + assert_drv(err) else: - err, dataSize = nvrtc.nvrtcGetPTXSize(prog) - ASSERT_DRV(err) - data = b" " * dataSize + err, data_size = nvrtc.nvrtcGetPTXSize(prog) + assert_drv(err) + data = b" " * data_size (err,) = nvrtc.nvrtcGetPTX(prog, data) - ASSERT_DRV(err) - (err,) = nvrtc.nvrtcDestroyProgram(prog) - ASSERT_DRV(err) + assert_drv(err) # Load data as module data and retrieve function data = np.char.array(data) err, module = cuda.cuModuleLoadData(data) - ASSERT_DRV(err) + assert_drv(err) err, kernel = cuda.cuModuleGetFunction(module, b"saxpy") - ASSERT_DRV(err) + assert_drv(err) # Test the kernel - NUM_THREADS = 128 - NUM_BLOCKS = 32 + num_threads = 128 + num_blocks = 32 a = np.float32(2.0) - n = np.array(NUM_THREADS * NUM_BLOCKS, dtype=np.uint32) - bufferSize = n * a.itemsize + n = np.array(num_threads * num_blocks, dtype=np.uint32) + buffer_size = n * a.itemsize - err, dX = cuda.cuMemAlloc(bufferSize) - ASSERT_DRV(err) - err, dY = cuda.cuMemAlloc(bufferSize) - ASSERT_DRV(err) - err, dOut = cuda.cuMemAlloc(bufferSize) - ASSERT_DRV(err) + err, d_x = cuda.cuMemAlloc(buffer_size) + assert_drv(err) + err, d_y = cuda.cuMemAlloc(buffer_size) + assert_drv(err) + err, d_out = 
cuda.cuMemAlloc(buffer_size) + assert_drv(err) - hX = np.random.rand(n).astype(dtype=np.float32) - hY = np.random.rand(n).astype(dtype=np.float32) - hOut = np.zeros(n).astype(dtype=np.float32) + h_x = np.random.rand(n).astype(dtype=np.float32) + h_y = np.random.rand(n).astype(dtype=np.float32) + h_out = np.zeros(n).astype(dtype=np.float32) err, stream = cuda.cuStreamCreate(0) - ASSERT_DRV(err) + assert_drv(err) - (err,) = cuda.cuMemcpyHtoDAsync(dX, hX, bufferSize, stream) - ASSERT_DRV(err) - (err,) = cuda.cuMemcpyHtoDAsync(dY, hY, bufferSize, stream) - ASSERT_DRV(err) + (err,) = cuda.cuMemcpyHtoDAsync(d_x, h_x, buffer_size, stream) + assert_drv(err) + (err,) = cuda.cuMemcpyHtoDAsync(d_y, h_y, buffer_size, stream) + assert_drv(err) (err,) = cuda.cuStreamSynchronize(stream) - ASSERT_DRV(err) + assert_drv(err) # Assert values are different before running kernel - hZ = a * hX + hY - if np.allclose(hOut, hZ): + h_z = a * h_x + h_y + if np.allclose(h_out, h_z): raise ValueError("Error inside tolerence for host-device vectors") - arg_values = (a, dX, dY, dOut, n) + arg_values = (a, d_x, d_y, d_out, n) arg_types = (ctypes.c_float, None, None, None, ctypes.c_size_t) (err,) = cuda.cuLaunchKernel( kernel, - NUM_BLOCKS, + num_blocks, 1, 1, # grid dim - NUM_THREADS, + num_threads, 1, 1, # block dim 0, @@ -150,32 +148,32 @@ def main(): (arg_values, arg_types), 0, ) # arguments - ASSERT_DRV(err) + assert_drv(err) - (err,) = cuda.cuMemcpyDtoHAsync(hOut, dOut, bufferSize, stream) - ASSERT_DRV(err) + (err,) = cuda.cuMemcpyDtoHAsync(h_out, d_out, buffer_size, stream) + assert_drv(err) (err,) = cuda.cuStreamSynchronize(stream) - ASSERT_DRV(err) + assert_drv(err) # Assert values are same after running kernel - hZ = a * hX + hY - if not np.allclose(hOut, hZ): + h_z = a * h_x + h_y + if not np.allclose(h_out, h_z): raise ValueError("Error outside tolerence for host-device vectors") (err,) = cuda.cuStreamDestroy(stream) - ASSERT_DRV(err) + assert_drv(err) - (err,) = cuda.cuMemFree(dX) - 
ASSERT_DRV(err) - (err,) = cuda.cuMemFree(dY) - ASSERT_DRV(err) - (err,) = cuda.cuMemFree(dOut) - ASSERT_DRV(err) + (err,) = cuda.cuMemFree(d_x) + assert_drv(err) + (err,) = cuda.cuMemFree(d_y) + assert_drv(err) + (err,) = cuda.cuMemFree(d_out) + assert_drv(err) (err,) = cuda.cuModuleUnload(module) - ASSERT_DRV(err) + assert_drv(err) (err,) = cuda.cuCtxDestroy(context) - ASSERT_DRV(err) + assert_drv(err) if __name__ == "__main__": diff --git a/cuda_core/examples/cuda_graphs.py b/cuda_core/examples/cuda_graphs.py index 02d1b59ec1..c6233dd5d9 100644 --- a/cuda_core/examples/cuda_graphs.py +++ b/cuda_core/examples/cuda_graphs.py @@ -84,9 +84,9 @@ def main(): result3 = cp.empty_like(a) # Prepare launch configuration - block_size = 256 - grid_size = (size + block_size - 1) // block_size - config = LaunchConfig(grid=grid_size, block=block_size) + block = 256 + grid = (size + block - 1) // block + config = LaunchConfig(grid=grid, block=block) # Sync before graph capture dev.sync() diff --git a/cuda_core/examples/gl_interop_plasma.py b/cuda_core/examples/gl_interop_plasma.py index 7b8b43cd8d..46fa59ee3f 100644 --- a/cuda_core/examples/gl_interop_plasma.py +++ b/cuda_core/examples/gl_interop_plasma.py @@ -94,8 +94,8 @@ def setup_cuda(kernel_source): dev.set_current() stream = dev.create_stream() - opts = ProgramOptions(std="c++11", arch=f"sm_{dev.arch}") - prog = Program(kernel_source, code_type="c++", options=opts) + program_options = ProgramOptions(std="c++11", arch=f"sm_{dev.arch}") + prog = Program(kernel_source, code_type="c++", options=program_options) mod = prog.compile("cubin") kernel = mod.get_kernel("plasma") diff --git a/cuda_core/examples/pytorch_example.py b/cuda_core/examples/pytorch_example.py index 3919953eab..4e3bfcceb5 100644 --- a/cuda_core/examples/pytorch_example.py +++ b/cuda_core/examples/pytorch_example.py @@ -48,7 +48,7 @@ def __cuda_stream__(self): return (0, stream_id) # Return format required by CUDA Python -s = 
dev.create_stream(PyTorchStreamWrapper(pt_stream)) +stream = dev.create_stream(PyTorchStreamWrapper(pt_stream)) try: # prepare program @@ -61,7 +61,7 @@ def __cuda_stream__(self): ) # Run in single precision - ker = mod.get_kernel("saxpy_kernel") + kernel = mod.get_kernel("saxpy_kernel") dtype = torch.float32 # prepare input/output @@ -76,16 +76,16 @@ def __cuda_stream__(self): block = 32 grid = int((size + block - 1) // block) config = LaunchConfig(grid=grid, block=block) - ker_args = (a.data_ptr(), x.data_ptr(), y.data_ptr(), out.data_ptr(), size) + kernel_args = (a.data_ptr(), x.data_ptr(), y.data_ptr(), out.data_ptr(), size) # launch kernel on our stream - launch(s, config, ker, *ker_args) + launch(stream, config, kernel, *kernel_args) # check result assert torch.allclose(out, a.item() * x + y) # let's repeat again with double precision - ker = mod.get_kernel("saxpy_kernel") + kernel = mod.get_kernel("saxpy_kernel") dtype = torch.float64 # prepare input @@ -102,12 +102,12 @@ def __cuda_stream__(self): block = 64 grid = int((size + block - 1) // block) config = LaunchConfig(grid=grid, block=block) - ker_args = (a.data_ptr(), x.data_ptr(), y.data_ptr(), out.data_ptr(), size) + kernel_args = (a.data_ptr(), x.data_ptr(), y.data_ptr(), out.data_ptr(), size) # launch kernel on PyTorch's stream - launch(s, config, ker, *ker_args) + launch(stream, config, kernel, *kernel_args) # check result assert torch.allclose(out, a * x + y) finally: - s.close() + stream.close() diff --git a/cuda_core/examples/saxpy.py b/cuda_core/examples/saxpy.py index d7eb401ac3..548af802be 100644 --- a/cuda_core/examples/saxpy.py +++ b/cuda_core/examples/saxpy.py @@ -35,7 +35,7 @@ dev = Device() dev.set_current() -s = dev.create_stream() +stream = dev.create_stream() buf = None try: @@ -53,7 +53,7 @@ ) # run in single precision - ker = mod.get_kernel("saxpy") + kernel = mod.get_kernel("saxpy") dtype = cp.float32 # prepare input/output @@ -63,24 +63,24 @@ x = rng.random(size, dtype=dtype) y = 
rng.random(size, dtype=dtype) out = cp.empty_like(x) - dev.sync() # cupy runs on a different stream from s, so sync before accessing + dev.sync() # cupy runs on a different stream from stream, so sync before accessing # prepare launch block = 32 grid = int((size + block - 1) // block) config = LaunchConfig(grid=grid, block=block) - ker_args = (a, x.data.ptr, y.data.ptr, out.data.ptr, size) + kernel_args = (a, x.data.ptr, y.data.ptr, out.data.ptr, size) - # launch kernel on stream s - launch(s, config, ker, *ker_args) - s.sync() + # launch kernel on stream + launch(stream, config, kernel, *kernel_args) + stream.sync() # check result assert cp.allclose(out, a * x + y) # let's repeat again, this time allocates our own out buffer instead of cupy's # run in double precision - ker = mod.get_kernel("saxpy") + kernel = mod.get_kernel("saxpy") dtype = cp.float64 # prepare input @@ -93,18 +93,18 @@ # prepare output buf = dev.allocate( size * 8, # = dtype.itemsize - stream=s, + stream=stream, ) # prepare launch block = 64 grid = int((size + block - 1) // block) config = LaunchConfig(grid=grid, block=block) - ker_args = (a, x.data.ptr, y.data.ptr, buf, size) + kernel_args = (a, x.data.ptr, y.data.ptr, buf, size) - # launch kernel on stream s - launch(s, config, ker, *ker_args) - s.sync() + # launch kernel on stream + launch(stream, config, kernel, *kernel_args) + stream.sync() # check result # we wrap output buffer as a cupy array for simplicity @@ -115,5 +115,5 @@ finally: # cupy cleans up automatically the rest if buf is not None: - buf.close(s) - s.close() + buf.close(stream) + stream.close() diff --git a/cuda_core/examples/simple_multi_gpu_example.py b/cuda_core/examples/simple_multi_gpu_example.py index 0fbb4466bb..882ce8bbb3 100644 --- a/cuda_core/examples/simple_multi_gpu_example.py +++ b/cuda_core/examples/simple_multi_gpu_example.py @@ -13,7 +13,7 @@ import cupy as cp -from cuda.core import Device, LaunchConfig, Program, launch, system +from cuda.core import Device, 
LaunchConfig, Program, ProgramOptions, launch, system if system.get_num_devices() < 2: print("this example requires at least 2 GPUs", file=sys.stderr) @@ -56,9 +56,9 @@ def __cuda_stream__(self): } } """ - prog_add = Program(code_add, code_type="c++", options={"std": "c++17", "arch": f"sm_{dev0.arch}"}) + prog_add = Program(code_add, code_type="c++", options=ProgramOptions(std="c++17", arch=f"sm_{dev0.arch}")) mod_add = prog_add.compile("cubin") - ker_add = mod_add.get_kernel("vector_add") + add_kernel = mod_add.get_kernel("vector_add") # Set GPU 1 dev1 = Device(1) @@ -78,9 +78,9 @@ def __cuda_stream__(self): } } """ - prog_sub = Program(code_sub, code_type="c++", options={"std": "c++17", "arch": f"sm_{dev1.arch}"}) + prog_sub = Program(code_sub, code_type="c++", options=ProgramOptions(std="c++17", arch=f"sm_{dev1.arch}")) mod_sub = prog_sub.compile("cubin") - ker_sub = mod_sub.get_kernel("vector_sub") + sub_kernel = mod_sub.get_kernel("vector_sub") # Create launch configs for each kernel that will be executed on the respective # CUDA streams. @@ -103,7 +103,7 @@ def __cuda_stream__(self): stream0.wait(cp_stream0) # Launch the add kernel on GPU 0 / stream 0 - launch(stream0, config0, ker_add, a.data.ptr, b.data.ptr, c.data.ptr, cp.uint64(size)) + launch(stream0, config0, add_kernel, a.data.ptr, b.data.ptr, c.data.ptr, cp.uint64(size)) # Allocate memory on GPU 1 # Note: This runs on CuPy's current stream for GPU 1. 
@@ -118,7 +118,7 @@ def __cuda_stream__(self): stream1.wait(cp_stream1) # Launch the subtract kernel on GPU 1 / stream 1 - launch(stream1, config1, ker_sub, x.data.ptr, y.data.ptr, z.data.ptr, cp.uint64(size)) + launch(stream1, config1, sub_kernel, x.data.ptr, y.data.ptr, z.data.ptr, cp.uint64(size)) # Synchronize both GPUs are validate the results dev0.set_current() diff --git a/cuda_core/examples/strided_memory_view_gpu.py b/cuda_core/examples/strided_memory_view_gpu.py index d53c4278b2..9d4e4aacff 100644 --- a/cuda_core/examples/strided_memory_view_gpu.py +++ b/cuda_core/examples/strided_memory_view_gpu.py @@ -57,7 +57,7 @@ # We assume the 0-th argument supports either DLPack or CUDA Array Interface (both # of which are supported by StridedMemoryView). @args_viewable_as_strided_memory((0,)) -def my_func(arr, work_stream, gpu_ker): +def my_func(arr, work_stream, kernel): # Create a memory view over arr (assumed to be a 1D array of int32). The stream # ordering is taken care of, so that arr can be safely accessed on our work # stream (ordered after a data stream on which arr is potentially prepared). @@ -73,7 +73,7 @@ def my_func(arr, work_stream, gpu_ker): block = 256 grid = (size + block - 1) // block config = LaunchConfig(grid=grid, block=block) - launch(work_stream, config, gpu_ker, view.ptr, np.uint64(size)) + launch(work_stream, config, kernel, view.ptr, np.uint64(size)) # Here we're being conservative and synchronize over our work stream, # assuming we do not know the data stream; if we know then we could # just order the data stream after the work stream here, e.g. @@ -101,24 +101,24 @@ def run(): # To know the GPU's compute capability, we need to identify which GPU to use. 
dev = Device(0) dev.set_current() - gpu_prog = Program(gpu_code, code_type="c++", options=ProgramOptions(arch=f"sm_{dev.arch}", std="c++11")) - mod = gpu_prog.compile(target_type="cubin") - gpu_ker = mod.get_kernel(func_name) + prog = Program(gpu_code, code_type="c++", options=ProgramOptions(arch=f"sm_{dev.arch}", std="c++11")) + mod = prog.compile(target_type="cubin") + kernel = mod.get_kernel(func_name) - s = dev.create_stream() + stream = dev.create_stream() try: # Create input array on GPU arr_gpu = cp.ones(1024, dtype=cp.int32) print(f"before: {arr_gpu[:10]=}") # Run the workload - my_func(arr_gpu, s, gpu_ker) + my_func(arr_gpu, stream, kernel) # Check the result print(f"after: {arr_gpu[:10]=}") assert cp.allclose(arr_gpu, 1 + cp.arange(1024, dtype=cp.int32)) finally: - s.close() + stream.close() if __name__ == "__main__": diff --git a/cuda_core/examples/thread_block_cluster.py b/cuda_core/examples/thread_block_cluster.py index 5e36270eab..a5f50d4189 100644 --- a/cuda_core/examples/thread_block_cluster.py +++ b/cuda_core/examples/thread_block_cluster.py @@ -94,7 +94,7 @@ options=ProgramOptions(arch=f"sm_{arch}", std="c++17", include_path=include_path), ) mod = prog.compile(target_type="cubin") -ker = mod.get_kernel("check_cluster_info") +kernel = mod.get_kernel("check_cluster_info") # prepare launch config grid = 4 @@ -126,7 +126,7 @@ block_dims[:] = 0 # launch kernel on the default stream - launch(dev.default_stream, config, ker, grid_buffer, cluster_buffer, block_buffer) + launch(dev.default_stream, config, kernel, grid_buffer, cluster_buffer, block_buffer) dev.sync() # verify results diff --git a/cuda_core/examples/vector_add.py b/cuda_core/examples/vector_add.py index 4c645fc7dd..e648a3846f 100644 --- a/cuda_core/examples/vector_add.py +++ b/cuda_core/examples/vector_add.py @@ -30,7 +30,7 @@ dev = Device() dev.set_current() -s = dev.create_stream() +stream = dev.create_stream() try: # prepare program @@ -39,7 +39,7 @@ mod = prog.compile("cubin", 
name_expressions=("vector_add",)) # run in single precision - ker = mod.get_kernel("vector_add") + kernel = mod.get_kernel("vector_add") dtype = cp.float32 # prepare input/output @@ -49,7 +49,7 @@ b = rng.random(size, dtype=dtype) c = cp.empty_like(a) - # cupy runs on a different stream from s, so sync before accessing + # cupy runs on a different stream from stream, so sync before accessing dev.sync() # prepare launch @@ -57,11 +57,11 @@ grid = (size + block - 1) // block config = LaunchConfig(grid=grid, block=block) - # launch kernel on stream s - launch(s, config, ker, a.data.ptr, b.data.ptr, c.data.ptr, cp.uint64(size)) - s.sync() + # launch kernel on stream + launch(stream, config, kernel, a.data.ptr, b.data.ptr, c.data.ptr, cp.uint64(size)) + stream.sync() # check result assert cp.allclose(c, a + b) finally: - s.close() + stream.close() diff --git a/ruff.toml b/ruff.toml index 7f3853529e..76f548848c 100644 --- a/ruff.toml +++ b/ruff.toml @@ -123,13 +123,15 @@ inline-quotes = "double" ] # CUDA bindings mirror C API naming conventions (CamelCase types, camelCase functions) -"cuda_bindings/**" = [ +# Keep examples opted-in to enforce naming conventions in example-local identifiers. +"cuda_bindings/{benchmarks,cuda,docs,tests}/**" = [ "N801", # invalid-class-name "N802", # invalid-function-name "N803", # invalid-argument-name "N806", # non-lowercase-variable-in-function "N816", # mixed-case-variable-in-global-scope ] +"cuda_bindings/{build_hooks.py,setup.py}" = ["N801", "N802", "N803", "N806", "N816"] # scripts and build tooling — print is the expected output method "toolshed/**" = ["T201"]