diff --git a/cuda_bindings/examples/0_Introduction/clock_nvrtc_test.py b/cuda_bindings/examples/0_Introduction/clock_nvrtc_test.py index dc1084bea8..d67f180fe0 100644 --- a/cuda_bindings/examples/0_Introduction/clock_nvrtc_test.py +++ b/cuda_bindings/examples/0_Introduction/clock_nvrtc_test.py @@ -5,7 +5,7 @@ import numpy as np from common import common -from common.helper_cuda import checkCudaErrors, findCudaDevice +from common.helper_cuda import check_cuda_errors, find_cuda_device from cuda.bindings import driver as cuda @@ -50,8 +50,8 @@ } """ -NUM_BLOCKS = 64 -NUM_THREADS = 256 +num_blocks = 64 +num_threads = 256 def elems_to_bytes(nelems, dt): @@ -64,52 +64,52 @@ def main(): if platform.machine() == "armv7l": pytest.skip("clock_nvrtc is not supported on ARMv7") - timer = np.empty(NUM_BLOCKS * 2, dtype="int64") - hinput = np.empty(NUM_THREADS * 2, dtype="float32") + timer = np.empty(num_blocks * 2, dtype="int64") + hinput = np.empty(num_threads * 2, dtype="float32") - for i in range(NUM_THREADS * 2): + for i in range(num_threads * 2): hinput[i] = i - devID = findCudaDevice() - with common.KernelHelper(clock_nvrtc, devID) as kernelHelper: - kernel_addr = kernelHelper.getFunction(b"timedReduction") - - dinput = checkCudaErrors(cuda.cuMemAlloc(hinput.nbytes)) - doutput = checkCudaErrors(cuda.cuMemAlloc(elems_to_bytes(NUM_BLOCKS, np.float32))) - dtimer = checkCudaErrors(cuda.cuMemAlloc(timer.nbytes)) - checkCudaErrors(cuda.cuMemcpyHtoD(dinput, hinput, hinput.nbytes)) - - args = ((dinput, doutput, dtimer), (None, None, None)) - shared_memory_nbytes = elems_to_bytes(2 * NUM_THREADS, np.float32) - - grid_dims = (NUM_BLOCKS, 1, 1) - block_dims = (NUM_THREADS, 1, 1) - - checkCudaErrors( - cuda.cuLaunchKernel( - kernel_addr, - *grid_dims, # grid dim - *block_dims, # block dim - shared_memory_nbytes, - 0, # shared mem, stream - args, - 0, - ) - ) # arguments - - checkCudaErrors(cuda.cuCtxSynchronize()) - checkCudaErrors(cuda.cuMemcpyDtoH(timer, dtimer, timer.nbytes)) - 
checkCudaErrors(cuda.cuMemFree(dinput)) - checkCudaErrors(cuda.cuMemFree(doutput)) - checkCudaErrors(cuda.cuMemFree(dtimer)) - - avgElapsedClocks = 0.0 - - for i in range(NUM_BLOCKS): - avgElapsedClocks += timer[i + NUM_BLOCKS] - timer[i] - - avgElapsedClocks = avgElapsedClocks / NUM_BLOCKS - print(f"Average clocks/block = {avgElapsedClocks}") + dev_id = find_cuda_device() + kernel_helper = common.KernelHelper(clock_nvrtc, dev_id) + kernel_addr = kernel_helper.get_function(b"timedReduction") + + dinput = check_cuda_errors(cuda.cuMemAlloc(hinput.nbytes)) + doutput = check_cuda_errors(cuda.cuMemAlloc(elems_to_bytes(num_blocks, np.float32))) + dtimer = check_cuda_errors(cuda.cuMemAlloc(timer.nbytes)) + check_cuda_errors(cuda.cuMemcpyHtoD(dinput, hinput, hinput.nbytes)) + + args = ((dinput, doutput, dtimer), (None, None, None)) + shared_memory_nbytes = elems_to_bytes(2 * num_threads, np.float32) + + grid_dims = (num_blocks, 1, 1) + block_dims = (num_threads, 1, 1) + + check_cuda_errors( + cuda.cuLaunchKernel( + kernel_addr, + *grid_dims, # grid dim + *block_dims, # block dim + shared_memory_nbytes, + 0, # shared mem, stream + args, + 0, + ) + ) # arguments + + check_cuda_errors(cuda.cuCtxSynchronize()) + check_cuda_errors(cuda.cuMemcpyDtoH(timer, dtimer, timer.nbytes)) + check_cuda_errors(cuda.cuMemFree(dinput)) + check_cuda_errors(cuda.cuMemFree(doutput)) + check_cuda_errors(cuda.cuMemFree(dtimer)) + + avg_elapsed_clocks = 0.0 + + for i in range(num_blocks): + avg_elapsed_clocks += timer[i + num_blocks] - timer[i] + + avg_elapsed_clocks = avg_elapsed_clocks / num_blocks + print(f"Average clocks/block = {avg_elapsed_clocks}") if __name__ == "__main__": diff --git a/cuda_bindings/examples/0_Introduction/simpleCubemapTexture_test.py b/cuda_bindings/examples/0_Introduction/simpleCubemapTexture_test.py index 75f1b0800d..5d764509ce 100644 --- a/cuda_bindings/examples/0_Introduction/simpleCubemapTexture_test.py +++ 
b/cuda_bindings/examples/0_Introduction/simpleCubemapTexture_test.py @@ -7,12 +7,12 @@ import numpy as np from common import common -from common.helper_cuda import checkCudaErrors, findCudaDevice +from common.helper_cuda import check_cuda_errors, find_cuda_device from cuda.bindings import driver as cuda from cuda.bindings import runtime as cudart -simpleCubemapTexture = """\ +simple_cubemap_texture = """\ extern "C" __global__ void transformKernel(float *g_odata, int width, cudaTextureObject_t tex) { @@ -83,14 +83,14 @@ def main(): # Use command-line specified CUDA device, otherwise use device with highest Gflops/s - devID = findCudaDevice() + dev_id = find_cuda_device() # Get number of SMs on this GPU - deviceProps = checkCudaErrors(cudart.cudaGetDeviceProperties(devID)) + device_props = check_cuda_errors(cudart.cudaGetDeviceProperties(dev_id)) print( - f"CUDA device [{deviceProps.name}] has {deviceProps.multiProcessorCount} Multi-Processors SM {deviceProps.major}.{deviceProps.minor}" + f"CUDA device [{device_props.name}] has {device_props.multiProcessorCount} Multi-Processors SM {device_props.major}.{device_props.minor}" ) - if deviceProps.major < 2: + if device_props.major < 2: import pytest pytest.skip("Test requires SM 2.0 or higher for support of Texture Arrays.") @@ -107,15 +107,15 @@ def main(): h_data_ref = np.repeat(np.arange(num_layers, dtype=h_data.dtype), cubemap_size) - h_data # Allocate device memory for result - d_data = checkCudaErrors(cudart.cudaMalloc(size)) + d_data = check_cuda_errors(cudart.cudaMalloc(size)) # Allocate array and copy image data - channelDesc = checkCudaErrors( + channel_desc = check_cuda_errors( cudart.cudaCreateChannelDesc(32, 0, 0, 0, cudart.cudaChannelFormatKind.cudaChannelFormatKindFloat) ) - cu_3darray = checkCudaErrors( + cu_3darray = check_cuda_errors( cudart.cudaMalloc3DArray( - channelDesc, + channel_desc, cudart.make_cudaExtent(width, width, num_faces), cudart.cudaArrayCubemap, ) @@ -128,90 +128,90 @@ def main(): 
myparms.dstArray = cu_3darray myparms.extent = cudart.make_cudaExtent(width, width, num_faces) myparms.kind = cudart.cudaMemcpyKind.cudaMemcpyHostToDevice - checkCudaErrors(cudart.cudaMemcpy3D(myparms)) - - texRes = cudart.cudaResourceDesc() - texRes.resType = cudart.cudaResourceType.cudaResourceTypeArray - texRes.res.array.array = cu_3darray - - texDescr = cudart.cudaTextureDesc() - texDescr.normalizedCoords = True - texDescr.filterMode = cudart.cudaTextureFilterMode.cudaFilterModeLinear - texDescr.addressMode[0] = cudart.cudaTextureAddressMode.cudaAddressModeWrap - texDescr.addressMode[1] = cudart.cudaTextureAddressMode.cudaAddressModeWrap - texDescr.addressMode[2] = cudart.cudaTextureAddressMode.cudaAddressModeWrap - texDescr.readMode = cudart.cudaTextureReadMode.cudaReadModeElementType - - tex = checkCudaErrors(cudart.cudaCreateTextureObject(texRes, texDescr, None)) - dimBlock = cudart.dim3() - dimBlock.x = 8 - dimBlock.y = 8 - dimBlock.z = 1 - dimGrid = cudart.dim3() - dimGrid.x = width / dimBlock.x - dimGrid.y = width / dimBlock.y - dimGrid.z = 1 + check_cuda_errors(cudart.cudaMemcpy3D(myparms)) + + tex_res = cudart.cudaResourceDesc() + tex_res.resType = cudart.cudaResourceType.cudaResourceTypeArray + tex_res.res.array.array = cu_3darray + + tex_descr = cudart.cudaTextureDesc() + tex_descr.normalizedCoords = True + tex_descr.filterMode = cudart.cudaTextureFilterMode.cudaFilterModeLinear + tex_descr.addressMode[0] = cudart.cudaTextureAddressMode.cudaAddressModeWrap + tex_descr.addressMode[1] = cudart.cudaTextureAddressMode.cudaAddressModeWrap + tex_descr.addressMode[2] = cudart.cudaTextureAddressMode.cudaAddressModeWrap + tex_descr.readMode = cudart.cudaTextureReadMode.cudaReadModeElementType + + tex = check_cuda_errors(cudart.cudaCreateTextureObject(tex_res, tex_descr, None)) + dim_block = cudart.dim3() + dim_block.x = 8 + dim_block.y = 8 + dim_block.z = 1 + dim_grid = cudart.dim3() + dim_grid.x = width / dim_block.x + dim_grid.y = width / dim_block.y + 
dim_grid.z = 1 print( - f"Covering Cubemap data array of {width}~3 x {num_layers}: Grid size is {dimGrid.x} x {dimGrid.y}, each block has 8 x 8 threads" + f"Covering Cubemap data array of {width}~3 x {num_layers}: Grid size is {dim_grid.x} x {dim_grid.y}, each block has 8 x 8 threads" ) - with common.KernelHelper(simpleCubemapTexture, devID) as kernelHelper: - _transformKernel = kernelHelper.getFunction(b"transformKernel") - kernelArgs = ((d_data, width, tex), (ctypes.c_void_p, ctypes.c_int, None)) - checkCudaErrors( - cuda.cuLaunchKernel( - _transformKernel, - dimGrid.x, - dimGrid.y, - dimGrid.z, # grid dim - dimBlock.x, - dimBlock.y, - dimBlock.z, # block dim - 0, - 0, # shared mem and stream - kernelArgs, - 0, - ) - ) # arguments - - checkCudaErrors(cudart.cudaDeviceSynchronize()) - - start = time.time() - - # Execute the kernel - checkCudaErrors( - cuda.cuLaunchKernel( - _transformKernel, - dimGrid.x, - dimGrid.y, - dimGrid.z, # grid dim - dimBlock.x, - dimBlock.y, - dimBlock.z, # block dim - 0, - 0, # shared mem and stream - kernelArgs, - 0, - ) - ) # arguments - - checkCudaErrors(cudart.cudaDeviceSynchronize()) - stop = time.time() - print(f"Processing time: {stop - start:.3f} msec") - print(f"{cubemap_size / ((stop - start + 1) / 1000.0) / 1e6:.2f} Mtexlookups/sec") - - # Allocate mem for the result on host side - h_odata = np.empty_like(h_data) - # Copy result from device to host - checkCudaErrors(cudart.cudaMemcpy(h_odata, d_data, size, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost)) - - checkCudaErrors(cudart.cudaDestroyTextureObject(tex)) - checkCudaErrors(cudart.cudaFree(d_data)) - checkCudaErrors(cudart.cudaFreeArray(cu_3darray)) - - MIN_EPSILON_ERROR = 5.0e-3 - if np.max(np.abs(h_odata - h_data_ref)) > MIN_EPSILON_ERROR: + kernel_helper = common.KernelHelper(simple_cubemap_texture, dev_id) + _transform_kernel = kernel_helper.get_function(b"transformKernel") + kernel_args = ((d_data, width, tex), (ctypes.c_void_p, ctypes.c_int, None)) + 
check_cuda_errors( + cuda.cuLaunchKernel( + _transform_kernel, + dim_grid.x, + dim_grid.y, + dim_grid.z, # grid dim + dim_block.x, + dim_block.y, + dim_block.z, # block dim + 0, + 0, # shared mem and stream + kernel_args, + 0, + ) + ) # arguments + + check_cuda_errors(cudart.cudaDeviceSynchronize()) + + start = time.time() + + # Execute the kernel + check_cuda_errors( + cuda.cuLaunchKernel( + _transform_kernel, + dim_grid.x, + dim_grid.y, + dim_grid.z, # grid dim + dim_block.x, + dim_block.y, + dim_block.z, # block dim + 0, + 0, # shared mem and stream + kernel_args, + 0, + ) + ) # arguments + + check_cuda_errors(cudart.cudaDeviceSynchronize()) + stop = time.time() + print(f"Processing time: {stop - start:.3f} msec") + print(f"{cubemap_size / ((stop - start + 1) / 1000.0) / 1e6:.2f} Mtexlookups/sec") + + # Allocate mem for the result on host side + h_odata = np.empty_like(h_data) + # Copy result from device to host + check_cuda_errors(cudart.cudaMemcpy(h_odata, d_data, size, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost)) + + check_cuda_errors(cudart.cudaDestroyTextureObject(tex)) + check_cuda_errors(cudart.cudaFree(d_data)) + check_cuda_errors(cudart.cudaFreeArray(cu_3darray)) + + min_epsilon_error = 5.0e-3 + if np.max(np.abs(h_odata - h_data_ref)) > min_epsilon_error: print("Failed", file=sys.stderr) sys.exit(1) diff --git a/cuda_bindings/examples/0_Introduction/simpleP2P_test.py b/cuda_bindings/examples/0_Introduction/simpleP2P_test.py index a60dbac5bc..09dafa1be1 100644 --- a/cuda_bindings/examples/0_Introduction/simpleP2P_test.py +++ b/cuda_bindings/examples/0_Introduction/simpleP2P_test.py @@ -7,7 +7,7 @@ import numpy as np from common import common -from common.helper_cuda import checkCudaErrors +from common.helper_cuda import check_cuda_errors from cuda.bindings import driver as cuda from cuda.bindings import runtime as cudart @@ -41,24 +41,24 @@ def main(): # Number of GPUs print("Checking for multiple GPUs...") - gpu_n = 
checkCudaErrors(cudart.cudaGetDeviceCount()) + gpu_n = check_cuda_errors(cudart.cudaGetDeviceCount()) print(f"CUDA-capable device count: {gpu_n}") if gpu_n < 2: pytest.skip("Two or more GPUs with Peer-to-Peer access capability are required") - prop = [checkCudaErrors(cudart.cudaGetDeviceProperties(i)) for i in range(gpu_n)] + prop = [check_cuda_errors(cudart.cudaGetDeviceProperties(i)) for i in range(gpu_n)] # Check possibility for peer access print("\nChecking GPU(s) for support of peer to peer memory access...") - p2pCapableGPUs = [-1, -1] + p2p_capable_gpus = [-1, -1] for i in range(gpu_n): - p2pCapableGPUs[0] = i + p2p_capable_gpus[0] = i for j in range(gpu_n): if i == j: continue - i_access_j = checkCudaErrors(cudart.cudaDeviceCanAccessPeer(i, j)) - j_access_i = checkCudaErrors(cudart.cudaDeviceCanAccessPeer(j, i)) + i_access_j = check_cuda_errors(cudart.cudaDeviceCanAccessPeer(i, j)) + j_access_i = check_cuda_errors(cudart.cudaDeviceCanAccessPeer(j, i)) print( "> Peer access from {} (GPU{}) -> {} (GPU{}) : {}\n".format( prop[i].name, i, prop[j].name, j, "Yes" if i_access_j else "No" ) ) @@ -70,54 +70,54 @@ def main(): ) ) if i_access_j and j_access_i: - p2pCapableGPUs[1] = j + p2p_capable_gpus[1] = j break - if p2pCapableGPUs[1] != -1: + if p2p_capable_gpus[1] != -1: break - if p2pCapableGPUs[0] == -1 or p2pCapableGPUs[1] == -1: + if p2p_capable_gpus[0] == -1 or p2p_capable_gpus[1] == -1: pytest.skip("Peer to Peer access is not available amongst GPUs in the system") # Use first pair of p2p capable GPUs detected - gpuid = [p2pCapableGPUs[0], p2pCapableGPUs[1]] + gpuid = [p2p_capable_gpus[0], p2p_capable_gpus[1]] # Enable peer access print(f"Enabling peer access between GPU{gpuid[0]} and GPU{gpuid[1]}...") - checkCudaErrors(cudart.cudaSetDevice(gpuid[0])) - checkCudaErrors(cudart.cudaDeviceEnablePeerAccess(gpuid[1], 0)) - checkCudaErrors(cudart.cudaSetDevice(gpuid[1])) - checkCudaErrors(cudart.cudaDeviceEnablePeerAccess(gpuid[0], 0)) + 
check_cuda_errors(cudart.cudaSetDevice(gpuid[0])) + check_cuda_errors(cudart.cudaDeviceEnablePeerAccess(gpuid[1], 0)) + check_cuda_errors(cudart.cudaSetDevice(gpuid[1])) + check_cuda_errors(cudart.cudaDeviceEnablePeerAccess(gpuid[0], 0)) # Allocate buffers buf_size = 1024 * 1024 * 16 * np.dtype(np.float32).itemsize print(f"Allocating buffers ({int(buf_size / 1024 / 1024)}MB on GPU{gpuid[0]}, GPU{gpuid[1]} and CPU Host)...") - checkCudaErrors(cudart.cudaSetDevice(gpuid[0])) - g0 = checkCudaErrors(cudart.cudaMalloc(buf_size)) - checkCudaErrors(cudart.cudaSetDevice(gpuid[1])) - g1 = checkCudaErrors(cudart.cudaMalloc(buf_size)) - h0 = checkCudaErrors(cudart.cudaMallocHost(buf_size)) # Automatically portable with UVA + check_cuda_errors(cudart.cudaSetDevice(gpuid[0])) + g0 = check_cuda_errors(cudart.cudaMalloc(buf_size)) + check_cuda_errors(cudart.cudaSetDevice(gpuid[1])) + g1 = check_cuda_errors(cudart.cudaMalloc(buf_size)) + h0 = check_cuda_errors(cudart.cudaMallocHost(buf_size)) # Automatically portable with UVA # Create CUDA event handles print("Creating event handles...") eventflags = cudart.cudaEventBlockingSync - start_event = checkCudaErrors(cudart.cudaEventCreateWithFlags(eventflags)) - stop_event = checkCudaErrors(cudart.cudaEventCreateWithFlags(eventflags)) + start_event = check_cuda_errors(cudart.cudaEventCreateWithFlags(eventflags)) + stop_event = check_cuda_errors(cudart.cudaEventCreateWithFlags(eventflags)) # P2P memcopy() benchmark - checkCudaErrors(cudart.cudaEventRecord(start_event, cudart.cudaStream_t(0))) + check_cuda_errors(cudart.cudaEventRecord(start_event, cudart.cudaStream_t(0))) for i in range(100): # With UVA we don't need to specify source and target devices, the # runtime figures this out by itself from the pointers # Ping-pong copy between GPUs if i % 2 == 0: - checkCudaErrors(cudart.cudaMemcpy(g1, g0, buf_size, cudart.cudaMemcpyKind.cudaMemcpyDefault)) + check_cuda_errors(cudart.cudaMemcpy(g1, g0, buf_size, 
cudart.cudaMemcpyKind.cudaMemcpyDefault)) else: - checkCudaErrors(cudart.cudaMemcpy(g0, g1, buf_size, cudart.cudaMemcpyKind.cudaMemcpyDefault)) + check_cuda_errors(cudart.cudaMemcpy(g0, g1, buf_size, cudart.cudaMemcpyKind.cudaMemcpyDefault)) - checkCudaErrors(cudart.cudaEventRecord(stop_event, cudart.cudaStream_t(0))) - checkCudaErrors(cudart.cudaEventSynchronize(stop_event)) - time_memcpy = checkCudaErrors(cudart.cudaEventElapsedTime(start_event, stop_event)) + check_cuda_errors(cudart.cudaEventRecord(stop_event, cudart.cudaStream_t(0))) + check_cuda_errors(cudart.cudaEventSynchronize(stop_event)) + time_memcpy = check_cuda_errors(cudart.cudaEventElapsedTime(start_event, stop_event)) print( f"cudaMemcpyPeer / cudaMemcpy between GPU{gpuid[0]} and GPU{gpuid[1]}: {(1.0 / (time_memcpy / 1000.0)) * (100.0 * buf_size) / 1024.0 / 1024.0 / 1024.0:.2f}GB/s" ) @@ -129,8 +129,8 @@ def main(): for i in range(int(buf_size / np.dtype(np.float32).itemsize)): h0_local[i] = i % 4096 - checkCudaErrors(cudart.cudaSetDevice(gpuid[0])) - checkCudaErrors(cudart.cudaMemcpy(g0, h0, buf_size, cudart.cudaMemcpyKind.cudaMemcpyDefault)) + check_cuda_errors(cudart.cudaSetDevice(gpuid[0])) + check_cuda_errors(cudart.cudaMemcpy(g0, h0, buf_size, cudart.cudaMemcpyKind.cudaMemcpyDefault)) # Kernel launch configuration threads = cudart.dim3() @@ -145,57 +145,61 @@ def main(): # Run kernel on GPU 1, reading input from the GPU 0 buffer, writing # output to the GPU 1 buffer print(f"Run kernel on GPU{gpuid[1]}, taking source data from GPU{gpuid[0]} and writing to GPU{gpuid[1]}...") - checkCudaErrors(cudart.cudaSetDevice(gpuid[1])) - - with common.KernelHelper(simplep2p, gpuid[1]) as kernelHelper: - simple_kernel_1 = kernelHelper.getFunction(b"SimpleKernel") - kernel_args_1 = ((g0, g1), (ctypes.c_void_p, ctypes.c_void_p)) - checkCudaErrors( - cuda.cuLaunchKernel( - simple_kernel_1, - blocks.x, - blocks.y, - blocks.z, - threads.x, - threads.y, - threads.z, - 0, - 0, - kernel_args_1, - 0, - ) + 
check_cuda_errors(cudart.cudaSetDevice(gpuid[1])) + + kernel_helper = [None] * 2 + _simple_kernel = [None] * 2 + kernel_args = [None] * 2 + + kernel_helper[1] = common.KernelHelper(simplep2p, gpuid[1]) + _simple_kernel[1] = kernel_helper[1].get_function(b"SimpleKernel") + kernel_args[1] = ((g0, g1), (ctypes.c_void_p, ctypes.c_void_p)) + check_cuda_errors( + cuda.cuLaunchKernel( + _simple_kernel[1], + blocks.x, + blocks.y, + blocks.z, + threads.x, + threads.y, + threads.z, + 0, + 0, + kernel_args[1], + 0, ) + ) - checkCudaErrors(cudart.cudaDeviceSynchronize()) + check_cuda_errors(cudart.cudaDeviceSynchronize()) # Run kernel on GPU 0, reading input from the GPU 1 buffer, writing # output to the GPU 0 buffer print(f"Run kernel on GPU{gpuid[0]}, taking source data from GPU{gpuid[1]} and writing to GPU{gpuid[0]}...") - checkCudaErrors(cudart.cudaSetDevice(gpuid[0])) - with common.KernelHelper(simplep2p, gpuid[0]) as kernelHelper: - simple_kernel_0 = kernelHelper.getFunction(b"SimpleKernel") - kernel_args_0 = ((g1, g0), (ctypes.c_void_p, ctypes.c_void_p)) - checkCudaErrors( - cuda.cuLaunchKernel( - simple_kernel_0, - blocks.x, - blocks.y, - blocks.z, - threads.x, - threads.y, - threads.z, - 0, - 0, - kernel_args_0, - 0, - ) + check_cuda_errors(cudart.cudaSetDevice(gpuid[0])) + kernel_helper[0] = common.KernelHelper(simplep2p, gpuid[0]) + _simple_kernel[0] = kernel_helper[0].get_function(b"SimpleKernel") + kernel_args[0] = ((g1, g0), (ctypes.c_void_p, ctypes.c_void_p)) + check_cuda_errors( + cuda.cuLaunchKernel( + _simple_kernel[0], + blocks.x, + blocks.y, + blocks.z, + threads.x, + threads.y, + threads.z, + 0, + 0, + kernel_args[0], + 0, ) + ) - checkCudaErrors(cudart.cudaDeviceSynchronize()) + check_cuda_errors(cudart.cudaDeviceSynchronize()) # Copy data back to host and verify print(f"Copy data back to host from GPU{gpuid[0]} and verify results...") - checkCudaErrors(cudart.cudaMemcpy(h0, g0, buf_size, cudart.cudaMemcpyKind.cudaMemcpyDefault)) + 
check_cuda_errors(cudart.cudaMemcpy(h0, g0, buf_size, cudart.cudaMemcpyKind.cudaMemcpyDefault)) error_count = 0 @@ -210,23 +214,23 @@ def main(): # Disable peer access (also unregisters memory for non-UVA cases) print("Disabling peer access...") - checkCudaErrors(cudart.cudaSetDevice(gpuid[0])) - checkCudaErrors(cudart.cudaDeviceDisablePeerAccess(gpuid[1])) - checkCudaErrors(cudart.cudaSetDevice(gpuid[1])) - checkCudaErrors(cudart.cudaDeviceDisablePeerAccess(gpuid[0])) + check_cuda_errors(cudart.cudaSetDevice(gpuid[0])) + check_cuda_errors(cudart.cudaDeviceDisablePeerAccess(gpuid[1])) + check_cuda_errors(cudart.cudaSetDevice(gpuid[1])) + check_cuda_errors(cudart.cudaDeviceDisablePeerAccess(gpuid[0])) # Cleanup and shutdown print("Shutting down...") - checkCudaErrors(cudart.cudaEventDestroy(start_event)) - checkCudaErrors(cudart.cudaEventDestroy(stop_event)) - checkCudaErrors(cudart.cudaSetDevice(gpuid[0])) - checkCudaErrors(cudart.cudaFree(g0)) - checkCudaErrors(cudart.cudaSetDevice(gpuid[1])) - checkCudaErrors(cudart.cudaFree(g1)) - checkCudaErrors(cudart.cudaFreeHost(h0)) + check_cuda_errors(cudart.cudaEventDestroy(start_event)) + check_cuda_errors(cudart.cudaEventDestroy(stop_event)) + check_cuda_errors(cudart.cudaSetDevice(gpuid[0])) + check_cuda_errors(cudart.cudaFree(g0)) + check_cuda_errors(cudart.cudaSetDevice(gpuid[1])) + check_cuda_errors(cudart.cudaFree(g1)) + check_cuda_errors(cudart.cudaFreeHost(h0)) for i in range(gpu_n): - checkCudaErrors(cudart.cudaSetDevice(i)) + check_cuda_errors(cudart.cudaSetDevice(i)) if error_count != 0: print("Test failed!", file=sys.stderr) diff --git a/cuda_bindings/examples/0_Introduction/simpleZeroCopy_test.py b/cuda_bindings/examples/0_Introduction/simpleZeroCopy_test.py index ea64017b95..d4bf44e19a 100644 --- a/cuda_bindings/examples/0_Introduction/simpleZeroCopy_test.py +++ b/cuda_bindings/examples/0_Introduction/simpleZeroCopy_test.py @@ -9,13 +9,13 @@ import numpy as np from common import common -from 
common.helper_cuda import checkCudaErrors -from common.helper_string import checkCmdLineFlag, getCmdLineArgumentInt +from common.helper_cuda import check_cuda_errors +from common.helper_string import check_cmd_line_flag, get_cmd_line_argument_int from cuda.bindings import driver as cuda from cuda.bindings import runtime as cudart -simpleZeroCopy = """\ +simple_zero_copy = """\ extern "C" __global__ void vectorAddGPU(float *a, float *b, float *c, int N) { @@ -31,7 +31,7 @@ def main(): idev = 0 - bPinGenericMemory = False + b_pin_generic_memory = False import pytest @@ -47,7 +47,7 @@ def main(): if platform.machine() == "sbsa": pytest.skip("simpleZeroCopy is not supported on sbsa") - if checkCmdLineFlag("help"): + if check_cmd_line_flag("help"): print("Usage: simpleZeroCopy [OPTION]\n", file=sys.stderr) print("Options:", file=sys.stderr) print(" device=[device #] Specify the device to be used", file=sys.stderr) @@ -55,50 +55,50 @@ def main(): sys.exit(1) # Get the device selected by the user or default to 0, and then set it. - if checkCmdLineFlag("device="): - deviceCount = cudart.cudaGetDeviceCount() - idev = int(getCmdLineArgumentInt("device=")) + if check_cmd_line_flag("device="): + device_count = cudart.cudaGetDeviceCount() + idev = int(get_cmd_line_argument_int("device=")) - if idev >= deviceCount or idev < 0: + if idev >= device_count or idev < 0: print(f"Device number {idev} is invalid, will use default CUDA device 0.") idev = 0 - if checkCmdLineFlag("use_generic_memory"): - bPinGenericMemory = True + if check_cmd_line_flag("use_generic_memory"): + b_pin_generic_memory = True - if bPinGenericMemory: + if b_pin_generic_memory: print("> Using Generic System Paged Memory (malloc)") else: print("> Using CUDA Host Allocated (cudaHostAlloc)") - checkCudaErrors(cudart.cudaSetDevice(idev)) + check_cuda_errors(cudart.cudaSetDevice(idev)) # Verify the selected device supports mapped memory and set the device flags for mapping host memory. 
- deviceProp = checkCudaErrors(cudart.cudaGetDeviceProperties(idev)) + device_prop = check_cuda_errors(cudart.cudaGetDeviceProperties(idev)) - if not deviceProp.canMapHostMemory: + if not device_prop.canMapHostMemory: pytest.skip(f"Device {idev} does not support mapping CPU host memory!") - checkCudaErrors(cudart.cudaSetDeviceFlags(cudart.cudaDeviceMapHost)) + check_cuda_errors(cudart.cudaSetDeviceFlags(cudart.cudaDeviceMapHost)) # Allocate mapped CPU memory nelem = 1048576 num_bytes = nelem * np.dtype(np.float32).itemsize - if bPinGenericMemory: + if b_pin_generic_memory: a = np.empty(nelem, dtype=np.float32) b = np.empty(nelem, dtype=np.float32) c = np.empty(nelem, dtype=np.float32) - checkCudaErrors(cudart.cudaHostRegister(a, num_bytes, cudart.cudaHostRegisterMapped)) - checkCudaErrors(cudart.cudaHostRegister(b, num_bytes, cudart.cudaHostRegisterMapped)) - checkCudaErrors(cudart.cudaHostRegister(c, num_bytes, cudart.cudaHostRegisterMapped)) + check_cuda_errors(cudart.cudaHostRegister(a, num_bytes, cudart.cudaHostRegisterMapped)) + check_cuda_errors(cudart.cudaHostRegister(b, num_bytes, cudart.cudaHostRegisterMapped)) + check_cuda_errors(cudart.cudaHostRegister(c, num_bytes, cudart.cudaHostRegisterMapped)) else: flags = cudart.cudaHostAllocMapped - a_ptr = checkCudaErrors(cudart.cudaHostAlloc(num_bytes, flags)) - b_ptr = checkCudaErrors(cudart.cudaHostAlloc(num_bytes, flags)) - c_ptr = checkCudaErrors(cudart.cudaHostAlloc(num_bytes, flags)) + a_ptr = check_cuda_errors(cudart.cudaHostAlloc(num_bytes, flags)) + b_ptr = check_cuda_errors(cudart.cudaHostAlloc(num_bytes, flags)) + c_ptr = check_cuda_errors(cudart.cudaHostAlloc(num_bytes, flags)) a = (ctypes.c_float * nelem).from_address(a_ptr) b = (ctypes.c_float * nelem).from_address(b_ptr) @@ -110,9 +110,9 @@ def main(): b[n] = rnd.random() # Get the device pointers for the pinned CPU memory mapped into the GPU memory space - d_a = checkCudaErrors(cudart.cudaHostGetDevicePointer(a, 0)) - d_b = 
checkCudaErrors(cudart.cudaHostGetDevicePointer(b, 0)) - d_c = checkCudaErrors(cudart.cudaHostGetDevicePointer(c, 0)) + d_a = check_cuda_errors(cudart.cudaHostGetDevicePointer(a, 0)) + d_b = check_cuda_errors(cudart.cudaHostGetDevicePointer(b, 0)) + d_c = check_cuda_errors(cudart.cudaHostGetDevicePointer(c, 0)) # Call the GPU kernel using the CPU pointers residing in CPU mapped memory print("> vectorAddGPU kernel will add vectors using mapped CPU memory...") @@ -124,57 +124,57 @@ def main(): grid.x = math.ceil(nelem / float(block.x)) grid.y = 1 grid.z = 1 - with common.KernelHelper(simpleZeroCopy, idev) as kernelHelper: - _vectorAddGPU = kernelHelper.getFunction(b"vectorAddGPU") - kernelArgs = ( - (d_a, d_b, d_c, nelem), - (ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int), + kernel_helper = common.KernelHelper(simple_zero_copy, idev) + _vector_add_gpu = kernel_helper.get_function(b"vectorAddGPU") + kernel_args = ( + (d_a, d_b, d_c, nelem), + (ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int), + ) + check_cuda_errors( + cuda.cuLaunchKernel( + _vector_add_gpu, + grid.x, + grid.y, + grid.z, + block.x, + block.y, + block.z, + 0, + cuda.CU_STREAM_LEGACY, + kernel_args, + 0, ) - checkCudaErrors( - cuda.cuLaunchKernel( - _vectorAddGPU, - grid.x, - grid.y, - grid.z, - block.x, - block.y, - block.z, - 0, - cuda.CU_STREAM_LEGACY, - kernelArgs, - 0, - ) - ) - checkCudaErrors(cudart.cudaDeviceSynchronize()) + ) + check_cuda_errors(cudart.cudaDeviceSynchronize()) print("> Checking the results from vectorAddGPU() ...") # Compare the results - errorNorm = 0.0 - refNorm = 0.0 + error_norm = 0.0 + ref_norm = 0.0 for n in range(nelem): ref = a[n] + b[n] diff = c[n] - ref - errorNorm += diff * diff - refNorm += ref * ref + error_norm += diff * diff + ref_norm += ref * ref - errorNorm = math.sqrt(errorNorm) - refNorm = math.sqrt(refNorm) + error_norm = math.sqrt(error_norm) + ref_norm = math.sqrt(ref_norm) # Memory clean up print("Releasing CPU 
memory...") - if bPinGenericMemory: - checkCudaErrors(cudart.cudaHostUnregister(a)) - checkCudaErrors(cudart.cudaHostUnregister(b)) - checkCudaErrors(cudart.cudaHostUnregister(c)) + if b_pin_generic_memory: + check_cuda_errors(cudart.cudaHostUnregister(a)) + check_cuda_errors(cudart.cudaHostUnregister(b)) + check_cuda_errors(cudart.cudaHostUnregister(c)) else: - checkCudaErrors(cudart.cudaFreeHost(a)) - checkCudaErrors(cudart.cudaFreeHost(b)) - checkCudaErrors(cudart.cudaFreeHost(c)) + check_cuda_errors(cudart.cudaFreeHost(a)) + check_cuda_errors(cudart.cudaFreeHost(b)) + check_cuda_errors(cudart.cudaFreeHost(c)) - if errorNorm / refNorm >= 1.0e-7: + if error_norm / ref_norm >= 1.0e-7: print("FAILED", file=sys.stderr) sys.exit(1) diff --git a/cuda_bindings/examples/0_Introduction/systemWideAtomics_test.py b/cuda_bindings/examples/0_Introduction/systemWideAtomics_test.py index df52462854..94a356101f 100644 --- a/cuda_bindings/examples/0_Introduction/systemWideAtomics_test.py +++ b/cuda_bindings/examples/0_Introduction/systemWideAtomics_test.py @@ -7,12 +7,12 @@ import numpy as np from common import common -from common.helper_cuda import checkCudaErrors, findCudaDevice +from common.helper_cuda import check_cuda_errors, find_cuda_device from cuda.bindings import driver as cuda from cuda.bindings import runtime as cudart -systemWideAtomics = """\ +system_wide_atomics = """\ #define LOOP_NUM 50 extern "C" @@ -63,21 +63,21 @@ #! @param reference reference data, computed but preallocated #! @param idata input data as provided to device #! 
@param len number of elements in reference / idata -def verify(testData, length): +def verify(test_data, length): val = 0 for i in range(length * LOOP_NUM): val += 10 - if val != testData[0]: - print(f"atomicAdd failed val = {val} testData = {testData[0]}") + if val != test_data[0]: + print(f"atomicAdd failed val = {val} test_data = {test_data[0]}") return False val = 0 found = False for i in range(length): # second element should be a member of [0, len) - if i == testData[1]: + if i == test_data[1]: found = True break @@ -91,7 +91,7 @@ def verify(testData, length): # third element should be len-1 val = max(val, i) - if val != testData[2]: + if val != test_data[2]: print("atomicMax failed") return False @@ -100,7 +100,7 @@ def verify(testData, length): for i in range(length): val = min(val, i) - if val != testData[3]: + if val != test_data[3]: print("atomicMin failed") return False @@ -110,7 +110,7 @@ def verify(testData, length): for i in range(length * LOOP_NUM): val = 0 if val >= limit else val + 1 - if val != testData[4]: + if val != test_data[4]: print("atomicInc failed") return False @@ -120,7 +120,7 @@ def verify(testData, length): for i in range(length * LOOP_NUM): val = limit if (val == 0) or (val > limit) else val - 1 - if val != testData[5]: + if val != test_data[5]: print("atomicDec failed") return False @@ -128,7 +128,7 @@ def verify(testData, length): for i in range(length): # seventh element should be a member of [0, len) - if i == testData[6]: + if i == test_data[6]: found = True break @@ -142,13 +142,13 @@ def verify(testData, length): # 8th element should be 1 val &= 2 * i + 7 - if val != testData[7]: + if val != test_data[7]: print("atomicAnd failed") return False # 9th element should be 0xff val = -1 - if val != testData[8]: + if val != test_data[8]: print("atomicOr failed") return False @@ -158,7 +158,7 @@ def verify(testData, length): # 11th element should be 0xff val ^= i - if val != testData[9]: + if val != test_data[9]: print("atomicXor 
failed") return False @@ -172,72 +172,74 @@ def main(): pytest.skip("Atomics not supported on Windows") # set device - dev_id = findCudaDevice() - device_prop = checkCudaErrors(cudart.cudaGetDeviceProperties(dev_id)) + dev_id = find_cuda_device() + device_prop = check_cuda_errors(cudart.cudaGetDeviceProperties(dev_id)) if not device_prop.managedMemory: pytest.skip("Unified Memory not supported on this device") - computeMode = checkCudaErrors(cudart.cudaDeviceGetAttribute(cudart.cudaDeviceAttr.cudaDevAttrComputeMode, dev_id)) - if computeMode == cudart.cudaComputeMode.cudaComputeModeProhibited: + compute_mode = check_cuda_errors( + cudart.cudaDeviceGetAttribute(cudart.cudaDeviceAttr.cudaDevAttrComputeMode, dev_id) + ) + if compute_mode == cudart.cudaComputeMode.cudaComputeModeProhibited: pytest.skip("This sample requires a device in either default or process exclusive mode") if device_prop.major < 6: pytest.skip("Requires a minimum CUDA compute 6.0 capability") - numThreads = 256 - numBlocks = 64 - numData = 10 + num_threads = 256 + num_blocks = 64 + num_data = 10 if device_prop.pageableMemoryAccess: print("CAN access pageable memory") - atom_arr_h = (ctypes.c_int * numData)(0) + atom_arr_h = (ctypes.c_int * num_data)(0) atom_arr = ctypes.addressof(atom_arr_h) else: print("CANNOT access pageable memory") - atom_arr = checkCudaErrors( - cudart.cudaMallocManaged(np.dtype(np.int32).itemsize * numData, cudart.cudaMemAttachGlobal) + atom_arr = check_cuda_errors( + cudart.cudaMallocManaged(np.dtype(np.int32).itemsize * num_data, cudart.cudaMemAttachGlobal) ) - atom_arr_h = (ctypes.c_int * numData).from_address(atom_arr) + atom_arr_h = (ctypes.c_int * num_data).from_address(atom_arr) - for i in range(numData): + for i in range(num_data): atom_arr_h[i] = 0 # To make the AND and XOR tests generate something other than 0... 
atom_arr_h[7] = atom_arr_h[9] = 0xFF - with common.KernelHelper(systemWideAtomics, dev_id) as kernelHelper: - _atomicKernel = kernelHelper.getFunction(b"atomicKernel") - kernelArgs = ((atom_arr,), (ctypes.c_void_p,)) - checkCudaErrors( - cuda.cuLaunchKernel( - _atomicKernel, - numBlocks, - 1, - 1, # grid dim - numThreads, - 1, - 1, # block dim - 0, - cuda.CU_STREAM_LEGACY, # shared mem and stream - kernelArgs, - 0, - ) - ) # arguments + kernel_helper = common.KernelHelper(system_wide_atomics, dev_id) + _atomic_kernel = kernel_helper.get_function(b"atomicKernel") + kernel_args = ((atom_arr,), (ctypes.c_void_p,)) + check_cuda_errors( + cuda.cuLaunchKernel( + _atomic_kernel, + num_blocks, + 1, + 1, # grid dim + num_threads, + 1, + 1, # block dim + 0, + cuda.CU_STREAM_LEGACY, # shared mem and stream + kernel_args, + 0, + ) + ) # arguments # NOTE: Python doesn't have an equivalent system atomic operations # atomicKernel_CPU(atom_arr_h, numBlocks * numThreads) - checkCudaErrors(cudart.cudaDeviceSynchronize()) + check_cuda_errors(cudart.cudaDeviceSynchronize()) # Compute & verify reference solution - testResult = verify(atom_arr_h, numThreads * numBlocks) + test_result = verify(atom_arr_h, num_threads * num_blocks) if device_prop.pageableMemoryAccess: pass else: - checkCudaErrors(cudart.cudaFree(atom_arr)) + check_cuda_errors(cudart.cudaFree(atom_arr)) - if not testResult: + if not test_result: print("systemWideAtomics completed with errors", file=sys.stderr) sys.exit(1) diff --git a/cuda_bindings/examples/0_Introduction/vectorAddDrv_test.py b/cuda_bindings/examples/0_Introduction/vectorAddDrv_test.py index 8ee238e36b..8c70aadd3a 100644 --- a/cuda_bindings/examples/0_Introduction/vectorAddDrv_test.py +++ b/cuda_bindings/examples/0_Introduction/vectorAddDrv_test.py @@ -7,11 +7,11 @@ import numpy as np from common import common -from common.helper_cuda import checkCudaErrors, findCudaDeviceDRV +from common.helper_cuda import check_cuda_errors, find_cuda_device_drv from 
cuda.bindings import driver as cuda -vectorAddDrv = """\ +vector_add_drv = """\ /* Vector addition: C = A + B. * * This sample is a very basic sample that implements element by element @@ -32,82 +32,82 @@ def main(): - N = 50000 - nbytes = N * np.dtype(np.float32).itemsize + n = 50000 + nbytes = n * np.dtype(np.float32).itemsize # Initialize - checkCudaErrors(cuda.cuInit(0)) - cuDevice = findCudaDeviceDRV() + check_cuda_errors(cuda.cuInit(0)) + cu_device = find_cuda_device_drv() # Create context - cuContext = checkCudaErrors(cuda.cuCtxCreate(None, 0, cuDevice)) + cu_context = check_cuda_errors(cuda.cuCtxCreate(None, 0, cu_device)) - uvaSupported = checkCudaErrors( - cuda.cuDeviceGetAttribute(cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, cuDevice) + uva_supported = check_cuda_errors( + cuda.cuDeviceGetAttribute(cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, cu_device) ) - if not uvaSupported: + if not uva_supported: import pytest pytest.skip("Accessing pageable memory directly requires UVA") - with common.KernelHelper(vectorAddDrv, int(cuDevice)) as kernelHelper: - _VecAdd_kernel = kernelHelper.getFunction(b"VecAdd_kernel") - - # Allocate input vectors h_A and h_B in host memory - h_A = np.random.rand(N).astype(dtype=np.float32) - h_B = np.random.rand(N).astype(dtype=np.float32) - h_C = np.random.rand(N).astype(dtype=np.float32) - - # Allocate vectors in device memory - d_A = checkCudaErrors(cuda.cuMemAlloc(nbytes)) - d_B = checkCudaErrors(cuda.cuMemAlloc(nbytes)) - d_C = checkCudaErrors(cuda.cuMemAlloc(nbytes)) - - # Copy vectors from host memory to device memory - checkCudaErrors(cuda.cuMemcpyHtoD(d_A, h_A, nbytes)) - checkCudaErrors(cuda.cuMemcpyHtoD(d_B, h_B, nbytes)) - - if True: - # Grid/Block configuration - threadsPerBlock = 256 - blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock - - kernelArgs = ((d_A, d_B, d_C, N), (None, None, None, ctypes.c_int)) - - # Launch the CUDA kernel - checkCudaErrors( - 
cuda.cuLaunchKernel( - _VecAdd_kernel, - blocksPerGrid, - 1, - 1, - threadsPerBlock, - 1, - 1, - 0, - 0, - kernelArgs, - 0, - ) + kernel_helper = common.KernelHelper(vector_add_drv, int(cu_device)) + _vec_add_kernel = kernel_helper.get_function(b"VecAdd_kernel") + + # Allocate input vectors h_A and h_B in host memory + h_a = np.random.rand(n).astype(dtype=np.float32) + h_b = np.random.rand(n).astype(dtype=np.float32) + h_c = np.random.rand(n).astype(dtype=np.float32) + + # Allocate vectors in device memory + d_a = check_cuda_errors(cuda.cuMemAlloc(nbytes)) + d_b = check_cuda_errors(cuda.cuMemAlloc(nbytes)) + d_c = check_cuda_errors(cuda.cuMemAlloc(nbytes)) + + # Copy vectors from host memory to device memory + check_cuda_errors(cuda.cuMemcpyHtoD(d_a, h_a, nbytes)) + check_cuda_errors(cuda.cuMemcpyHtoD(d_b, h_b, nbytes)) + + if True: + # Grid/Block configuration + threads_per_block = 256 + blocks_per_grid = (n + threads_per_block - 1) / threads_per_block + + kernel_args = ((d_a, d_b, d_c, n), (None, None, None, ctypes.c_int)) + + # Launch the CUDA kernel + check_cuda_errors( + cuda.cuLaunchKernel( + _vec_add_kernel, + blocks_per_grid, + 1, + 1, + threads_per_block, + 1, + 1, + 0, + 0, + kernel_args, + 0, ) - else: - pass - - # Copy result from device memory to host memory - # h_C contains the result in host memory - checkCudaErrors(cuda.cuMemcpyDtoH(h_C, d_C, nbytes)) - - for i in range(N): - sum_all = h_A[i] + h_B[i] - if math.fabs(h_C[i] - sum_all) > 1e-7: - break - - # Free device memory - checkCudaErrors(cuda.cuMemFree(d_A)) - checkCudaErrors(cuda.cuMemFree(d_B)) - checkCudaErrors(cuda.cuMemFree(d_C)) - - checkCudaErrors(cuda.cuCtxDestroy(cuContext)) - if i + 1 != N: + ) + else: + pass + + # Copy result from device memory to host memory + # h_C contains the result in host memory + check_cuda_errors(cuda.cuMemcpyDtoH(h_c, d_c, nbytes)) + + for i in range(n): + sum_all = h_a[i] + h_b[i] + if math.fabs(h_c[i] - sum_all) > 1e-7: + break + + # Free device memory + 
check_cuda_errors(cuda.cuMemFree(d_a)) + check_cuda_errors(cuda.cuMemFree(d_b)) + check_cuda_errors(cuda.cuMemFree(d_c)) + + check_cuda_errors(cuda.cuCtxDestroy(cu_context)) + if i + 1 != n: print("Result = FAIL", file=sys.stderr) sys.exit(1) diff --git a/cuda_bindings/examples/0_Introduction/vectorAddMMAP_test.py b/cuda_bindings/examples/0_Introduction/vectorAddMMAP_test.py index c7f9e6275b..d5e2e3d26f 100644 --- a/cuda_bindings/examples/0_Introduction/vectorAddMMAP_test.py +++ b/cuda_bindings/examples/0_Introduction/vectorAddMMAP_test.py @@ -8,11 +8,11 @@ import numpy as np from common import common -from common.helper_cuda import checkCudaErrors, findCudaDeviceDRV +from common.helper_cuda import check_cuda_errors, find_cuda_device_drv from cuda.bindings import driver as cuda -vectorAddMMAP = """\ +vector_add_mmap = """\ /* Vector addition: C = A + B. * * This sample is a very basic sample that implements element by element @@ -36,35 +36,35 @@ def round_up(x, y): return int((x - 1) / y + 1) * y -def getBackingDevices(cuDevice): - num_devices = checkCudaErrors(cuda.cuDeviceGetCount()) +def get_backing_devices(cu_device): + num_devices = check_cuda_errors(cuda.cuDeviceGetCount()) - backingDevices = [cuDevice] + backing_devices = [cu_device] for dev in range(num_devices): # The mapping device is already in the backingDevices vector - if int(dev) == int(cuDevice): + if int(dev) == int(cu_device): continue # Only peer capable devices can map each others memory - capable = checkCudaErrors(cuda.cuDeviceCanAccessPeer(cuDevice, dev)) + capable = check_cuda_errors(cuda.cuDeviceCanAccessPeer(cu_device, dev)) if not capable: continue # The device needs to support virtual address management for the required apis to work - attributeVal = checkCudaErrors( + attribute_val = check_cuda_errors( cuda.cuDeviceGetAttribute( cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED, - cuDevice, + cu_device, ) ) - if attributeVal == 0: + if attribute_val == 0: 
continue - backingDevices.append(cuda.CUdevice(dev)) - return backingDevices + backing_devices.append(cuda.CUdevice(dev)) + return backing_devices -def simpleMallocMultiDeviceMmap(size, residentDevices, mappingDevices, align=0): +def simple_malloc_multi_device_mmap(size, resident_devices, mapping_devices, align=0): min_granularity = 0 # Setup the properties common for all the chunks @@ -77,7 +77,7 @@ def simpleMallocMultiDeviceMmap(size, residentDevices, mappingDevices, align=0): # Get the minimum granularity needed for the resident devices # (the max of the minimum granularity of each participating device) - for device in residentDevices: + for device in resident_devices: prop.location.id = device status, granularity = cuda.cuMemGetAllocationGranularity( prop, cuda.CUmemAllocationGranularity_flags.CU_MEM_ALLOC_GRANULARITY_MINIMUM @@ -89,7 +89,7 @@ def simpleMallocMultiDeviceMmap(size, residentDevices, mappingDevices, align=0): # Get the minimum granularity needed for the accessing devices # (the max of the minimum granularity of each participating device) - for device in mappingDevices: + for device in mapping_devices: prop.location.id = device status, granularity = cuda.cuMemGetAllocationGranularity( prop, cuda.CUmemAllocationGranularity_flags.CU_MEM_ALLOC_GRANULARITY_MINIMUM @@ -103,28 +103,28 @@ def simpleMallocMultiDeviceMmap(size, residentDevices, mappingDevices, align=0): # Essentially size = N * residentDevices.size() * min_granularity is the requirement, # since each piece of the allocation will be stripeSize = N * min_granularity # and the min_granularity requirement applies to each stripeSize piece of the allocation. 
- size = round_up(size, len(residentDevices) * min_granularity) - stripeSize = size / len(residentDevices) + size = round_up(size, len(resident_devices) * min_granularity) + stripe_size = size / len(resident_devices) # Return the rounded up size to the caller for use in the free - allocationSize = size + allocation_size = size # Reserve the required contiguous VA space for the allocations status, dptr = cuda.cuMemAddressReserve(size, align, cuda.CUdeviceptr(0), 0) if status != cuda.CUresult.CUDA_SUCCESS: - simpleFreeMultiDeviceMmap(dptr, size) + simple_free_multi_device_mmap(dptr, size) return status, None, None # Create and map the backings on each gpu # note: reusing CUmemAllocationProp prop from earlier with prop.type & prop.location.type already specified. - for idx in range(len(residentDevices)): + for idx in range(len(resident_devices)): # Set the location for this chunk to this device - prop.location.id = residentDevices[idx] + prop.location.id = resident_devices[idx] # Create the allocation as a pinned allocation on this device - status, allocationHandle = cuda.cuMemCreate(stripeSize, prop, 0) + status, allocation_handle = cuda.cuMemCreate(stripe_size, prop, 0) if status != cuda.CUresult.CUDA_SUCCESS: - simpleFreeMultiDeviceMmap(dptr, size) + simple_free_multi_device_mmap(dptr, size) return status, None, None # Assign the chunk to the appropriate VA range and release the handle. @@ -132,10 +132,10 @@ def simpleMallocMultiDeviceMmap(size, residentDevices, mappingDevices, align=0): # Since we do not need to make any other mappings of this memory or export it, # we no longer need and can release the allocationHandle. # The allocation will be kept live until it is unmapped. - (status,) = cuda.cuMemMap(int(dptr) + (stripeSize * idx), stripeSize, 0, allocationHandle, 0) + (status,) = cuda.cuMemMap(int(dptr) + (stripe_size * idx), stripe_size, 0, allocation_handle, 0) # the handle needs to be released even if the mapping failed. 
- (status2,) = cuda.cuMemRelease(allocationHandle) + (status2,) = cuda.cuMemRelease(allocation_handle) if status != cuda.CUresult.CUDA_SUCCESS: # cuMemRelease should not have failed here # as the handle was just allocated successfully @@ -144,31 +144,31 @@ def simpleMallocMultiDeviceMmap(size, residentDevices, mappingDevices, align=0): # Cleanup in case of any mapping failures. if status != cuda.CUresult.CUDA_SUCCESS: - simpleFreeMultiDeviceMmap(dptr, size) + simple_free_multi_device_mmap(dptr, size) return status, None, None # Each accessDescriptor will describe the mapping requirement for a single device - accessDescriptors = [cuda.CUmemAccessDesc()] * len(mappingDevices) + access_descriptors = [cuda.CUmemAccessDesc()] * len(mapping_devices) # Prepare the access descriptor array indicating where and how the backings should be visible. - for idx in range(len(mappingDevices)): + for idx in range(len(mapping_devices)): # Specify which device we are adding mappings for. - accessDescriptors[idx].location.type = cuda.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE - accessDescriptors[idx].location.id = mappingDevices[idx] + access_descriptors[idx].location.type = cuda.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE + access_descriptors[idx].location.id = mapping_devices[idx] # Specify both read and write access. - accessDescriptors[idx].flags = cuda.CUmemAccess_flags.CU_MEM_ACCESS_FLAGS_PROT_READWRITE + access_descriptors[idx].flags = cuda.CUmemAccess_flags.CU_MEM_ACCESS_FLAGS_PROT_READWRITE # Apply the access descriptors to the whole VA range. 
- (status,) = cuda.cuMemSetAccess(dptr, size, accessDescriptors, len(accessDescriptors)) + (status,) = cuda.cuMemSetAccess(dptr, size, access_descriptors, len(access_descriptors)) if status != cuda.CUresult.CUDA_SUCCESS: - simpleFreeMultiDeviceMmap(dptr, size) + simple_free_multi_device_mmap(dptr, size) return status, None, None - return (status, dptr, allocationSize) + return (status, dptr, allocation_size) -def simpleFreeMultiDeviceMmap(dptr, size): +def simple_free_multi_device_mmap(dptr, size): # Unmap the mapped virtual memory region # Since the handles to the mapped backing stores have already been released # by cuMemRelease, and these are the only/last mappings referencing them, @@ -204,97 +204,97 @@ def main(): if platform.machine() == "sbsa": pytest.skip("vectorAddMMAP is not supported on sbsa") - N = 50000 - size = N * np.dtype(np.float32).itemsize + n = 50000 + size = n * np.dtype(np.float32).itemsize # Initialize - checkCudaErrors(cuda.cuInit(0)) + check_cuda_errors(cuda.cuInit(0)) - cuDevice = findCudaDeviceDRV() + cu_device = find_cuda_device_drv() # Check that the selected device supports virtual address management - attributeVal = checkCudaErrors( + attribute_val = check_cuda_errors( cuda.cuDeviceGetAttribute( cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED, - cuDevice, + cu_device, ) ) - print(f"Device {cuDevice} VIRTUAL ADDRESS MANAGEMENT SUPPORTED = {attributeVal}.") - if not attributeVal: - pytest.skip(f"Device {cuDevice} doesn't support VIRTUAL ADDRESS MANAGEMENT.") + print(f"Device {cu_device} VIRTUAL ADDRESS MANAGEMENT SUPPORTED = {attribute_val}.") + if not attribute_val: + pytest.skip(f"Device {cu_device} doesn't support VIRTUAL ADDRESS MANAGEMENT.") # The vector addition happens on cuDevice, so the allocations need to be mapped there. - mappingDevices = [cuDevice] + mapping_devices = [cu_device] # Collect devices accessible by the mapping device (cuDevice) into the backingDevices vector. 
- backingDevices = getBackingDevices(cuDevice) + backing_devices = get_backing_devices(cu_device) # Create context - cuContext = checkCudaErrors(cuda.cuCtxCreate(None, 0, cuDevice)) - - with common.KernelHelper(vectorAddMMAP, int(cuDevice)) as kernelHelper: - _VecAdd_kernel = kernelHelper.getFunction(b"VecAdd_kernel") - - # Allocate input vectors h_A and h_B in host memory - h_A = np.random.rand(size).astype(dtype=np.float32) - h_B = np.random.rand(size).astype(dtype=np.float32) - h_C = np.random.rand(size).astype(dtype=np.float32) - - # Allocate vectors in device memory - # note that a call to cuCtxEnablePeerAccess is not needed even though - # the backing devices and mapping device are not the same. - # This is because the cuMemSetAccess call explicitly specifies - # the cross device mapping. - # cuMemSetAccess is still subject to the constraints of cuDeviceCanAccessPeer - # for cross device mappings (hence why we checked cuDeviceCanAccessPeer earlier). - d_A, allocationSize = checkCudaErrors(simpleMallocMultiDeviceMmap(size, backingDevices, mappingDevices)) - d_B, _ = checkCudaErrors(simpleMallocMultiDeviceMmap(size, backingDevices, mappingDevices)) - d_C, _ = checkCudaErrors(simpleMallocMultiDeviceMmap(size, backingDevices, mappingDevices)) - - # Copy vectors from host memory to device memory - checkCudaErrors(cuda.cuMemcpyHtoD(d_A, h_A, size)) - checkCudaErrors(cuda.cuMemcpyHtoD(d_B, h_B, size)) - - # Grid/Block configuration - threadsPerBlock = 256 - blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock - - kernelArgs = ((d_A, d_B, d_C, N), (None, None, None, ctypes.c_int)) - - # Launch the CUDA kernel - checkCudaErrors( - cuda.cuLaunchKernel( - _VecAdd_kernel, - blocksPerGrid, - 1, - 1, - threadsPerBlock, - 1, - 1, - 0, - 0, - kernelArgs, - 0, - ) + cu_context = check_cuda_errors(cuda.cuCtxCreate(None, 0, cu_device)) + + kernel_helper = common.KernelHelper(vector_add_mmap, int(cu_device)) + _vec_add_kernel = kernel_helper.get_function(b"VecAdd_kernel") 
+ + # Allocate input vectors h_A and h_B in host memory + h_a = np.random.rand(size).astype(dtype=np.float32) + h_b = np.random.rand(size).astype(dtype=np.float32) + h_c = np.random.rand(size).astype(dtype=np.float32) + + # Allocate vectors in device memory + # note that a call to cuCtxEnablePeerAccess is not needed even though + # the backing devices and mapping device are not the same. + # This is because the cuMemSetAccess call explicitly specifies + # the cross device mapping. + # cuMemSetAccess is still subject to the constraints of cuDeviceCanAccessPeer + # for cross device mappings (hence why we checked cuDeviceCanAccessPeer earlier). + d_a, allocation_size = check_cuda_errors(simple_malloc_multi_device_mmap(size, backing_devices, mapping_devices)) + d_b, _ = check_cuda_errors(simple_malloc_multi_device_mmap(size, backing_devices, mapping_devices)) + d_c, _ = check_cuda_errors(simple_malloc_multi_device_mmap(size, backing_devices, mapping_devices)) + + # Copy vectors from host memory to device memory + check_cuda_errors(cuda.cuMemcpyHtoD(d_a, h_a, size)) + check_cuda_errors(cuda.cuMemcpyHtoD(d_b, h_b, size)) + + # Grid/Block configuration + threads_per_block = 256 + blocks_per_grid = (n + threads_per_block - 1) / threads_per_block + + kernel_args = ((d_a, d_b, d_c, n), (None, None, None, ctypes.c_int)) + + # Launch the CUDA kernel + check_cuda_errors( + cuda.cuLaunchKernel( + _vec_add_kernel, + blocks_per_grid, + 1, + 1, + threads_per_block, + 1, + 1, + 0, + 0, + kernel_args, + 0, ) + ) - # Copy result from device memory to host memory - # h_C contains the result in host memory - checkCudaErrors(cuda.cuMemcpyDtoH(h_C, d_C, size)) + # Copy result from device memory to host memory + # h_C contains the result in host memory + check_cuda_errors(cuda.cuMemcpyDtoH(h_c, d_c, size)) - # Verify result - for i in range(N): - sum_all = h_A[i] + h_B[i] - if math.fabs(h_C[i] - sum_all) > 1e-7: - break + # Verify result + for i in range(n): + sum_all = h_a[i] + h_b[i] + 
if math.fabs(h_c[i] - sum_all) > 1e-7: + break - checkCudaErrors(simpleFreeMultiDeviceMmap(d_A, allocationSize)) - checkCudaErrors(simpleFreeMultiDeviceMmap(d_B, allocationSize)) - checkCudaErrors(simpleFreeMultiDeviceMmap(d_C, allocationSize)) + check_cuda_errors(simple_free_multi_device_mmap(d_a, allocation_size)) + check_cuda_errors(simple_free_multi_device_mmap(d_b, allocation_size)) + check_cuda_errors(simple_free_multi_device_mmap(d_c, allocation_size)) - checkCudaErrors(cuda.cuCtxDestroy(cuContext)) + check_cuda_errors(cuda.cuCtxDestroy(cu_context)) - if i + 1 != N: + if i + 1 != n: print("Result = FAIL", file=sys.stderr) sys.exit(1) diff --git a/cuda_bindings/examples/2_Concepts_and_Techniques/streamOrderedAllocation_test.py b/cuda_bindings/examples/2_Concepts_and_Techniques/streamOrderedAllocation_test.py index afe769ca15..f26dd2dabe 100644 --- a/cuda_bindings/examples/2_Concepts_and_Techniques/streamOrderedAllocation_test.py +++ b/cuda_bindings/examples/2_Concepts_and_Techniques/streamOrderedAllocation_test.py @@ -9,13 +9,13 @@ import numpy as np from common import common -from common.helper_cuda import checkCudaErrors, findCudaDevice -from common.helper_string import checkCmdLineFlag +from common.helper_cuda import check_cuda_errors, find_cuda_device +from common.helper_string import check_cmd_line_flag from cuda.bindings import driver as cuda from cuda.bindings import runtime as cudart -streamOrderedAllocation = """\ +stream_ordered_allocation = """\ /* Add two vectors on the GPU */ extern "C" __global__ void vectorAddGPU(const float *a, const float *b, float *c, int N) @@ -31,18 +31,18 @@ MAX_ITER = 20 -def basicStreamOrderedAllocation(dev, nelem, a, b, c): +def basic_stream_ordered_allocation(dev, nelem, a, b, c): num_bytes = nelem * np.dtype(np.float32).itemsize print("Starting basicStreamOrderedAllocation()") - checkCudaErrors(cudart.cudaSetDevice(dev)) - stream = checkCudaErrors(cudart.cudaStreamCreateWithFlags(cudart.cudaStreamNonBlocking)) + 
check_cuda_errors(cudart.cudaSetDevice(dev)) + stream = check_cuda_errors(cudart.cudaStreamCreateWithFlags(cudart.cudaStreamNonBlocking)) - d_a = checkCudaErrors(cudart.cudaMallocAsync(num_bytes, stream)) - d_b = checkCudaErrors(cudart.cudaMallocAsync(num_bytes, stream)) - d_c = checkCudaErrors(cudart.cudaMallocAsync(num_bytes, stream)) - checkCudaErrors(cudart.cudaMemcpyAsync(d_a, a, num_bytes, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice, stream)) - checkCudaErrors(cudart.cudaMemcpyAsync(d_b, b, num_bytes, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice, stream)) + d_a = check_cuda_errors(cudart.cudaMallocAsync(num_bytes, stream)) + d_b = check_cuda_errors(cudart.cudaMallocAsync(num_bytes, stream)) + d_c = check_cuda_errors(cudart.cudaMallocAsync(num_bytes, stream)) + check_cuda_errors(cudart.cudaMemcpyAsync(d_a, a, num_bytes, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice, stream)) + check_cuda_errors(cudart.cudaMemcpyAsync(d_b, b, num_bytes, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice, stream)) block = cudart.dim3() block.x = 256 @@ -53,13 +53,13 @@ def basicStreamOrderedAllocation(dev, nelem, a, b, c): grid.y = 1 grid.z = 1 - kernelArgs = ( + kernel_args = ( (d_a, d_b, d_c, nelem), (ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int), ) - checkCudaErrors( + check_cuda_errors( cuda.cuLaunchKernel( - _vectorAddGPU, + _vector_add_gpu, grid.x, grid.y, grid.z, # grid dim @@ -68,68 +68,72 @@ def basicStreamOrderedAllocation(dev, nelem, a, b, c): block.z, # block dim 0, stream, # shared mem and stream - kernelArgs, + kernel_args, 0, ) ) # arguments - checkCudaErrors(cudart.cudaFreeAsync(d_a, stream)) - checkCudaErrors(cudart.cudaFreeAsync(d_b, stream)) - checkCudaErrors(cudart.cudaMemcpyAsync(c, d_c, num_bytes, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost, stream)) - checkCudaErrors(cudart.cudaFreeAsync(d_c, stream)) - checkCudaErrors(cudart.cudaStreamSynchronize(stream)) + check_cuda_errors(cudart.cudaFreeAsync(d_a, stream)) + 
check_cuda_errors(cudart.cudaFreeAsync(d_b, stream)) + check_cuda_errors(cudart.cudaMemcpyAsync(c, d_c, num_bytes, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost, stream)) + check_cuda_errors(cudart.cudaFreeAsync(d_c, stream)) + check_cuda_errors(cudart.cudaStreamSynchronize(stream)) # Compare the results print("> Checking the results from vectorAddGPU() ...") - errorNorm = 0.0 - refNorm = 0.0 + error_norm = 0.0 + ref_norm = 0.0 for n in range(nelem): ref = a[n] + b[n] diff = c[n] - ref - errorNorm += diff * diff - refNorm += ref * ref + error_norm += diff * diff + ref_norm += ref * ref - errorNorm = math.sqrt(errorNorm) - refNorm = math.sqrt(refNorm) + error_norm = math.sqrt(error_norm) + ref_norm = math.sqrt(ref_norm) - checkCudaErrors(cudart.cudaStreamDestroy(stream)) + check_cuda_errors(cudart.cudaStreamDestroy(stream)) - return errorNorm / refNorm < 1.0e-6 + return error_norm / ref_norm < 1.0e-6 # streamOrderedAllocationPostSync(): demonstrates If the application wants the memory to persist in the pool beyond # synchronization, then it sets the release threshold on the pool. This way, when the application reaches the "steady state", # it is no longer allocating/freeing memory from the OS. 
-def streamOrderedAllocationPostSync(dev, nelem, a, b, c): +def stream_ordered_allocation_post_sync(dev, nelem, a, b, c): num_bytes = nelem * np.dtype(np.float32).itemsize print("Starting streamOrderedAllocationPostSync()") - checkCudaErrors(cudart.cudaSetDevice(dev)) - stream = checkCudaErrors(cudart.cudaStreamCreateWithFlags(cudart.cudaStreamNonBlocking)) - start = checkCudaErrors(cudart.cudaEventCreate()) - end = checkCudaErrors(cudart.cudaEventCreate()) + check_cuda_errors(cudart.cudaSetDevice(dev)) + stream = check_cuda_errors(cudart.cudaStreamCreateWithFlags(cudart.cudaStreamNonBlocking)) + start = check_cuda_errors(cudart.cudaEventCreate()) + end = check_cuda_errors(cudart.cudaEventCreate()) - memPool = checkCudaErrors(cudart.cudaDeviceGetDefaultMemPool(dev)) - thresholdVal = cuda.cuuint64_t(ctypes.c_uint64(-1).value) + mem_pool = check_cuda_errors(cudart.cudaDeviceGetDefaultMemPool(dev)) + threshold_val = cuda.cuuint64_t(ctypes.c_uint64(-1).value) # Set high release threshold on the default pool so that cudaFreeAsync will not actually release memory to the system. # By default, the release threshold for a memory pool is set to zero. This implies that the CUDA driver is # allowed to release a memory chunk back to the system as long as it does not contain any active suballocations. 
- checkCudaErrors( + check_cuda_errors( cudart.cudaMemPoolSetAttribute( - memPool, + mem_pool, cudart.cudaMemPoolAttr.cudaMemPoolAttrReleaseThreshold, - thresholdVal, + threshold_val, ) ) # Record teh start event - checkCudaErrors(cudart.cudaEventRecord(start, stream)) + check_cuda_errors(cudart.cudaEventRecord(start, stream)) for _i in range(MAX_ITER): - d_a = checkCudaErrors(cudart.cudaMallocAsync(num_bytes, stream)) - d_b = checkCudaErrors(cudart.cudaMallocAsync(num_bytes, stream)) - d_c = checkCudaErrors(cudart.cudaMallocAsync(num_bytes, stream)) - checkCudaErrors(cudart.cudaMemcpyAsync(d_a, a, num_bytes, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice, stream)) - checkCudaErrors(cudart.cudaMemcpyAsync(d_b, b, num_bytes, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice, stream)) + d_a = check_cuda_errors(cudart.cudaMallocAsync(num_bytes, stream)) + d_b = check_cuda_errors(cudart.cudaMallocAsync(num_bytes, stream)) + d_c = check_cuda_errors(cudart.cudaMallocAsync(num_bytes, stream)) + check_cuda_errors( + cudart.cudaMemcpyAsync(d_a, a, num_bytes, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice, stream) + ) + check_cuda_errors( + cudart.cudaMemcpyAsync(d_b, b, num_bytes, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice, stream) + ) block = cudart.dim3() block.x = 256 @@ -140,13 +144,13 @@ def streamOrderedAllocationPostSync(dev, nelem, a, b, c): grid.y = 1 grid.z = 1 - kernelArgs = ( + kernel_args = ( (d_a, d_b, d_c, nelem), (ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int), ) - checkCudaErrors( + check_cuda_errors( cuda.cuLaunchKernel( - _vectorAddGPU, + _vector_add_gpu, grid.x, grid.y, grid.z, # grid dim @@ -155,40 +159,42 @@ def streamOrderedAllocationPostSync(dev, nelem, a, b, c): block.z, # block dim 0, stream, # shared mem and stream - kernelArgs, + kernel_args, 0, ) ) # arguments - checkCudaErrors(cudart.cudaFreeAsync(d_a, stream)) - checkCudaErrors(cudart.cudaFreeAsync(d_b, stream)) - checkCudaErrors(cudart.cudaMemcpyAsync(c, d_c, num_bytes, 
cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost, stream)) - checkCudaErrors(cudart.cudaFreeAsync(d_c, stream)) - checkCudaErrors(cudart.cudaStreamSynchronize(stream)) - checkCudaErrors(cudart.cudaEventRecord(end, stream)) + check_cuda_errors(cudart.cudaFreeAsync(d_a, stream)) + check_cuda_errors(cudart.cudaFreeAsync(d_b, stream)) + check_cuda_errors( + cudart.cudaMemcpyAsync(c, d_c, num_bytes, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost, stream) + ) + check_cuda_errors(cudart.cudaFreeAsync(d_c, stream)) + check_cuda_errors(cudart.cudaStreamSynchronize(stream)) + check_cuda_errors(cudart.cudaEventRecord(end, stream)) # Wait for the end event to complete - checkCudaErrors(cudart.cudaEventSynchronize(end)) + check_cuda_errors(cudart.cudaEventSynchronize(end)) - msecTotal = checkCudaErrors(cudart.cudaEventElapsedTime(start, end)) - print(f"Total elapsed time = {msecTotal} ms over {MAX_ITER} iterations") + msec_total = check_cuda_errors(cudart.cudaEventElapsedTime(start, end)) + print(f"Total elapsed time = {msec_total} ms over {MAX_ITER} iterations") # Compare the results print("> Checking the results from vectorAddGPU() ...") - errorNorm = 0.0 - refNorm = 0.0 + error_norm = 0.0 + ref_norm = 0.0 for n in range(nelem): ref = a[n] + b[n] diff = c[n] - ref - errorNorm += diff * diff - refNorm += ref * ref + error_norm += diff * diff + ref_norm += ref * ref - errorNorm = math.sqrt(errorNorm) - refNorm = math.sqrt(refNorm) + error_norm = math.sqrt(error_norm) + ref_norm = math.sqrt(ref_norm) - checkCudaErrors(cudart.cudaStreamDestroy(stream)) + check_cuda_errors(cudart.cudaStreamDestroy(stream)) - return errorNorm / refNorm < 1.0e-6 + return error_norm / ref_norm < 1.0e-6 def main(): @@ -198,42 +204,42 @@ def main(): pytest.skip("streamOrderedAllocation is not supported on Mac OSX") cuda.cuInit(0) - if checkCmdLineFlag("help"): + if check_cmd_line_flag("help"): print("Usage: streamOrderedAllocation [OPTION]\n", file=sys.stderr) print("Options:", file=sys.stderr) print(" 
device=[device #] Specify the device to be used", file=sys.stderr) sys.exit(1) - dev = findCudaDevice() + dev = find_cuda_device() - version = checkCudaErrors(cudart.cudaDriverGetVersion()) + version = check_cuda_errors(cudart.cudaDriverGetVersion()) if version < 11030: - isMemPoolSupported = False + is_mem_pool_supported = False else: - isMemPoolSupported = checkCudaErrors( + is_mem_pool_supported = check_cuda_errors( cudart.cudaDeviceGetAttribute(cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED, dev) ) - if not isMemPoolSupported: + if not is_mem_pool_supported: pytest.skip("Waiving execution as device does not support Memory Pools") - global _vectorAddGPU - with common.KernelHelper(streamOrderedAllocation, dev) as kernelHelper: - _vectorAddGPU = kernelHelper.getFunction(b"vectorAddGPU") + global _vector_add_gpu + kernel_helper = common.KernelHelper(stream_ordered_allocation, dev) + _vector_add_gpu = kernel_helper.get_function(b"vectorAddGPU") - # Allocate CPU memory - nelem = 1048576 - nelem * np.dtype(np.float32).itemsize + # Allocate CPU memory + nelem = 1048576 + nelem * np.dtype(np.float32).itemsize - a = np.zeros(nelem, dtype="float32") - b = np.zeros(nelem, dtype="float32") - c = np.zeros(nelem, dtype="float32") - # Initialize the vectors - for i in range(nelem): - a[i] = rnd.random() - b[i] = rnd.random() + a = np.zeros(nelem, dtype="float32") + b = np.zeros(nelem, dtype="float32") + c = np.zeros(nelem, dtype="float32") + # Initialize the vectors + for i in range(nelem): + a[i] = rnd.random() + b[i] = rnd.random() - ret1 = basicStreamOrderedAllocation(dev, nelem, a, b, c) - ret2 = streamOrderedAllocationPostSync(dev, nelem, a, b, c) + ret1 = basic_stream_ordered_allocation(dev, nelem, a, b, c) + ret2 = stream_ordered_allocation_post_sync(dev, nelem, a, b, c) if not ret1 or not ret2: sys.exit(1) diff --git a/cuda_bindings/examples/3_CUDA_Features/globalToShmemAsyncCopy_test.py 
b/cuda_bindings/examples/3_CUDA_Features/globalToShmemAsyncCopy_test.py index aaa03e446a..722d19dcb5 100644 --- a/cuda_bindings/examples/3_CUDA_Features/globalToShmemAsyncCopy_test.py +++ b/cuda_bindings/examples/3_CUDA_Features/globalToShmemAsyncCopy_test.py @@ -9,16 +9,16 @@ import numpy as np from common import common -from common.helper_cuda import checkCudaErrors, findCudaDevice -from common.helper_string import checkCmdLineFlag, getCmdLineArgumentInt +from common.helper_cuda import check_cuda_errors, find_cuda_device +from common.helper_string import check_cmd_line_flag, get_cmd_line_argument_int from cuda.bindings import driver as cuda from cuda.bindings import runtime as cudart -blockSize = 16 +block_size = 16 -class kernels(Enum): +class Kernels(Enum): AsyncCopyMultiStageLargeChunk = 0 AsyncCopyLargeChunk = 1 AsyncCopyLargeChunkAWBarrier = 2 @@ -29,7 +29,7 @@ class kernels(Enum): NaiveLargeChunk = 7 -kernelNames = [ +kernel_names = [ "AsyncCopyMultiStageLargeChunk", "AsyncCopyLargeChunk", "AsyncCopyLargeChunkAWBarrier", @@ -40,7 +40,7 @@ class kernels(Enum): "NaiveLargeChunk", ] -globalToShmemAsyncCopy = """\ +global_to_shmem_async_copy = """\ #line __LINE__ #if __CUDA_ARCH__ >= 700 #include @@ -709,7 +709,7 @@ class kernels(Enum): """ -def ConstantInit(data, size, val): +def constant_init(data, size, val): p_data = (ctypes.c_float * size).from_address(data) for i in range(size): p_data[i] = val @@ -718,78 +718,82 @@ def ConstantInit(data, size, val): # # Run matrix multiplication using CUDA # -def MatrixMultiply(dimsA, dimsB, kernel_number): +def matrix_multiply(dims_a, dims_b, kernel_number): # Allocate host memory for matricies A and B - size_A = dimsA.x * dimsA.y - mem_size_A = np.dtype(np.float32).itemsize * size_A - h_A = checkCudaErrors(cudart.cudaMallocHost(mem_size_A)) - size_B = dimsB.x * dimsB.y - mem_size_B = np.dtype(np.float32).itemsize * size_B - h_B = checkCudaErrors(cudart.cudaMallocHost(mem_size_B)) + size_a = dims_a.x * dims_a.y + 
mem_size_a = np.dtype(np.float32).itemsize * size_a + h_a = check_cuda_errors(cudart.cudaMallocHost(mem_size_a)) + size_b = dims_b.x * dims_b.y + mem_size_b = np.dtype(np.float32).itemsize * size_b + h_b = check_cuda_errors(cudart.cudaMallocHost(mem_size_b)) # Initialize host memory - valB = 2.10 - ConstantInit(h_A, size_A, 1.0) - ConstantInit(h_B, size_B, valB) + val_b = 2.10 + constant_init(h_a, size_a, 1.0) + constant_init(h_b, size_b, val_b) # Allocate Device Memory # Allocate host matrix C - dimsC = cudart.dim3() - dimsC.x = dimsB.x - dimsC.y = dimsA.y - dimsC.z = 1 - mem_size_C = dimsC.x * dimsC.y * np.dtype(np.float32).itemsize - h_C = checkCudaErrors(cudart.cudaMallocHost(mem_size_C)) - - if h_C == 0: + dims_c = cudart.dim3() + dims_c.x = dims_b.x + dims_c.y = dims_a.y + dims_c.z = 1 + mem_size_c = dims_c.x * dims_c.y * np.dtype(np.float32).itemsize + h_c = check_cuda_errors(cudart.cudaMallocHost(mem_size_c)) + + if h_c == 0: print("Failed to allocate host matrix C!", file=sys.stderr) sys.exit(1) - d_A = checkCudaErrors(cudart.cudaMalloc(mem_size_A)) - d_B = checkCudaErrors(cudart.cudaMalloc(mem_size_B)) - d_C = checkCudaErrors(cudart.cudaMalloc(mem_size_C)) + d_a = check_cuda_errors(cudart.cudaMalloc(mem_size_a)) + d_b = check_cuda_errors(cudart.cudaMalloc(mem_size_b)) + d_c = check_cuda_errors(cudart.cudaMalloc(mem_size_c)) # Allocate CUDA events that we'll use for timing - start = checkCudaErrors(cudart.cudaEventCreate()) - stop = checkCudaErrors(cudart.cudaEventCreate()) + start = check_cuda_errors(cudart.cudaEventCreate()) + stop = check_cuda_errors(cudart.cudaEventCreate()) - stream = checkCudaErrors(cudart.cudaStreamCreateWithFlags(cudart.cudaStreamNonBlocking)) + stream = check_cuda_errors(cudart.cudaStreamCreateWithFlags(cudart.cudaStreamNonBlocking)) # Copy host memory to device - checkCudaErrors(cudart.cudaMemcpyAsync(d_A, h_A, mem_size_A, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice, stream)) - checkCudaErrors(cudart.cudaMemcpyAsync(d_B, h_B, 
mem_size_B, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice, stream)) - checkCudaErrors(cudart.cudaMemsetAsync(d_C, 0, mem_size_C, stream)) + check_cuda_errors( + cudart.cudaMemcpyAsync(d_a, h_a, mem_size_a, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice, stream) + ) + check_cuda_errors( + cudart.cudaMemcpyAsync(d_b, h_b, mem_size_b, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice, stream) + ) + check_cuda_errors(cudart.cudaMemsetAsync(d_c, 0, mem_size_c, stream)) # Setup execution parameters threads = cudart.dim3() - threads.x = threads.y = blockSize + threads.x = threads.y = block_size threads.z = 1 grid = cudart.dim3() - grid.x = dimsB.x / threads.x - grid.y = dimsA.y / threads.y + grid.x = dims_b.x / threads.x + grid.y = dims_a.y / threads.y grid.z = 1 # Here the block size is 16x18, where first 16 rows are consumer thread group # and last 2 rows (1 warp) is producer thread group - threadsSharedStateKernel = cudart.dim3() - threadsSharedStateKernel.x = blockSize - threadsSharedStateKernel.y = blockSize + 2 - threadsSharedStateKernel.z = 1 - gridSharedStateKernel = cudart.dim3() - gridSharedStateKernel.x = dimsB.x / threadsSharedStateKernel.x - gridSharedStateKernel.y = dimsA.y / threadsSharedStateKernel.x - - print(f"Running kernel = {kernel_number} - {kernelNames[kernel_number.value]}") + threads_shared_state_kernel = cudart.dim3() + threads_shared_state_kernel.x = block_size + threads_shared_state_kernel.y = block_size + 2 + threads_shared_state_kernel.z = 1 + grid_shared_state_kernel = cudart.dim3() + grid_shared_state_kernel.x = dims_b.x / threads_shared_state_kernel.x + grid_shared_state_kernel.y = dims_a.y / threads_shared_state_kernel.x + + print(f"Running kernel = {kernel_number} - {kernel_names[kernel_number.value]}") # Create and start timer print("Computing result using CUDA Kernel...") # Performs warmup operation using matrixMul CUDA kernel - kernelArguments = ( - (d_C, d_A, d_B, dimsA.x, dimsB.x), + kernel_arguments = ( + (d_c, d_a, d_b, dims_a.x, 
dims_b.x), (ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_int), ) - if kernel_number == kernels.AsyncCopyMultiStageLargeChunk: - checkCudaErrors( + if kernel_number == Kernels.AsyncCopyMultiStageLargeChunk: + check_cuda_errors( cuda.cuLaunchKernel( _MatrixMulAsyncCopyMultiStageLargeChunk, grid.x, @@ -800,12 +804,12 @@ def MatrixMultiply(dimsA, dimsB, kernel_number): threads.z, # block dim 0, # shared mem stream, # stream - kernelArguments, + kernel_arguments, 0, ) ) # arguments - elif kernel_number == kernels.AsyncCopyLargeChunk: - checkCudaErrors( + elif kernel_number == Kernels.AsyncCopyLargeChunk: + check_cuda_errors( cuda.cuLaunchKernel( _MatrixMulAsyncCopyLargeChunk, grid.x, @@ -816,12 +820,12 @@ def MatrixMultiply(dimsA, dimsB, kernel_number): threads.z, # block dim 0, # shared mem stream, # stream - kernelArguments, + kernel_arguments, 0, ) ) # arguments - elif kernel_number == kernels.AsyncCopyLargeChunkAWBarrier: - checkCudaErrors( + elif kernel_number == Kernels.AsyncCopyLargeChunkAWBarrier: + check_cuda_errors( cuda.cuLaunchKernel( _MatrixMulAsyncCopyLargeChunkAWBarrier, grid.x, @@ -832,28 +836,28 @@ def MatrixMultiply(dimsA, dimsB, kernel_number): threads.z, # block dim 0, # shared mem stream, # stream - kernelArguments, + kernel_arguments, 0, ) ) # arguments - elif kernel_number == kernels.AsyncCopyMultiStageSharedState: - checkCudaErrors( + elif kernel_number == Kernels.AsyncCopyMultiStageSharedState: + check_cuda_errors( cuda.cuLaunchKernel( _MatrixMulAsyncCopyMultiStageSharedState, - gridSharedStateKernel.x, - gridSharedStateKernel.y, - gridSharedStateKernel.z, # grid dim - threadsSharedStateKernel.x, - threadsSharedStateKernel.y, - threadsSharedStateKernel.z, # block dim + grid_shared_state_kernel.x, + grid_shared_state_kernel.y, + grid_shared_state_kernel.z, # grid dim + threads_shared_state_kernel.x, + threads_shared_state_kernel.y, + threads_shared_state_kernel.z, # block dim 0, # shared mem stream, # stream - 
kernelArguments, + kernel_arguments, 0, ) ) # arguments - elif kernel_number == kernels.AsyncCopyMultiStage: - checkCudaErrors( + elif kernel_number == Kernels.AsyncCopyMultiStage: + check_cuda_errors( cuda.cuLaunchKernel( _MatrixMulAsyncCopyMultiStage, grid.x, @@ -864,12 +868,12 @@ def MatrixMultiply(dimsA, dimsB, kernel_number): threads.z, # block dim 0, # shared mem stream, # stream - kernelArguments, + kernel_arguments, 0, ) ) # arguments - elif kernel_number == kernels.AsyncCopySingleStage: - checkCudaErrors( + elif kernel_number == Kernels.AsyncCopySingleStage: + check_cuda_errors( cuda.cuLaunchKernel( _MatrixMulAsyncCopySingleStage, grid.x, @@ -880,12 +884,12 @@ def MatrixMultiply(dimsA, dimsB, kernel_number): threads.z, # block dim 0, # shared mem stream, # stream - kernelArguments, + kernel_arguments, 0, ) ) # arguments - elif kernel_number == kernels.Naive: - checkCudaErrors( + elif kernel_number == Kernels.Naive: + check_cuda_errors( cuda.cuLaunchKernel( _MatrixMulNaive, grid.x, @@ -896,12 +900,12 @@ def MatrixMultiply(dimsA, dimsB, kernel_number): threads.z, # block dim 0, # shared mem stream, # stream - kernelArguments, + kernel_arguments, 0, ) ) # arguments - elif kernel_number == kernels.NaiveLargeChunk: - checkCudaErrors( + elif kernel_number == Kernels.NaiveLargeChunk: + check_cuda_errors( cuda.cuLaunchKernel( _MatrixMulNaiveLargeChunk, grid.x, @@ -912,21 +916,21 @@ def MatrixMultiply(dimsA, dimsB, kernel_number): threads.z, # block dim 0, # shared mem stream, # stream - kernelArguments, + kernel_arguments, 0, ) ) # arguments - checkCudaErrors(cudart.cudaStreamSynchronize(stream)) + check_cuda_errors(cudart.cudaStreamSynchronize(stream)) # Execute the kernel - nIter = 100 + n_iter = 100 # Record the start event - checkCudaErrors(cudart.cudaEventRecord(start, stream)) + check_cuda_errors(cudart.cudaEventRecord(start, stream)) - if kernel_number == kernels.AsyncCopyMultiStageLargeChunk: - checkCudaErrors( + if kernel_number == 
Kernels.AsyncCopyMultiStageLargeChunk: + check_cuda_errors( cuda.cuLaunchKernel( _MatrixMulAsyncCopyMultiStageLargeChunk, grid.x, @@ -937,12 +941,12 @@ def MatrixMultiply(dimsA, dimsB, kernel_number): threads.z, # block dim 0, # shared mem stream, # stream - kernelArguments, + kernel_arguments, 0, ) ) # arguments - elif kernel_number == kernels.AsyncCopyLargeChunk: - checkCudaErrors( + elif kernel_number == Kernels.AsyncCopyLargeChunk: + check_cuda_errors( cuda.cuLaunchKernel( _MatrixMulAsyncCopyLargeChunk, grid.x, @@ -953,12 +957,12 @@ def MatrixMultiply(dimsA, dimsB, kernel_number): threads.z, # block dim 0, # shared mem stream, # stream - kernelArguments, + kernel_arguments, 0, ) ) # arguments - elif kernel_number == kernels.AsyncCopyLargeChunkAWBarrier: - checkCudaErrors( + elif kernel_number == Kernels.AsyncCopyLargeChunkAWBarrier: + check_cuda_errors( cuda.cuLaunchKernel( _MatrixMulAsyncCopyLargeChunkAWBarrier, grid.x, @@ -969,28 +973,28 @@ def MatrixMultiply(dimsA, dimsB, kernel_number): threads.z, # block dim 0, # shared mem stream, # stream - kernelArguments, + kernel_arguments, 0, ) ) # arguments - elif kernel_number == kernels.AsyncCopyMultiStageSharedState: - checkCudaErrors( + elif kernel_number == Kernels.AsyncCopyMultiStageSharedState: + check_cuda_errors( cuda.cuLaunchKernel( _MatrixMulAsyncCopyMultiStageSharedState, - gridSharedStateKernel.x, - gridSharedStateKernel.y, - gridSharedStateKernel.z, # grid dim - threadsSharedStateKernel.x, - threadsSharedStateKernel.y, - threadsSharedStateKernel.z, # block dim + grid_shared_state_kernel.x, + grid_shared_state_kernel.y, + grid_shared_state_kernel.z, # grid dim + threads_shared_state_kernel.x, + threads_shared_state_kernel.y, + threads_shared_state_kernel.z, # block dim 0, # shared mem stream, # stream - kernelArguments, + kernel_arguments, 0, ) ) # arguments - elif kernel_number == kernels.AsyncCopyMultiStage: - checkCudaErrors( + elif kernel_number == Kernels.AsyncCopyMultiStage: + check_cuda_errors( 
cuda.cuLaunchKernel( _MatrixMulAsyncCopyMultiStage, grid.x, @@ -1001,12 +1005,12 @@ def MatrixMultiply(dimsA, dimsB, kernel_number): threads.z, # block dim 0, # shared mem stream, # stream - kernelArguments, + kernel_arguments, 0, ) ) # arguments - elif kernel_number == kernels.AsyncCopySingleStage: - checkCudaErrors( + elif kernel_number == Kernels.AsyncCopySingleStage: + check_cuda_errors( cuda.cuLaunchKernel( _MatrixMulAsyncCopySingleStage, grid.x, @@ -1017,12 +1021,12 @@ def MatrixMultiply(dimsA, dimsB, kernel_number): threads.z, # block dim 0, # shared mem stream, # stream - kernelArguments, + kernel_arguments, 0, ) ) # arguments - elif kernel_number == kernels.Naive: - checkCudaErrors( + elif kernel_number == Kernels.Naive: + check_cuda_errors( cuda.cuLaunchKernel( _MatrixMulNaive, grid.x, @@ -1033,12 +1037,12 @@ def MatrixMultiply(dimsA, dimsB, kernel_number): threads.z, # block dim 0, # shared mem stream, # stream - kernelArguments, + kernel_arguments, 0, ) ) # arguments - elif kernel_number == kernels.NaiveLargeChunk: - checkCudaErrors( + elif kernel_number == Kernels.NaiveLargeChunk: + check_cuda_errors( cuda.cuLaunchKernel( _MatrixMulNaiveLargeChunk, grid.x, @@ -1049,31 +1053,33 @@ def MatrixMultiply(dimsA, dimsB, kernel_number): threads.z, # block dim 0, # shared mem stream, # stream - kernelArguments, + kernel_arguments, 0, ) ) # arguments # Record the stop event - checkCudaErrors(cudart.cudaEventRecord(stop, stream)) + check_cuda_errors(cudart.cudaEventRecord(stop, stream)) # Wait for the stop event to complete - checkCudaErrors(cudart.cudaEventSynchronize(stop)) + check_cuda_errors(cudart.cudaEventSynchronize(stop)) - msecTotal = checkCudaErrors(cudart.cudaEventElapsedTime(start, stop)) + msec_total = check_cuda_errors(cudart.cudaEventElapsedTime(start, stop)) # Compute and print the performance - msecPerMatrixMul = msecTotal / nIter - flopsPerMatrixMul = 2.0 * dimsA.x * dimsA.y * dimsB.x - gigaFlops = (flopsPerMatrixMul * 1.0e-9) / (msecPerMatrixMul 
/ 1000.0) + msec_per_matrix_mul = msec_total / n_iter + flops_per_matrix_mul = 2.0 * dims_a.x * dims_a.y * dims_b.x + giga_flops = (flops_per_matrix_mul * 1.0e-9) / (msec_per_matrix_mul / 1000.0) print( - f"Performance= {gigaFlops:.2f} GFlop/s, Time= {msecPerMatrixMul:.2f} msec, Size= {flopsPerMatrixMul:.0f} Ops, WorkgroupSize= {threads.x * threads.y} threads/block" + f"Performance= {giga_flops:.2f} GFlop/s, Time= {msec_per_matrix_mul:.2f} msec, Size= {flops_per_matrix_mul:.0f} Ops, WorkgroupSize= {threads.x * threads.y} threads/block" ) # Copy result from device to host - checkCudaErrors(cudart.cudaMemcpyAsync(h_C, d_C, mem_size_C, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost, stream)) - checkCudaErrors(cudart.cudaStreamSynchronize(stream)) + check_cuda_errors( + cudart.cudaMemcpyAsync(h_c, d_c, mem_size_c, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost, stream) + ) + check_cuda_errors(cudart.cudaStreamSynchronize(stream)) correct = True @@ -1081,16 +1087,16 @@ def MatrixMultiply(dimsA, dimsB, kernel_number): # |_cpu - _gpu|/<|x|, |y|> < eps eps = 1.0e-6 - h_C_local = (ctypes.c_float * (dimsC.x * dimsC.y)).from_address(h_C) - for i in range(dimsC.x * dimsC.y): - abs_err = math.fabs(h_C_local[i] - (dimsA.x * valB)) - dot_length = dimsA.x - abs_val = math.fabs(h_C_local[i]) + h_c_local = (ctypes.c_float * (dims_c.x * dims_c.y)).from_address(h_c) + for i in range(dims_c.x * dims_c.y): + abs_err = math.fabs(h_c_local[i] - (dims_a.x * val_b)) + dot_length = dims_a.x + abs_val = math.fabs(h_c_local[i]) rel_err = abs_err / abs_val / dot_length if rel_err > eps: print( - f"Error! Matrix[{i:.5f}]={h_C_local[i]:.8f} ref={dimsA.x * valB:.8f} err term is > {rel_err}", + f"Error! 
Matrix[{i:.5f}]={h_c_local[i]:.8f} ref={dims_a.x * val_b:.8f} err term is > {rel_err}", file=sys.stderr, ) correct = False @@ -1099,14 +1105,14 @@ def MatrixMultiply(dimsA, dimsB, kernel_number): print("Result = FAIL", file=sys.stderr) # Clean up memory - checkCudaErrors(cudart.cudaFreeHost(h_A)) - checkCudaErrors(cudart.cudaFreeHost(h_B)) - checkCudaErrors(cudart.cudaFreeHost(h_C)) - checkCudaErrors(cudart.cudaFree(d_A)) - checkCudaErrors(cudart.cudaFree(d_B)) - checkCudaErrors(cudart.cudaFree(d_C)) - checkCudaErrors(cudart.cudaEventDestroy(start)) - checkCudaErrors(cudart.cudaEventDestroy(stop)) + check_cuda_errors(cudart.cudaFreeHost(h_a)) + check_cuda_errors(cudart.cudaFreeHost(h_b)) + check_cuda_errors(cudart.cudaFreeHost(h_c)) + check_cuda_errors(cudart.cudaFree(d_a)) + check_cuda_errors(cudart.cudaFree(d_b)) + check_cuda_errors(cudart.cudaFree(d_c)) + check_cuda_errors(cudart.cudaEventDestroy(start)) + check_cuda_errors(cudart.cudaEventDestroy(stop)) print( "\nNOTE: The CUDA Samples are not meant for performance " "measurements. Results may vary when GPU Boost is enabled." 
@@ -1119,16 +1125,16 @@ def MatrixMultiply(dimsA, dimsB, kernel_number): def main(): import pytest - common.pytest_skipif_compute_capability_too_low(findCudaDevice(), (7, 0)) + common.pytest_skipif_compute_capability_too_low(find_cuda_device(), (7, 0)) if platform.machine() == "qnx": pytest.skip("globalToShmemAsyncCopy is not supported on QNX") - version = checkCudaErrors(cuda.cuDriverGetVersion()) + version = check_cuda_errors(cuda.cuDriverGetVersion()) if version < 11010: pytest.skip("CUDA Toolkit 11.1 or greater is required") - if checkCmdLineFlag("help") or checkCmdLineFlag("?"): + if check_cmd_line_flag("help") or check_cmd_line_flag("?"): print("Usage device=n (n >= 0 for deviceID)", file=sys.stderr) print(" wA=WidthA hA=HeightA (Width x Height of Matrix A)", file=sys.stderr) print(" wB=WidthB hB=HeightB (Width x Height of Matrix B)", file=sys.stderr) @@ -1149,54 +1155,54 @@ def main(): # This will pick the best possible CUDA capable device, otherwise # override the device ID based on input provided at the command line - devID = findCudaDevice() + dev_id = find_cuda_device() - matrixBlock = 32 - dimsA = cudart.dim3() - dimsA.x = dimsA.y = 10 * 4 * matrixBlock - dimsA.z = 1 - dimsB = cudart.dim3() - dimsB.x = dimsB.y = 10 * 4 * matrixBlock - dimsB.z = 1 + matrix_block = 32 + dims_a = cudart.dim3() + dims_a.x = dims_a.y = 10 * 4 * matrix_block + dims_a.z = 1 + dims_b = cudart.dim3() + dims_b.x = dims_b.y = 10 * 4 * matrix_block + dims_b.z = 1 # width of Matrix A - if checkCmdLineFlag("wA="): - dimsA.x = int(getCmdLineArgumentInt("wA=")) + if check_cmd_line_flag("wA="): + dims_a.x = int(get_cmd_line_argument_int("wA=")) # height of Matrix A - if checkCmdLineFlag("hA="): - dimsA.y = int(getCmdLineArgumentInt("hA=")) + if check_cmd_line_flag("hA="): + dims_a.y = int(get_cmd_line_argument_int("hA=")) # width of Matrix B - if checkCmdLineFlag("wB="): - dimsB.x = int(getCmdLineArgumentInt("wB=")) + if check_cmd_line_flag("wB="): + dims_b.x = 
int(get_cmd_line_argument_int("wB=")) # height of Matrix B - if checkCmdLineFlag("hB="): - dimsB.y = int(getCmdLineArgumentInt("hB=")) + if check_cmd_line_flag("hB="): + dims_b.y = int(get_cmd_line_argument_int("hB=")) - if dimsA.x != dimsB.y: - print(f"Error: outer matrix dimensions must be equal. ({dimsA.x} != {dimsB.y})", file=sys.stderr) + if dims_a.x != dims_b.y: + print(f"Error: outer matrix dimensions must be equal. ({dims_a.x} != {dims_b.y})", file=sys.stderr) sys.exit(1) - selected_kernel = kernels.AsyncCopyMultiStageLargeChunk + selected_kernel = Kernels.AsyncCopyMultiStageLargeChunk # kernel to run - default (AsyncCopyMultiStageLargeChunk == 0) - if checkCmdLineFlag("kernel="): - kernel_number = int(getCmdLineArgumentInt("kernel=")) + if check_cmd_line_flag("kernel="): + kernel_number = int(get_cmd_line_argument_int("kernel=")) if kernel_number < 8: - selected_kernel = kernels(kernel_number) + selected_kernel = Kernels(kernel_number) else: print("Error: kernel number should be between 0 to 7", file=sys.stderr) sys.exit(1) - major = checkCudaErrors( - cudart.cudaDeviceGetAttribute(cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMajor, devID) + major = check_cuda_errors( + cudart.cudaDeviceGetAttribute(cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMajor, dev_id) ) if major < 7: pytest.skip("globalToShmemAsyncCopy requires SM 7.0 or higher.") - print(f"MatrixA({dimsA.x},{dimsA.y}), MatrixB({dimsB.x},{dimsB.y})") + print(f"MatrixA({dims_a.x},{dims_a.y}), MatrixB({dims_b.x},{dims_b.y})") global _MatrixMulAsyncCopyMultiStageLargeChunk global _MatrixMulAsyncCopyLargeChunk @@ -1206,17 +1212,17 @@ def main(): global _MatrixMulAsyncCopySingleStage global _MatrixMulNaive global _MatrixMulNaiveLargeChunk - with common.KernelHelper(globalToShmemAsyncCopy, devID) as kernelHelper: - _MatrixMulAsyncCopyMultiStageLargeChunk = kernelHelper.getFunction(b"MatrixMulAsyncCopyMultiStageLargeChunk") - _MatrixMulAsyncCopyLargeChunk = 
kernelHelper.getFunction(b"MatrixMulAsyncCopyLargeChunk") - _MatrixMulAsyncCopyLargeChunkAWBarrier = kernelHelper.getFunction(b"MatrixMulAsyncCopyLargeChunkAWBarrier") - _MatrixMulAsyncCopyMultiStageSharedState = kernelHelper.getFunction(b"MatrixMulAsyncCopyMultiStageSharedState") - _MatrixMulAsyncCopyMultiStage = kernelHelper.getFunction(b"MatrixMulAsyncCopyMultiStage") - _MatrixMulAsyncCopySingleStage = kernelHelper.getFunction(b"MatrixMulAsyncCopySingleStage") - _MatrixMulNaive = kernelHelper.getFunction(b"MatrixMulNaive") - _MatrixMulNaiveLargeChunk = kernelHelper.getFunction(b"MatrixMulNaiveLargeChunk") - - matrix_result = MatrixMultiply(dimsA, dimsB, selected_kernel) + kernel_helper = common.KernelHelper(global_to_shmem_async_copy, dev_id) + _MatrixMulAsyncCopyMultiStageLargeChunk = kernel_helper.get_function(b"MatrixMulAsyncCopyMultiStageLargeChunk") + _MatrixMulAsyncCopyLargeChunk = kernel_helper.get_function(b"MatrixMulAsyncCopyLargeChunk") + _MatrixMulAsyncCopyLargeChunkAWBarrier = kernel_helper.get_function(b"MatrixMulAsyncCopyLargeChunkAWBarrier") + _MatrixMulAsyncCopyMultiStageSharedState = kernel_helper.get_function(b"MatrixMulAsyncCopyMultiStageSharedState") + _MatrixMulAsyncCopyMultiStage = kernel_helper.get_function(b"MatrixMulAsyncCopyMultiStage") + _MatrixMulAsyncCopySingleStage = kernel_helper.get_function(b"MatrixMulAsyncCopySingleStage") + _MatrixMulNaive = kernel_helper.get_function(b"MatrixMulNaive") + _MatrixMulNaiveLargeChunk = kernel_helper.get_function(b"MatrixMulNaiveLargeChunk") + + matrix_result = matrix_multiply(dims_a, dims_b, selected_kernel) if matrix_result != 0: sys.exit(1) diff --git a/cuda_bindings/examples/3_CUDA_Features/simpleCudaGraphs_test.py b/cuda_bindings/examples/3_CUDA_Features/simpleCudaGraphs_test.py index 7746bd08e3..b08da3edc0 100644 --- a/cuda_bindings/examples/3_CUDA_Features/simpleCudaGraphs_test.py +++ b/cuda_bindings/examples/3_CUDA_Features/simpleCudaGraphs_test.py @@ -6,7 +6,7 @@ import numpy as np from 
common import common -from common.helper_cuda import checkCudaErrors, findCudaDevice +from common.helper_cuda import check_cuda_errors, find_cuda_device from cuda.bindings import driver as cuda from cuda.bindings import runtime as cudart @@ -14,7 +14,7 @@ THREADS_PER_BLOCK = 512 GRAPH_LAUNCH_ITERATIONS = 3 -simpleCudaGraphs = """\ +simple_cuda_graphs = """\ #include #include @@ -121,185 +121,185 @@ def init_input(a, size): a_list[i] = rnd.random() -def cudaGraphsManual(inputVec_h, inputVec_d, outputVec_d, result_d, inputSize, numOfBlocks): +def cuda_graphs_manual(input_vec_h, input_vec_d, output_vec_d, result_d, input_size, num_of_blocks): result_h = ctypes.c_double(0.0) - nodeDependencies = [] + node_dependencies = [] - streamForGraph = checkCudaErrors(cudart.cudaStreamCreate()) + stream_for_graph = check_cuda_errors(cudart.cudaStreamCreate()) - kernelNodeParams = cuda.CUDA_KERNEL_NODE_PARAMS() - memcpyParams = cudart.cudaMemcpy3DParms() - memsetParams = cudart.cudaMemsetParams() + kernel_node_params = cuda.CUDA_KERNEL_NODE_PARAMS() + memcpy_params = cudart.cudaMemcpy3DParms() + memset_params = cudart.cudaMemsetParams() - memcpyParams.srcArray = None - memcpyParams.srcPos = cudart.make_cudaPos(0, 0, 0) - memcpyParams.srcPtr = cudart.make_cudaPitchedPtr( - inputVec_h, np.dtype(np.float32).itemsize * inputSize, inputSize, 1 + memcpy_params.srcArray = None + memcpy_params.srcPos = cudart.make_cudaPos(0, 0, 0) + memcpy_params.srcPtr = cudart.make_cudaPitchedPtr( + input_vec_h, np.dtype(np.float32).itemsize * input_size, input_size, 1 ) - memcpyParams.dstArray = None - memcpyParams.dstPos = cudart.make_cudaPos(0, 0, 0) - memcpyParams.dstPtr = cudart.make_cudaPitchedPtr( - inputVec_d, np.dtype(np.float32).itemsize * inputSize, inputSize, 1 + memcpy_params.dstArray = None + memcpy_params.dstPos = cudart.make_cudaPos(0, 0, 0) + memcpy_params.dstPtr = cudart.make_cudaPitchedPtr( + input_vec_d, np.dtype(np.float32).itemsize * input_size, input_size, 1 ) - 
memcpyParams.extent = cudart.make_cudaExtent(np.dtype(np.float32).itemsize * inputSize, 1, 1) - memcpyParams.kind = cudart.cudaMemcpyKind.cudaMemcpyHostToDevice + memcpy_params.extent = cudart.make_cudaExtent(np.dtype(np.float32).itemsize * input_size, 1, 1) + memcpy_params.kind = cudart.cudaMemcpyKind.cudaMemcpyHostToDevice - memsetParams.dst = outputVec_d - memsetParams.value = 0 - memsetParams.pitch = 0 - memsetParams.elementSize = np.dtype(np.float32).itemsize # elementSize can be max 4 bytes - memsetParams.width = numOfBlocks * 2 - memsetParams.height = 1 + memset_params.dst = output_vec_d + memset_params.value = 0 + memset_params.pitch = 0 + memset_params.elementSize = np.dtype(np.float32).itemsize # elementSize can be max 4 bytes + memset_params.width = num_of_blocks * 2 + memset_params.height = 1 - graph = checkCudaErrors(cudart.cudaGraphCreate(0)) + graph = check_cuda_errors(cudart.cudaGraphCreate(0)) - memcpyNode = checkCudaErrors(cudart.cudaGraphAddMemcpyNode(graph, None, 0, memcpyParams)) - memsetNode = checkCudaErrors(cudart.cudaGraphAddMemsetNode(graph, None, 0, memsetParams)) + memcpy_node = check_cuda_errors(cudart.cudaGraphAddMemcpyNode(graph, None, 0, memcpy_params)) + memset_node = check_cuda_errors(cudart.cudaGraphAddMemsetNode(graph, None, 0, memset_params)) - nodeDependencies.append(memsetNode) - nodeDependencies.append(memcpyNode) + node_dependencies.append(memset_node) + node_dependencies.append(memcpy_node) - kernelArgs = ( - (inputVec_d, outputVec_d, inputSize, numOfBlocks), + kernel_args = ( + (input_vec_d, output_vec_d, input_size, num_of_blocks), (ctypes.c_void_p, ctypes.c_void_p, ctypes.c_size_t, ctypes.c_uint), ) - kernelNodeParams.func = _reduce - kernelNodeParams.gridDimX = numOfBlocks - kernelNodeParams.gridDimY = kernelNodeParams.gridDimZ = 1 - kernelNodeParams.blockDimX = THREADS_PER_BLOCK - kernelNodeParams.blockDimY = kernelNodeParams.blockDimZ = 1 - kernelNodeParams.sharedMemBytes = 0 - kernelNodeParams.kernelParams = 
kernelArgs + kernel_node_params.func = _reduce + kernel_node_params.gridDimX = num_of_blocks + kernel_node_params.gridDimY = kernel_node_params.gridDimZ = 1 + kernel_node_params.blockDimX = THREADS_PER_BLOCK + kernel_node_params.blockDimY = kernel_node_params.blockDimZ = 1 + kernel_node_params.sharedMemBytes = 0 + kernel_node_params.kernelParams = kernel_args # kernelNodeParams.extra = None - kernelNode = checkCudaErrors( - cuda.cuGraphAddKernelNode(graph, nodeDependencies, len(nodeDependencies), kernelNodeParams) + kernel_node = check_cuda_errors( + cuda.cuGraphAddKernelNode(graph, node_dependencies, len(node_dependencies), kernel_node_params) ) - nodeDependencies.clear() - nodeDependencies.append(kernelNode) - - memsetParams = cudart.cudaMemsetParams() - memsetParams.dst = result_d - memsetParams.value = 0 - memsetParams.elementSize = np.dtype(np.float32).itemsize - memsetParams.width = 2 - memsetParams.height = 1 - memsetNode = checkCudaErrors(cudart.cudaGraphAddMemsetNode(graph, None, 0, memsetParams)) - - nodeDependencies.append(memsetNode) - - kernelNodeParams = cuda.CUDA_KERNEL_NODE_PARAMS() - kernelNodeParams.func = _reduceFinal - kernelNodeParams.gridDimX = kernelNodeParams.gridDimY = kernelNodeParams.gridDimZ = 1 - kernelNodeParams.blockDimX = THREADS_PER_BLOCK - kernelNodeParams.blockDimY = kernelNodeParams.blockDimZ = 1 - kernelNodeParams.sharedMemBytes = 0 - kernelArgs2 = ( - (outputVec_d, result_d, numOfBlocks), + node_dependencies.clear() + node_dependencies.append(kernel_node) + + memset_params = cudart.cudaMemsetParams() + memset_params.dst = result_d + memset_params.value = 0 + memset_params.elementSize = np.dtype(np.float32).itemsize + memset_params.width = 2 + memset_params.height = 1 + memset_node = check_cuda_errors(cudart.cudaGraphAddMemsetNode(graph, None, 0, memset_params)) + + node_dependencies.append(memset_node) + + kernel_node_params = cuda.CUDA_KERNEL_NODE_PARAMS() + kernel_node_params.func = _reduceFinal + kernel_node_params.gridDimX 
= kernel_node_params.gridDimY = kernel_node_params.gridDimZ = 1 + kernel_node_params.blockDimX = THREADS_PER_BLOCK + kernel_node_params.blockDimY = kernel_node_params.blockDimZ = 1 + kernel_node_params.sharedMemBytes = 0 + kernel_args2 = ( + (output_vec_d, result_d, num_of_blocks), (ctypes.c_void_p, ctypes.c_void_p, ctypes.c_uint), ) - kernelNodeParams.kernelParams = kernelArgs2 + kernel_node_params.kernelParams = kernel_args2 # kernelNodeParams.extra = None - kernelNode = checkCudaErrors( - cuda.cuGraphAddKernelNode(graph, nodeDependencies, len(nodeDependencies), kernelNodeParams) + kernel_node = check_cuda_errors( + cuda.cuGraphAddKernelNode(graph, node_dependencies, len(node_dependencies), kernel_node_params) ) - nodeDependencies.clear() - nodeDependencies.append(kernelNode) - - memcpyParams = cudart.cudaMemcpy3DParms() - - memcpyParams.srcArray = None - memcpyParams.srcPos = cudart.make_cudaPos(0, 0, 0) - memcpyParams.srcPtr = cudart.make_cudaPitchedPtr(result_d, np.dtype(np.float64).itemsize, 1, 1) - memcpyParams.dstArray = None - memcpyParams.dstPos = cudart.make_cudaPos(0, 0, 0) - memcpyParams.dstPtr = cudart.make_cudaPitchedPtr(result_h, np.dtype(np.float64).itemsize, 1, 1) - memcpyParams.extent = cudart.make_cudaExtent(np.dtype(np.float64).itemsize, 1, 1) - memcpyParams.kind = cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost - memcpyNode = checkCudaErrors( - cudart.cudaGraphAddMemcpyNode(graph, nodeDependencies, len(nodeDependencies), memcpyParams) + node_dependencies.clear() + node_dependencies.append(kernel_node) + + memcpy_params = cudart.cudaMemcpy3DParms() + + memcpy_params.srcArray = None + memcpy_params.srcPos = cudart.make_cudaPos(0, 0, 0) + memcpy_params.srcPtr = cudart.make_cudaPitchedPtr(result_d, np.dtype(np.float64).itemsize, 1, 1) + memcpy_params.dstArray = None + memcpy_params.dstPos = cudart.make_cudaPos(0, 0, 0) + memcpy_params.dstPtr = cudart.make_cudaPitchedPtr(result_h, np.dtype(np.float64).itemsize, 1, 1) + memcpy_params.extent = 
cudart.make_cudaExtent(np.dtype(np.float64).itemsize, 1, 1) + memcpy_params.kind = cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost + memcpy_node = check_cuda_errors( + cudart.cudaGraphAddMemcpyNode(graph, node_dependencies, len(node_dependencies), memcpy_params) ) - nodeDependencies.clear() - nodeDependencies.append(memcpyNode) + node_dependencies.clear() + node_dependencies.append(memcpy_node) # WIP: Host nodes - nodes, numNodes = checkCudaErrors(cudart.cudaGraphGetNodes(graph)) - print(f"\nNum of nodes in the graph created manually = {numNodes}") + nodes, num_nodes = check_cuda_errors(cudart.cudaGraphGetNodes(graph)) + print(f"\nNum of nodes in the graph created manually = {num_nodes}") - graphExec = checkCudaErrors(cudart.cudaGraphInstantiate(graph, 0)) + graph_exec = check_cuda_errors(cudart.cudaGraphInstantiate(graph, 0)) - clonedGraph = checkCudaErrors(cudart.cudaGraphClone(graph)) - clonedGraphExec = checkCudaErrors(cudart.cudaGraphInstantiate(clonedGraph, 0)) + cloned_graph = check_cuda_errors(cudart.cudaGraphClone(graph)) + cloned_graph_exec = check_cuda_errors(cudart.cudaGraphInstantiate(cloned_graph, 0)) for _i in range(GRAPH_LAUNCH_ITERATIONS): - checkCudaErrors(cudart.cudaGraphLaunch(graphExec, streamForGraph)) + check_cuda_errors(cudart.cudaGraphLaunch(graph_exec, stream_for_graph)) - checkCudaErrors(cudart.cudaStreamSynchronize(streamForGraph)) + check_cuda_errors(cudart.cudaStreamSynchronize(stream_for_graph)) print("Cloned Graph Output..") for _i in range(GRAPH_LAUNCH_ITERATIONS): - checkCudaErrors(cudart.cudaGraphLaunch(clonedGraphExec, streamForGraph)) + check_cuda_errors(cudart.cudaGraphLaunch(cloned_graph_exec, stream_for_graph)) - checkCudaErrors(cudart.cudaStreamSynchronize(streamForGraph)) + check_cuda_errors(cudart.cudaStreamSynchronize(stream_for_graph)) - checkCudaErrors(cudart.cudaGraphExecDestroy(graphExec)) - checkCudaErrors(cudart.cudaGraphExecDestroy(clonedGraphExec)) - checkCudaErrors(cudart.cudaGraphDestroy(graph)) - 
checkCudaErrors(cudart.cudaGraphDestroy(clonedGraph)) - checkCudaErrors(cudart.cudaStreamDestroy(streamForGraph)) + check_cuda_errors(cudart.cudaGraphExecDestroy(graph_exec)) + check_cuda_errors(cudart.cudaGraphExecDestroy(cloned_graph_exec)) + check_cuda_errors(cudart.cudaGraphDestroy(graph)) + check_cuda_errors(cudart.cudaGraphDestroy(cloned_graph)) + check_cuda_errors(cudart.cudaStreamDestroy(stream_for_graph)) -def cudaGraphsUsingStreamCapture(inputVec_h, inputVec_d, outputVec_d, result_d, inputSize, numOfBlocks): +def cuda_graphs_using_stream_capture(input_vec_h, input_vec_d, output_vec_d, result_d, input_size, num_of_blocks): result_h = ctypes.c_double(0.0) - stream1 = checkCudaErrors(cudart.cudaStreamCreate()) - stream2 = checkCudaErrors(cudart.cudaStreamCreate()) - stream3 = checkCudaErrors(cudart.cudaStreamCreate()) - streamForGraph = checkCudaErrors(cudart.cudaStreamCreate()) + stream1 = check_cuda_errors(cudart.cudaStreamCreate()) + stream2 = check_cuda_errors(cudart.cudaStreamCreate()) + stream3 = check_cuda_errors(cudart.cudaStreamCreate()) + stream_for_graph = check_cuda_errors(cudart.cudaStreamCreate()) - forkStreamEvent = checkCudaErrors(cudart.cudaEventCreate()) - memsetEvent1 = checkCudaErrors(cudart.cudaEventCreate()) - memsetEvent2 = checkCudaErrors(cudart.cudaEventCreate()) + fork_stream_event = check_cuda_errors(cudart.cudaEventCreate()) + memset_event1 = check_cuda_errors(cudart.cudaEventCreate()) + memset_event2 = check_cuda_errors(cudart.cudaEventCreate()) - checkCudaErrors(cudart.cudaStreamBeginCapture(stream1, cudart.cudaStreamCaptureMode.cudaStreamCaptureModeGlobal)) + check_cuda_errors(cudart.cudaStreamBeginCapture(stream1, cudart.cudaStreamCaptureMode.cudaStreamCaptureModeGlobal)) - checkCudaErrors(cudart.cudaEventRecord(forkStreamEvent, stream1)) - checkCudaErrors(cudart.cudaStreamWaitEvent(stream2, forkStreamEvent, 0)) - checkCudaErrors(cudart.cudaStreamWaitEvent(stream3, forkStreamEvent, 0)) + 
check_cuda_errors(cudart.cudaEventRecord(fork_stream_event, stream1)) + check_cuda_errors(cudart.cudaStreamWaitEvent(stream2, fork_stream_event, 0)) + check_cuda_errors(cudart.cudaStreamWaitEvent(stream3, fork_stream_event, 0)) - checkCudaErrors( + check_cuda_errors( cudart.cudaMemcpyAsync( - inputVec_d, - inputVec_h, - np.dtype(np.float32).itemsize * inputSize, + input_vec_d, + input_vec_h, + np.dtype(np.float32).itemsize * input_size, cudart.cudaMemcpyKind.cudaMemcpyDefault, stream1, ) ) - checkCudaErrors(cudart.cudaMemsetAsync(outputVec_d, 0, np.dtype(np.float64).itemsize * numOfBlocks, stream2)) + check_cuda_errors(cudart.cudaMemsetAsync(output_vec_d, 0, np.dtype(np.float64).itemsize * num_of_blocks, stream2)) - checkCudaErrors(cudart.cudaEventRecord(memsetEvent1, stream2)) + check_cuda_errors(cudart.cudaEventRecord(memset_event1, stream2)) - checkCudaErrors(cudart.cudaMemsetAsync(result_d, 0, np.dtype(np.float64).itemsize, stream3)) - checkCudaErrors(cudart.cudaEventRecord(memsetEvent2, stream3)) + check_cuda_errors(cudart.cudaMemsetAsync(result_d, 0, np.dtype(np.float64).itemsize, stream3)) + check_cuda_errors(cudart.cudaEventRecord(memset_event2, stream3)) - checkCudaErrors(cudart.cudaStreamWaitEvent(stream1, memsetEvent1, 0)) + check_cuda_errors(cudart.cudaStreamWaitEvent(stream1, memset_event1, 0)) - kernelArgs = ( - (inputVec_d, outputVec_d, inputSize, numOfBlocks), + kernel_args = ( + (input_vec_d, output_vec_d, input_size, num_of_blocks), (ctypes.c_void_p, ctypes.c_void_p, ctypes.c_size_t, ctypes.c_uint), ) - checkCudaErrors( + check_cuda_errors( cuda.cuLaunchKernel( _reduce, - numOfBlocks, + num_of_blocks, 1, 1, THREADS_PER_BLOCK, @@ -307,20 +307,20 @@ def cudaGraphsUsingStreamCapture(inputVec_h, inputVec_d, outputVec_d, result_d, 1, 0, stream1, - kernelArgs, + kernel_args, 0, ) ) - checkCudaErrors(cudart.cudaStreamWaitEvent(stream1, memsetEvent2, 0)) + check_cuda_errors(cudart.cudaStreamWaitEvent(stream1, memset_event2, 0)) - kernelArgs2 = ( - 
(outputVec_d, result_d, numOfBlocks), + kernel_args2 = ( + (output_vec_d, result_d, num_of_blocks), (ctypes.c_void_p, ctypes.c_void_p, ctypes.c_uint), ) - checkCudaErrors(cuda.cuLaunchKernel(_reduceFinal, 1, 1, 1, THREADS_PER_BLOCK, 1, 1, 0, stream1, kernelArgs2, 0)) + check_cuda_errors(cuda.cuLaunchKernel(_reduceFinal, 1, 1, 1, THREADS_PER_BLOCK, 1, 1, 0, stream1, kernel_args2, 0)) - checkCudaErrors( + check_cuda_errors( cudart.cudaMemcpyAsync( result_h, result_d, @@ -332,71 +332,67 @@ def cudaGraphsUsingStreamCapture(inputVec_h, inputVec_d, outputVec_d, result_d, # WIP: Host nodes - graph = checkCudaErrors(cudart.cudaStreamEndCapture(stream1)) + graph = check_cuda_errors(cudart.cudaStreamEndCapture(stream1)) - nodes, numNodes = checkCudaErrors(cudart.cudaGraphGetNodes(graph)) - print(f"\nNum of nodes in the graph created using stream capture API = {numNodes}") + nodes, num_nodes = check_cuda_errors(cudart.cudaGraphGetNodes(graph)) + print(f"\nNum of nodes in the graph created using stream capture API = {num_nodes}") - graphExec = checkCudaErrors(cudart.cudaGraphInstantiate(graph, 0)) + graph_exec = check_cuda_errors(cudart.cudaGraphInstantiate(graph, 0)) - clonedGraph = checkCudaErrors(cudart.cudaGraphClone(graph)) - clonedGraphExec = checkCudaErrors(cudart.cudaGraphInstantiate(clonedGraph, 0)) + cloned_graph = check_cuda_errors(cudart.cudaGraphClone(graph)) + cloned_graph_exec = check_cuda_errors(cudart.cudaGraphInstantiate(cloned_graph, 0)) for _i in range(GRAPH_LAUNCH_ITERATIONS): - checkCudaErrors(cudart.cudaGraphLaunch(graphExec, streamForGraph)) + check_cuda_errors(cudart.cudaGraphLaunch(graph_exec, stream_for_graph)) - checkCudaErrors(cudart.cudaStreamSynchronize(streamForGraph)) + check_cuda_errors(cudart.cudaStreamSynchronize(stream_for_graph)) print("Cloned Graph Output..") for _i in range(GRAPH_LAUNCH_ITERATIONS): - checkCudaErrors(cudart.cudaGraphLaunch(clonedGraphExec, streamForGraph)) + check_cuda_errors(cudart.cudaGraphLaunch(cloned_graph_exec, 
stream_for_graph)) - checkCudaErrors(cudart.cudaStreamSynchronize(streamForGraph)) + check_cuda_errors(cudart.cudaStreamSynchronize(stream_for_graph)) - checkCudaErrors(cudart.cudaGraphExecDestroy(graphExec)) - checkCudaErrors(cudart.cudaGraphExecDestroy(clonedGraphExec)) - checkCudaErrors(cudart.cudaGraphDestroy(graph)) - checkCudaErrors(cudart.cudaGraphDestroy(clonedGraph)) - checkCudaErrors(cudart.cudaEventDestroy(memsetEvent2)) - checkCudaErrors(cudart.cudaEventDestroy(memsetEvent1)) - checkCudaErrors(cudart.cudaEventDestroy(forkStreamEvent)) - checkCudaErrors(cudart.cudaStreamDestroy(stream3)) - checkCudaErrors(cudart.cudaStreamDestroy(stream1)) - checkCudaErrors(cudart.cudaStreamDestroy(stream2)) - checkCudaErrors(cudart.cudaStreamDestroy(streamForGraph)) + check_cuda_errors(cudart.cudaGraphExecDestroy(graph_exec)) + check_cuda_errors(cudart.cudaGraphExecDestroy(cloned_graph_exec)) + check_cuda_errors(cudart.cudaGraphDestroy(graph)) + check_cuda_errors(cudart.cudaGraphDestroy(cloned_graph)) + check_cuda_errors(cudart.cudaStreamDestroy(stream1)) + check_cuda_errors(cudart.cudaStreamDestroy(stream2)) + check_cuda_errors(cudart.cudaStreamDestroy(stream_for_graph)) def main(): size = 1 << 24 # number of elements to reduce - maxBlocks = 512 + max_blocks = 512 # This will pick the best possible CUDA capable device - devID = findCudaDevice() + dev_id = find_cuda_device() global _reduce global _reduceFinal - with common.KernelHelper(simpleCudaGraphs, devID) as kernelHelper: - _reduce = kernelHelper.getFunction(b"reduce") - _reduceFinal = kernelHelper.getFunction(b"reduceFinal") + kernel_helper = common.KernelHelper(simple_cuda_graphs, dev_id) + _reduce = kernel_helper.get_function(b"reduce") + _reduceFinal = kernel_helper.get_function(b"reduceFinal") - print(f"{size} elements") - print(f"threads per block = {THREADS_PER_BLOCK}") - print(f"Graph Launch iterations = {GRAPH_LAUNCH_ITERATIONS}") + print(f"{size} elements") + print(f"threads per block = 
{THREADS_PER_BLOCK}") + print(f"Graph Launch iterations = {GRAPH_LAUNCH_ITERATIONS}") - inputVec_h = checkCudaErrors(cudart.cudaMallocHost(size * np.dtype(np.float32).itemsize)) - inputVec_d = checkCudaErrors(cudart.cudaMalloc(size * np.dtype(np.float32).itemsize)) - outputVec_d = checkCudaErrors(cudart.cudaMalloc(maxBlocks * np.dtype(np.float64).itemsize)) - result_d = checkCudaErrors(cudart.cudaMalloc(np.dtype(np.float64).itemsize)) + input_vec_h = check_cuda_errors(cudart.cudaMallocHost(size * np.dtype(np.float32).itemsize)) + input_vec_d = check_cuda_errors(cudart.cudaMalloc(size * np.dtype(np.float32).itemsize)) + output_vec_d = check_cuda_errors(cudart.cudaMalloc(max_blocks * np.dtype(np.float64).itemsize)) + result_d = check_cuda_errors(cudart.cudaMalloc(np.dtype(np.float64).itemsize)) - init_input(inputVec_h, size) + init_input(input_vec_h, size) - cudaGraphsManual(inputVec_h, inputVec_d, outputVec_d, result_d, size, maxBlocks) - cudaGraphsUsingStreamCapture(inputVec_h, inputVec_d, outputVec_d, result_d, size, maxBlocks) + cuda_graphs_manual(input_vec_h, input_vec_d, output_vec_d, result_d, size, max_blocks) + cuda_graphs_using_stream_capture(input_vec_h, input_vec_d, output_vec_d, result_d, size, max_blocks) - checkCudaErrors(cudart.cudaFree(inputVec_d)) - checkCudaErrors(cudart.cudaFree(outputVec_d)) - checkCudaErrors(cudart.cudaFree(result_d)) - checkCudaErrors(cudart.cudaFreeHost(inputVec_h)) + check_cuda_errors(cudart.cudaFree(input_vec_d)) + check_cuda_errors(cudart.cudaFree(output_vec_d)) + check_cuda_errors(cudart.cudaFree(result_d)) + check_cuda_errors(cudart.cudaFreeHost(input_vec_h)) if __name__ == "__main__": diff --git a/cuda_bindings/examples/4_CUDA_Libraries/conjugateGradientMultiBlockCG_test.py b/cuda_bindings/examples/4_CUDA_Libraries/conjugateGradientMultiBlockCG_test.py index 257a7afa14..8ef5506257 100644 --- a/cuda_bindings/examples/4_CUDA_Libraries/conjugateGradientMultiBlockCG_test.py +++ 
b/cuda_bindings/examples/4_CUDA_Libraries/conjugateGradientMultiBlockCG_test.py @@ -9,12 +9,12 @@ import numpy as np from common import common -from common.helper_cuda import checkCudaErrors, findCudaDevice +from common.helper_cuda import check_cuda_errors, find_cuda_device from cuda.bindings import driver as cuda from cuda.bindings import runtime as cudart -conjugateGradientMultiBlockCG = """\ +conjugate_gradient_multi_block_cg = """\ #line __LINE__ #include #include @@ -163,37 +163,37 @@ """ -def genTridiag(I, J, val, N, nz): - I[0] = 0 - J[0] = 0 - J[1] = 0 +def gen_tridiag(i, j, val, n, nz): + i[0] = 0 + j[0] = 0 + j[1] = 0 val[0] = float(random()) + 10.0 val[1] = float(random()) - for i in range(1, N): + for i in range(1, n): if i > 1: - I[i] = I[i - 1] + 3 + i[i] = i[i - 1] + 3 else: - I[1] = 2 + i[1] = 2 start = (i - 1) * 3 + 2 - J[start] = i - 1 - J[start + 1] = i + j[start] = i - 1 + j[start + 1] = i - if i < N - 1: - J[start + 2] = i + 1 + if i < n - 1: + j[start + 2] = i + 1 val[start] = val[start - 1] val[start + 1] = float(random()) + 10.0 - if i < N - 1: + if i < n - 1: val[start + 2] = float(random()) - I[N] = nz + i[n] = nz THREADS_PER_BLOCK = 512 -sSDKname = "conjugateGradientMultiBlockCG" +s_sd_kname = "conjugateGradientMultiBlockCG" def main(): @@ -214,139 +214,137 @@ def main(): pytest.skip("conjugateGradientMultiBlockCG is not supported on QNX") # This will pick the best possible CUDA capable device - devID = findCudaDevice() - deviceProp = checkCudaErrors(cudart.cudaGetDeviceProperties(devID)) + dev_id = find_cuda_device() + device_prop = check_cuda_errors(cudart.cudaGetDeviceProperties(dev_id)) - if not deviceProp.managedMemory: + if not device_prop.managedMemory: pytest.skip("Unified Memory not supported on this device") # This sample requires being run on a device that supports Cooperative Kernel # Launch - if not deviceProp.cooperativeLaunch: - pytest.skip(f"Selected GPU {devID} does not support Cooperative Kernel Launch") + if not 
device_prop.cooperativeLaunch: + pytest.skip(f"Selected GPU {dev_id} does not support Cooperative Kernel Launch") # Statistics about the GPU device print( - f"> GPU device has {deviceProp.multiProcessorCount:%d} Multi-Processors, SM {deviceProp.major:%d}.{deviceProp.minor:%d} compute capabilities\n" + f"> GPU device has {device_prop.multiProcessorCount:%d} Multi-Processors, SM {device_prop.major:%d}.{device_prop.minor:%d} compute capabilities\n" ) # Get kernel - with common.KernelHelper(conjugateGradientMultiBlockCG, devID) as kernelHelper: - _gpuConjugateGradient = kernelHelper.getFunction(b"gpuConjugateGradient") - - # Generate a random tridiagonal symmetric matrix in CSR format - N = 1048576 - nz = (N - 2) * 3 + 4 - - I = checkCudaErrors(cudart.cudaMallocManaged(np.dtype(np.int32).itemsize * (N + 1), cudart.cudaMemAttachGlobal)) - J = checkCudaErrors(cudart.cudaMallocManaged(np.dtype(np.int32).itemsize * nz, cudart.cudaMemAttachGlobal)) - val = checkCudaErrors(cudart.cudaMallocManaged(np.dtype(np.float32).itemsize * nz, cudart.cudaMemAttachGlobal)) - I_local = (ctypes.c_int * (N + 1)).from_address(I) - J_local = (ctypes.c_int * nz).from_address(J) - val_local = (ctypes.c_float * nz).from_address(val) - - genTridiag(I_local, J_local, val_local, N, nz) - - x = checkCudaErrors(cudart.cudaMallocManaged(np.dtype(np.float32).itemsize * N, cudart.cudaMemAttachGlobal)) - rhs = checkCudaErrors(cudart.cudaMallocManaged(np.dtype(np.float32).itemsize * N, cudart.cudaMemAttachGlobal)) - dot_result = checkCudaErrors( - cudart.cudaMallocManaged(np.dtype(np.float64).itemsize, cudart.cudaMemAttachGlobal) - ) - x_local = (ctypes.c_float * N).from_address(x) - rhs_local = (ctypes.c_float * N).from_address(rhs) - dot_result_local = (ctypes.c_double).from_address(dot_result) - dot_result_local = 0 - - # temp memory for CG - r = checkCudaErrors(cudart.cudaMallocManaged(np.dtype(np.float32).itemsize * N, cudart.cudaMemAttachGlobal)) - p = 
checkCudaErrors(cudart.cudaMallocManaged(np.dtype(np.float32).itemsize * N, cudart.cudaMemAttachGlobal)) - Ax = checkCudaErrors(cudart.cudaMallocManaged(np.dtype(np.float32).itemsize * N, cudart.cudaMemAttachGlobal)) - r_local = (ctypes.c_float * N).from_address(r) - - checkCudaErrors(cudart.cudaDeviceSynchronize()) - - start = checkCudaErrors(cudart.cudaEventCreate()) - stop = checkCudaErrors(cudart.cudaEventCreate()) - - for i in range(N): - r_local[i] = rhs_local[i] = 1.0 - x_local[i] = 0.0 - - kernelArgs_value = (I, J, val, x, Ax, p, r, dot_result, nz, N, tol) - kernelArgs_types = ( - ctypes.c_void_p, - ctypes.c_void_p, - ctypes.c_void_p, - ctypes.c_void_p, - ctypes.c_void_p, - ctypes.c_void_p, - ctypes.c_void_p, - ctypes.c_void_p, - ctypes.c_int, - ctypes.c_int, - ctypes.c_float, - ) - kernelArgs = (kernelArgs_value, kernelArgs_types) + kernel_helper = common.KernelHelper(conjugate_gradient_multi_block_cg, dev_id) + _gpu_conjugate_gradient = kernel_helper.get_function(b"gpuConjugateGradient") + + # Generate a random tridiagonal symmetric matrix in CSR format + n = 1048576 + nz = (n - 2) * 3 + 4 + + i = check_cuda_errors(cudart.cudaMallocManaged(np.dtype(np.int32).itemsize * (n + 1), cudart.cudaMemAttachGlobal)) + j = check_cuda_errors(cudart.cudaMallocManaged(np.dtype(np.int32).itemsize * nz, cudart.cudaMemAttachGlobal)) + val = check_cuda_errors(cudart.cudaMallocManaged(np.dtype(np.float32).itemsize * nz, cudart.cudaMemAttachGlobal)) + i_local = (ctypes.c_int * (n + 1)).from_address(i) + j_local = (ctypes.c_int * nz).from_address(j) + val_local = (ctypes.c_float * nz).from_address(val) + + gen_tridiag(i_local, j_local, val_local, n, nz) + + x = check_cuda_errors(cudart.cudaMallocManaged(np.dtype(np.float32).itemsize * n, cudart.cudaMemAttachGlobal)) + rhs = check_cuda_errors(cudart.cudaMallocManaged(np.dtype(np.float32).itemsize * n, cudart.cudaMemAttachGlobal)) + dot_result = check_cuda_errors(cudart.cudaMallocManaged(np.dtype(np.float64).itemsize, 
cudart.cudaMemAttachGlobal)) + x_local = (ctypes.c_float * n).from_address(x) + rhs_local = (ctypes.c_float * n).from_address(rhs) + dot_result_local = (ctypes.c_double).from_address(dot_result) + dot_result_local = 0 + + # temp memory for CG + r = check_cuda_errors(cudart.cudaMallocManaged(np.dtype(np.float32).itemsize * n, cudart.cudaMemAttachGlobal)) + p = check_cuda_errors(cudart.cudaMallocManaged(np.dtype(np.float32).itemsize * n, cudart.cudaMemAttachGlobal)) + ax = check_cuda_errors(cudart.cudaMallocManaged(np.dtype(np.float32).itemsize * n, cudart.cudaMemAttachGlobal)) + r_local = (ctypes.c_float * n).from_address(r) + + check_cuda_errors(cudart.cudaDeviceSynchronize()) + + start = check_cuda_errors(cudart.cudaEventCreate()) + stop = check_cuda_errors(cudart.cudaEventCreate()) + + for i in range(n): + r_local[i] = rhs_local[i] = 1.0 + x_local[i] = 0.0 + + kernel_args_value = (i, j, val, x, ax, p, r, dot_result, nz, n, tol) + kernel_args_types = ( + ctypes.c_void_p, + ctypes.c_void_p, + ctypes.c_void_p, + ctypes.c_void_p, + ctypes.c_void_p, + ctypes.c_void_p, + ctypes.c_void_p, + ctypes.c_void_p, + ctypes.c_int, + ctypes.c_int, + ctypes.c_float, + ) + kernel_args = (kernel_args_value, kernel_args_types) - sMemSize = np.dtype(np.float64).itemsize * ((THREADS_PER_BLOCK / 32) + 1) - numThreads = THREADS_PER_BLOCK - numBlocksPerSm = checkCudaErrors( - cuda.cuOccupancyMaxActiveBlocksPerMultiprocessor(_gpuConjugateGradient, numThreads, sMemSize) - ) - numSms = deviceProp.multiProcessorCount - dimGrid = cudart.dim3() - dimGrid.x = numSms * numBlocksPerSm - dimGrid.y = 1 - dimGrid.z = 1 - dimBlock = cudart.dim3() - dimBlock.x = THREADS_PER_BLOCK - dimBlock.y = 1 - dimBlock.z = 1 - - checkCudaErrors(cudart.cudaEventRecord(start, 0)) - checkCudaErrors( - cuda.cuLaunchCooperativeKernel( - _gpuConjugateGradient, - dimGrid.x, - dimGrid.y, - dimGrid.z, - dimBlock.x, - dimBlock.y, - dimBlock.z, - 0, - 0, - kernelArgs, - ) + s_mem_size = np.dtype(np.float64).itemsize * 
((THREADS_PER_BLOCK / 32) + 1) + num_threads = THREADS_PER_BLOCK + num_blocks_per_sm = check_cuda_errors( + cuda.cuOccupancyMaxActiveBlocksPerMultiprocessor(_gpu_conjugate_gradient, num_threads, s_mem_size) + ) + num_sms = device_prop.multiProcessorCount + dim_grid = cudart.dim3() + dim_grid.x = num_sms * num_blocks_per_sm + dim_grid.y = 1 + dim_grid.z = 1 + dim_block = cudart.dim3() + dim_block.x = THREADS_PER_BLOCK + dim_block.y = 1 + dim_block.z = 1 + + check_cuda_errors(cudart.cudaEventRecord(start, 0)) + check_cuda_errors( + cuda.cuLaunchCooperativeKernel( + _gpu_conjugate_gradient, + dim_grid.x, + dim_grid.y, + dim_grid.z, + dim_block.x, + dim_block.y, + dim_block.z, + 0, + 0, + kernel_args, ) - checkCudaErrors(cudart.cudaEventRecord(stop, 0)) - checkCudaErrors(cudart.cudaDeviceSynchronize()) - - time = checkCudaErrors(cudart.cudaEventElapsedTime(start, stop)) - print(f"GPU Final, residual = {math.sqrt(dot_result_local):e}, kernel execution time = {time:f} ms") - - err = 0.0 - for i in range(N): - rsum = 0.0 - - for j in range(I_local[i], I_local[i + 1]): - rsum += val_local[j] * x_local[J_local[j]] - - diff = math.fabs(rsum - rhs_local[i]) - - if diff > err: - err = diff - - checkCudaErrors(cudart.cudaFree(I)) - checkCudaErrors(cudart.cudaFree(J)) - checkCudaErrors(cudart.cudaFree(val)) - checkCudaErrors(cudart.cudaFree(x)) - checkCudaErrors(cudart.cudaFree(rhs)) - checkCudaErrors(cudart.cudaFree(r)) - checkCudaErrors(cudart.cudaFree(p)) - checkCudaErrors(cudart.cudaFree(Ax)) - checkCudaErrors(cudart.cudaFree(dot_result)) - checkCudaErrors(cudart.cudaEventDestroy(start)) - checkCudaErrors(cudart.cudaEventDestroy(stop)) + ) + check_cuda_errors(cudart.cudaEventRecord(stop, 0)) + check_cuda_errors(cudart.cudaDeviceSynchronize()) + + time = check_cuda_errors(cudart.cudaEventElapsedTime(start, stop)) + print(f"GPU Final, residual = {math.sqrt(dot_result_local):e}, kernel execution time = {time:f} ms") + + err = 0.0 + for i in range(n): + rsum = 0.0 + + for j in 
range(i_local[i], i_local[i + 1]): + rsum += val_local[j] * x_local[j_local[j]] + + diff = math.fabs(rsum - rhs_local[i]) + + if diff > err: + err = diff + + check_cuda_errors(cudart.cudaFree(i)) + check_cuda_errors(cudart.cudaFree(j)) + check_cuda_errors(cudart.cudaFree(val)) + check_cuda_errors(cudart.cudaFree(x)) + check_cuda_errors(cudart.cudaFree(rhs)) + check_cuda_errors(cudart.cudaFree(r)) + check_cuda_errors(cudart.cudaFree(p)) + check_cuda_errors(cudart.cudaFree(ax)) + check_cuda_errors(cudart.cudaFree(dot_result)) + check_cuda_errors(cudart.cudaEventDestroy(start)) + check_cuda_errors(cudart.cudaEventDestroy(stop)) print(f"Test Summary: Error amount = {err:f}") if math.sqrt(dot_result_local) >= tol: diff --git a/cuda_bindings/examples/common/common.py b/cuda_bindings/examples/common/common.py index 8723abe26a..5b5151ef24 100644 --- a/cuda_bindings/examples/common/common.py +++ b/cuda_bindings/examples/common/common.py @@ -2,10 +2,8 @@ # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE -from contextlib import suppress - import numpy as np -from common.helper_cuda import checkCudaErrors +from common.helper_cuda import check_cuda_errors from cuda import pathfinder from cuda.bindings import driver as cuda @@ -13,14 +11,14 @@ from cuda.bindings import runtime as cudart -def pytest_skipif_compute_capability_too_low(devID, required_cc_major_minor): +def pytest_skipif_compute_capability_too_low(dev_id, required_cc_major_minor): import pytest - cc_major = checkCudaErrors( - cudart.cudaDeviceGetAttribute(cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMajor, devID) + cc_major = check_cuda_errors( + cudart.cudaDeviceGetAttribute(cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMajor, dev_id) ) - cc_minor = checkCudaErrors( - cudart.cudaDeviceGetAttribute(cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMinor, devID) + cc_minor = check_cuda_errors( + cudart.cudaDeviceGetAttribute(cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMinor, dev_id) ) 
have_cc_major_minor = (cc_major, cc_minor) if have_cc_major_minor < required_cc_major_minor: @@ -28,8 +26,7 @@ def pytest_skipif_compute_capability_too_low(devID, required_cc_major_minor): class KernelHelper: - def __init__(self, code, devID): - self.module = None + def __init__(self, code, dev_id): include_dirs = [] for libname in ("cudart", "cccl"): hdr_dir = pathfinder.find_nvidia_header_directory(libname) @@ -39,18 +36,18 @@ def __init__(self, code, devID): pytest.skip(f'pathfinder.find_nvidia_header_directory("{libname}") returned None') include_dirs.append(hdr_dir) - prog = checkCudaErrors(nvrtc.nvrtcCreateProgram(str.encode(code), b"sourceCode.cu", 0, None, None)) + prog = check_cuda_errors(nvrtc.nvrtcCreateProgram(str.encode(code), b"sourceCode.cu", 0, None, None)) # Initialize CUDA - checkCudaErrors(cudart.cudaFree(0)) + check_cuda_errors(cudart.cudaFree(0)) - major = checkCudaErrors( - cudart.cudaDeviceGetAttribute(cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMajor, devID) + major = check_cuda_errors( + cudart.cudaDeviceGetAttribute(cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMajor, dev_id) ) - minor = checkCudaErrors( - cudart.cudaDeviceGetAttribute(cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMinor, devID) + minor = check_cuda_errors( + cudart.cudaDeviceGetAttribute(cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMinor, dev_id) ) - _, nvrtc_minor = checkCudaErrors(nvrtc.nvrtcVersion()) + _, nvrtc_minor = check_cuda_errors(nvrtc.nvrtcVersion()) use_cubin = nvrtc_minor >= 1 prefix = "sm" if use_cubin else "compute" arch_arg = bytes(f"--gpu-architecture={prefix}_{major}{minor}", "ascii") @@ -65,44 +62,27 @@ def __init__(self, code, devID): opts.append(f"--include-path={inc_dir}".encode()) try: - checkCudaErrors(nvrtc.nvrtcCompileProgram(prog, len(opts), opts)) - - if use_cubin: - dataSize = checkCudaErrors(nvrtc.nvrtcGetCUBINSize(prog)) - data = b" " * dataSize - checkCudaErrors(nvrtc.nvrtcGetCUBIN(prog, data)) - else: - dataSize = 
checkCudaErrors(nvrtc.nvrtcGetPTXSize(prog)) - data = b" " * dataSize - checkCudaErrors(nvrtc.nvrtcGetPTX(prog, data)) + check_cuda_errors(nvrtc.nvrtcCompileProgram(prog, len(opts), opts)) except RuntimeError as err: - logSize = checkCudaErrors(nvrtc.nvrtcGetProgramLogSize(prog)) - log = b" " * logSize - checkCudaErrors(nvrtc.nvrtcGetProgramLog(prog, log)) + log_size = check_cuda_errors(nvrtc.nvrtcGetProgramLogSize(prog)) + log = b" " * log_size + check_cuda_errors(nvrtc.nvrtcGetProgramLog(prog, log)) import sys print(log.decode(), file=sys.stderr) print(err, file=sys.stderr) sys.exit(1) - finally: - checkCudaErrors(nvrtc.nvrtcDestroyProgram(prog)) - - self.module = checkCudaErrors(cuda.cuModuleLoadData(np.char.array(data))) - - def getFunction(self, name): - return checkCudaErrors(cuda.cuModuleGetFunction(self.module, name)) - - def close(self): - if self.module is not None: - checkCudaErrors(cuda.cuModuleUnload(self.module)) - self.module = None - def __enter__(self): - return self + if use_cubin: + data_size = check_cuda_errors(nvrtc.nvrtcGetCUBINSize(prog)) + data = b" " * data_size + check_cuda_errors(nvrtc.nvrtcGetCUBIN(prog, data)) + else: + data_size = check_cuda_errors(nvrtc.nvrtcGetPTXSize(prog)) + data = b" " * data_size + check_cuda_errors(nvrtc.nvrtcGetPTX(prog, data)) - def __exit__(self, exc_type, exc, tb): - self.close() + self.module = check_cuda_errors(cuda.cuModuleLoadData(np.char.array(data))) - def __del__(self): - with suppress(Exception): - self.close() + def get_function(self, name): + return check_cuda_errors(cuda.cuModuleGetFunction(self.module, name)) diff --git a/cuda_bindings/examples/common/helper_cuda.py b/cuda_bindings/examples/common/helper_cuda.py index d741eb54d9..9fbfe8c82f 100644 --- a/cuda_bindings/examples/common/helper_cuda.py +++ b/cuda_bindings/examples/common/helper_cuda.py @@ -1,14 +1,14 @@ # Copyright 2021-2025 NVIDIA Corporation. All rights reserved. 
# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE -from common.helper_string import checkCmdLineFlag, getCmdLineArgumentInt +from common.helper_string import check_cmd_line_flag, get_cmd_line_argument_int from cuda.bindings import driver as cuda from cuda.bindings import nvrtc from cuda.bindings import runtime as cudart -def _cudaGetErrorEnum(error): +def _cuda_get_error_enum(error): if isinstance(error, cuda.CUresult): err, name = cuda.cuGetErrorName(error) return name if err == cuda.CUresult.CUDA_SUCCESS else "" @@ -20,9 +20,9 @@ def _cudaGetErrorEnum(error): raise RuntimeError(f"Unknown error type: {error}") -def checkCudaErrors(result): +def check_cuda_errors(result): if result[0].value: - raise RuntimeError(f"CUDA error code={result[0].value}({_cudaGetErrorEnum(result[0])})") + raise RuntimeError(f"CUDA error code={result[0].value}({_cuda_get_error_enum(result[0])})") if len(result) == 1: return None elif len(result) == 2: @@ -31,18 +31,18 @@ def checkCudaErrors(result): return result[1:] -def findCudaDevice(): - devID = 0 - if checkCmdLineFlag("device="): - devID = getCmdLineArgumentInt("device=") - checkCudaErrors(cudart.cudaSetDevice(devID)) - return devID +def find_cuda_device(): + dev_id = 0 + if check_cmd_line_flag("device="): + dev_id = get_cmd_line_argument_int("device=") + check_cuda_errors(cudart.cudaSetDevice(dev_id)) + return dev_id -def findCudaDeviceDRV(): - devID = 0 - if checkCmdLineFlag("device="): - devID = getCmdLineArgumentInt("device=") - checkCudaErrors(cuda.cuInit(0)) - cuDevice = checkCudaErrors(cuda.cuDeviceGet(devID)) - return cuDevice +def find_cuda_device_drv(): + dev_id = 0 + if check_cmd_line_flag("device="): + dev_id = get_cmd_line_argument_int("device=") + check_cuda_errors(cuda.cuInit(0)) + cu_device = check_cuda_errors(cuda.cuDeviceGet(dev_id)) + return cu_device diff --git a/cuda_bindings/examples/common/helper_string.py b/cuda_bindings/examples/common/helper_string.py index 9f8e70a6c4..47d9d36569 100644 --- 
a/cuda_bindings/examples/common/helper_string.py +++ b/cuda_bindings/examples/common/helper_string.py @@ -4,12 +4,12 @@ import sys -def checkCmdLineFlag(stringRef): - return any(stringRef == i and k < len(sys.argv) - 1 for i, k in enumerate(sys.argv)) +def check_cmd_line_flag(string_ref): + return any(string_ref == i and k < len(sys.argv) - 1 for i, k in enumerate(sys.argv)) -def getCmdLineArgumentInt(stringRef): +def get_cmd_line_argument_int(string_ref): for i, k in enumerate(sys.argv): - if stringRef == i and k < len(sys.argv) - 1: + if string_ref == i and k < len(sys.argv) - 1: return sys.argv[k + 1] return 0 diff --git a/cuda_bindings/examples/extra/isoFDModelling_test.py b/cuda_bindings/examples/extra/isoFDModelling_test.py index 148d836adf..21303664ac 100644 --- a/cuda_bindings/examples/extra/isoFDModelling_test.py +++ b/cuda_bindings/examples/extra/isoFDModelling_test.py @@ -5,12 +5,12 @@ import numpy as np from common import common -from common.helper_cuda import checkCudaErrors +from common.helper_cuda import check_cuda_errors from cuda.bindings import driver as cuda from cuda.bindings import runtime as cudart -isoPropagator = """\ +iso_propagator = """\ extern "C" __global__ void injectSource(float *__restrict__ in, float *__restrict__ src, int it) { @@ -177,7 +177,7 @@ def align_ny(ny, blk, nops): # # this class contains the input params # -class params: +class Params: def __init__(self): self.BDIMX = 32 # tiles x y for fd operators self.BDIMY = 16 @@ -209,53 +209,53 @@ def __init__(self): # # this class contains all the kernels to be used bu propagator # -class cudaKernels: +class CudaKernels: def __init__(self, cntx): - checkCudaErrors(cuda.cuInit(0)) - checkCudaErrors(cuda.cuCtxSetCurrent(cntx)) - dev = checkCudaErrors(cuda.cuCtxGetDevice()) + check_cuda_errors(cuda.cuInit(0)) + check_cuda_errors(cuda.cuCtxSetCurrent(cntx)) + dev = check_cuda_errors(cuda.cuCtxGetDevice()) - self.kernelHelper = common.KernelHelper(isoPropagator, int(dev)) + 
self.kernel_helper = common.KernelHelper(iso_propagator, int(dev)) # kernel to create a source fnction with some max frequency - self.creatSource = self.kernelHelper.getFunction(b"createSource") + self.creatSource = self.kernel_helper.get_function(b"createSource") # create a velocity to try things: just a sphere on the middle 4500 m/s and 2500 m/s all around - self.createVelocity = self.kernelHelper.getFunction(b"createVelocity") + self.create_velocity = self.kernel_helper.get_function(b"createVelocity") # kernel to propagate the wavefield by 1 step in time - self.fdPropag = self.kernelHelper.getFunction(b"fwd_3D_orderX2k") + self.fdPropag = self.kernel_helper.get_function(b"fwd_3D_orderX2k") # kernel to propagate the wavefield by 1 step in time - self.injectSource = self.kernelHelper.getFunction(b"injectSource") + self.inject_source = self.kernel_helper.get_function(b"injectSource") # # this class contains: propagator, source creation, velocity creation # injection of data and domain exchange # -class propagator: +class Propagator: def __init__(self, params, _dev): print("init object for device ", _dev) self.dev = _dev - checkCudaErrors(cuda.cuInit(0)) - self.cuDevice = checkCudaErrors(cuda.cuDeviceGet(_dev)) - self.context = checkCudaErrors(cuda.cuCtxCreate(None, 0, self.cuDevice)) + check_cuda_errors(cuda.cuInit(0)) + self.cu_device = check_cuda_errors(cuda.cuDeviceGet(_dev)) + self.context = check_cuda_errors(cuda.cuCtxCreate(None, 0, self.cu_device)) self.waveOut = 0 self.waveIn = 0 - self.streamCenter = checkCudaErrors(cuda.cuStreamCreate(0)) - self.streamHalo = checkCudaErrors(cuda.cuStreamCreate(0)) - self.params = params + self.streamCenter = check_cuda_errors(cuda.cuStreamCreate(0)) + self.streamHalo = check_cuda_errors(cuda.cuStreamCreate(0)) + self.Params = params def __del__(self): - checkCudaErrors(cuda.cuCtxSetCurrent(self.context)) - checkCudaErrors(cuda.cuStreamDestroy(self.streamHalo)) - checkCudaErrors(cuda.cuStreamDestroy(self.streamCenter)) + 
check_cuda_errors(cuda.cuCtxSetCurrent(self.context)) + check_cuda_errors(cuda.cuStreamDestroy(self.streamHalo)) + check_cuda_errors(cuda.cuStreamDestroy(self.streamCenter)) if self.waveIn != 0: - checkCudaErrors(cuda.cuMemFree(self.waveIn)) + check_cuda_errors(cuda.cuMemFree(self.waveIn)) if self.waveOut != 0: - checkCudaErrors(cuda.cuMemFree(self.waveOut)) - checkCudaErrors(cuda.cuCtxDestroy(self.context)) + check_cuda_errors(cuda.cuMemFree(self.waveOut)) + check_cuda_errors(cuda.cuCtxDestroy(self.context)) # # swap waveIn with waveOut @@ -275,45 +275,45 @@ def swap(self): # allocate the device memory # def allocate(self): - nel = self.params.nx * self.params.ny * self.params.nz + nel = self.Params.nx * self.Params.ny * self.Params.nz n = np.array(nel, dtype=np.uint32) - bufferSize = n * np.dtype(np.float32).itemsize - checkCudaErrors(cuda.cuCtxSetCurrent(self.context)) + buffer_size = n * np.dtype(np.float32).itemsize + check_cuda_errors(cuda.cuCtxSetCurrent(self.context)) - self.velocity = checkCudaErrors(cuda.cuMemAlloc(bufferSize)) - checkCudaErrors(cuda.cuMemsetD32(self.velocity, 0, n)) + self.velocity = check_cuda_errors(cuda.cuMemAlloc(buffer_size)) + check_cuda_errors(cuda.cuMemsetD32(self.velocity, 0, n)) - nel += self.params.lead + nel += self.Params.lead n = np.array(nel, dtype=np.uint32) ## we need to align at the beginning of the tile - bufferSize = n * np.dtype(np.float32).itemsize - self.waveIn = checkCudaErrors(cuda.cuMemAlloc(bufferSize)) - checkCudaErrors(cuda.cuMemsetD32(self.waveIn, 0, n)) + buffer_size = n * np.dtype(np.float32).itemsize + self.waveIn = check_cuda_errors(cuda.cuMemAlloc(buffer_size)) + check_cuda_errors(cuda.cuMemsetD32(self.waveIn, 0, n)) - self.waveOut = checkCudaErrors(cuda.cuMemAlloc(bufferSize)) - checkCudaErrors(cuda.cuMemsetD32(self.waveOut, 0, n)) + self.waveOut = check_cuda_errors(cuda.cuMemAlloc(buffer_size)) + check_cuda_errors(cuda.cuMemsetD32(self.waveOut, 0, n)) - n = np.array(self.params.nt, dtype=np.uint32) - 
bufferSize = n * np.dtype(np.float32).itemsize - self.source = checkCudaErrors(cuda.cuMemAlloc(bufferSize)) - checkCudaErrors(cuda.cuMemsetD32(self.source, 0, n)) + n = np.array(self.Params.nt, dtype=np.uint32) + buffer_size = n * np.dtype(np.float32).itemsize + self.source = check_cuda_errors(cuda.cuMemAlloc(buffer_size)) + check_cuda_errors(cuda.cuMemsetD32(self.source, 0, n)) # # create source data # - def createSource(self, kernel): + def create_source(self, kernel): print("creating source on device ", self.dev) buf = np.array([int(self.source)], dtype=np.uint64) - nt = np.array(self.params.nt, dtype=np.uint32) - dt = np.array(self.params.dt, dtype=np.float32) - freq = np.array(self.params.freqMax, dtype=np.float32) + nt = np.array(self.Params.nt, dtype=np.uint32) + dt = np.array(self.Params.dt, dtype=np.float32) + freq = np.array(self.Params.freqMax, dtype=np.float32) args = [buf, dt, freq, nt] argsp = np.array([arg.ctypes.data for arg in args], dtype=np.uint64) - checkCudaErrors(cuda.cuCtxSetCurrent(self.context)) - checkCudaErrors( + check_cuda_errors(cuda.cuCtxSetCurrent(self.context)) + check_cuda_errors( cuda.cuLaunchKernel( kernel.creatSource, 1, @@ -328,34 +328,34 @@ def createSource(self, kernel): 0, ) ) # arguments - checkCudaErrors(cuda.cuStreamSynchronize(self.streamHalo)) + check_cuda_errors(cuda.cuStreamSynchronize(self.streamHalo)) # # inject source function: ony on the domain 0 # - def injectSource(self, kernel, iter): - checkCudaErrors(cuda.cuCtxSetCurrent(self.context)) + def inject_source(self, kernel, iter): + check_cuda_errors(cuda.cuCtxSetCurrent(self.context)) if self.dev != 0: return wavein = np.array([int(self.waveIn)], dtype=np.uint64) src = np.array([int(self.source)], dtype=np.uint64) - offset_sourceInject = ( - self.params.lead - + (int)(self.params.nz / 2) * self.params.nx * self.params.ny - + (int)(self.params.ny / 2) * self.params.nx - + (int)(self.params.nx / 2) + offset_source_inject = ( + self.Params.lead + + 
(int)(self.Params.nz / 2) * self.Params.nx * self.Params.ny + + (int)(self.Params.ny / 2) * self.Params.nx + + (int)(self.Params.nx / 2) ) - offset_sourceInject *= np.dtype(np.float32).itemsize + offset_source_inject *= np.dtype(np.float32).itemsize np_it = np.array(iter, dtype=np.uint32) - args = [wavein + offset_sourceInject, src, np_it] + args = [wavein + offset_source_inject, src, np_it] argsp = np.array([arg.ctypes.data for arg in args], dtype=np.uint64) - checkCudaErrors( + check_cuda_errors( cuda.cuLaunchKernel( - kernel.injectSource, + kernel.inject_source, 1, 1, 1, # grid dim @@ -372,39 +372,39 @@ def injectSource(self, kernel, iter): # # create velocity # - def createVelocity(self, kernel): + def create_velocity(self, kernel): print("running create velocity on device ", self.dev) offset_velocity = ( - self.params.FD_ORDER * self.params.nx * self.params.ny - + self.params.FD_ORDER * self.params.nx - + self.params.FD_ORDER + self.Params.FD_ORDER * self.Params.nx * self.Params.ny + + self.Params.FD_ORDER * self.Params.nx + + self.Params.FD_ORDER ) offset_velocity *= np.dtype(np.float32).itemsize vel = np.array([int(self.velocity)], dtype=np.uint64) - dx_dt2 = (self.params.dt * self.params.dt) / (self.params.delta * self.params.delta) + dx_dt2 = (self.Params.dt * self.Params.dt) / (self.Params.delta * self.Params.delta) - stride = self.params.nx * self.params.ny + stride = self.Params.nx * self.Params.ny np_dx_dt2 = np.array(dx_dt2, dtype=np.float32) - np_nz = np.array((self.params.nz - 2 * self.params.FD_ORDER), dtype=np.uint32) - np_nx = np.array(self.params.nx, dtype=np.uint32) + np_nz = np.array((self.Params.nz - 2 * self.Params.FD_ORDER), dtype=np.uint32) + np_nx = np.array(self.Params.nx, dtype=np.uint32) np_stride = np.array(stride, dtype=np.uint32) args = [vel + offset_velocity, np_dx_dt2, np_nz, np_nx, np_stride] argsp = np.array([arg.ctypes.data for arg in args], dtype=np.uint64) - checkCudaErrors(cuda.cuCtxSetCurrent(self.context)) + 
check_cuda_errors(cuda.cuCtxSetCurrent(self.context)) # do halo up - checkCudaErrors( + check_cuda_errors( cuda.cuLaunchKernel( - kernel.createVelocity, - self.params.blkx, - self.params.blky, + kernel.create_velocity, + self.Params.blkx, + self.Params.blky, 1, # grid dim - 2 * self.params.BDIMX, - self.params.BDIMY, + 2 * self.Params.BDIMX, + self.Params.BDIMY, 1, # block dim 0, self.streamHalo, # shared mem and stream @@ -412,22 +412,22 @@ def createVelocity(self, kernel): 0, ) ) # arguments - checkCudaErrors(cuda.cuStreamSynchronize(self.streamHalo)) + check_cuda_errors(cuda.cuStreamSynchronize(self.streamHalo)) # # execute the center part of propagation # - def executeCenter(self, kernel): + def execute_center(self, kernel): if verbose_prints: print("running center on device ", self.dev) - checkCudaErrors(cuda.cuCtxSetCurrent(self.context)) + check_cuda_errors(cuda.cuCtxSetCurrent(self.context)) offset_velocity = ( - 2 * self.params.FD_ORDER * self.params.nx * self.params.ny - + self.params.FD_ORDER * self.params.nx - + self.params.FD_ORDER + 2 * self.Params.FD_ORDER * self.Params.nx * self.Params.ny + + self.Params.FD_ORDER * self.Params.nx + + self.Params.FD_ORDER ) - offset_wave = self.params.lead + offset_velocity + offset_wave = self.Params.lead + offset_velocity offset_wave *= np.dtype(np.float32).itemsize offset_velocity *= np.dtype(np.float32).itemsize @@ -436,9 +436,9 @@ def executeCenter(self, kernel): waveout = np.array([int(self.waveOut)], dtype=np.uint64) vel = np.array([int(self.velocity)], dtype=np.uint64) - stride = self.params.nx * self.params.ny - np_nz = np.array(self.params.nz - 4 * self.params.FD_ORDER, dtype=np.uint32) - np_nx = np.array(self.params.nx, dtype=np.uint32) + stride = self.Params.nx * self.Params.ny + np_nz = np.array(self.Params.nz - 4 * self.Params.FD_ORDER, dtype=np.uint32) + np_nx = np.array(self.Params.nx, dtype=np.uint32) np_stride = np.array(stride, dtype=np.uint32) args = [ @@ -452,14 +452,14 @@ def executeCenter(self, 
kernel): argsp = np.array([arg.ctypes.data for arg in args], dtype=np.uint64) # do center propagation from 2 * fd_order to nz - 2 * fd_order - checkCudaErrors( + check_cuda_errors( cuda.cuLaunchKernel( kernel.fdPropag, - self.params.blkx, - self.params.blky, + self.Params.blkx, + self.Params.blky, 1, # grid dim - self.params.BDIMX, - self.params.BDIMY, + self.Params.BDIMX, + self.Params.BDIMY, 1, # block dim 0, self.streamCenter, # shared mem and stream @@ -471,18 +471,18 @@ def executeCenter(self, kernel): # # execute the halo part of propagation # - def executeHalo(self, kernel): + def execute_halo(self, kernel): if verbose_prints: print("running halos on device ", self.dev) - checkCudaErrors(cuda.cuCtxSetCurrent(self.context)) + check_cuda_errors(cuda.cuCtxSetCurrent(self.context)) offset_velocity = ( - self.params.FD_ORDER * self.params.nx * self.params.ny - + self.params.FD_ORDER * self.params.nx - + self.params.FD_ORDER + self.Params.FD_ORDER * self.Params.nx * self.Params.ny + + self.Params.FD_ORDER * self.Params.nx + + self.Params.FD_ORDER ) - offset_wave = self.params.lead + offset_velocity + offset_wave = self.Params.lead + offset_velocity offset_wave *= np.dtype(np.float32).itemsize offset_velocity *= np.dtype(np.float32).itemsize @@ -491,9 +491,9 @@ def executeHalo(self, kernel): waveout = np.array([int(self.waveOut)], dtype=np.uint64) vel = np.array([int(self.velocity)], dtype=np.uint64) - stride = self.params.nx * self.params.ny - np_nz = np.array(self.params.FD_ORDER, dtype=np.uint32) - np_nx = np.array(self.params.nx, dtype=np.uint32) + stride = self.Params.nx * self.Params.ny + np_nz = np.array(self.Params.FD_ORDER, dtype=np.uint32) + np_nx = np.array(self.Params.nx, dtype=np.uint32) np_stride = np.array(stride, dtype=np.uint32) args = [ @@ -507,14 +507,14 @@ def executeHalo(self, kernel): argsp = np.array([arg.ctypes.data for arg in args], dtype=np.uint64) # do halo up - checkCudaErrors( + check_cuda_errors( cuda.cuLaunchKernel( kernel.fdPropag, - 
self.params.blkx, - self.params.blky, + self.Params.blkx, + self.Params.blky, 1, # grid dim - self.params.BDIMX, - self.params.BDIMY, + self.Params.BDIMX, + self.Params.BDIMY, 1, # block dim 0, self.streamHalo, # shared mem and stream @@ -525,11 +525,11 @@ def executeHalo(self, kernel): # do halo down offset_velocity = ( - (self.params.nz - 2 * self.params.FD_ORDER) * self.params.nx * self.params.ny - + self.params.FD_ORDER * self.params.nx - + self.params.FD_ORDER + (self.Params.nz - 2 * self.Params.FD_ORDER) * self.Params.nx * self.Params.ny + + self.Params.FD_ORDER * self.Params.nx + + self.Params.FD_ORDER ) - offset_wave = self.params.lead + offset_velocity + offset_wave = self.Params.lead + offset_velocity offset_wave *= np.dtype(np.float32).itemsize offset_velocity *= np.dtype(np.float32).itemsize @@ -543,14 +543,14 @@ def executeHalo(self, kernel): np_stride, ] argsp = np.array([arg.ctypes.data for arg in args], dtype=np.uint64) - checkCudaErrors( + check_cuda_errors( cuda.cuLaunchKernel( kernel.fdPropag, - self.params.blkx, - self.params.blky, + self.Params.blkx, + self.Params.blky, 1, # grid dim - self.params.BDIMX, - self.params.BDIMY, + self.Params.BDIMX, + self.Params.BDIMY, 1, # block dim 0, self.streamHalo, # shared mem and stream @@ -562,79 +562,79 @@ def executeHalo(self, kernel): # # exchange the halos # - def exchangeHalo(self, propag): + def exchange_halo(self, propag): if verbose_prints: print("exchange halos on device ", self.dev, "with dev ", propag.dev) - checkCudaErrors(cuda.cuCtxSetCurrent(self.context)) + check_cuda_errors(cuda.cuCtxSetCurrent(self.context)) # # the following variables don't change # - nstride = self.params.nx * self.params.ny + nstride = self.Params.nx * self.Params.ny - devS = self.context - devD = propag.context + dev_s = self.context + dev_d = propag.context - n_exch = self.params.FD_ORDER * nstride + n_exch = self.Params.FD_ORDER * nstride n_exch *= np.dtype(np.float32).itemsize if self.dev < propag.dev: # exchange up 
- offsetS = self.params.lead + (self.params.nz - 2 * self.params.FD_ORDER) * nstride - offsetD = propag.params.lead + offset_s = self.Params.lead + (self.Params.nz - 2 * self.Params.FD_ORDER) * nstride + offset_d = propag.Params.lead - offsetS *= np.dtype(np.float32).itemsize - offsetD *= np.dtype(np.float32).itemsize + offset_s *= np.dtype(np.float32).itemsize + offset_d *= np.dtype(np.float32).itemsize - waveD = cuda.CUdeviceptr(int(propag.waveOut) + offsetD) - waveS = cuda.CUdeviceptr(int(self.waveOut) + offsetS) + wave_d = cuda.CUdeviceptr(int(propag.waveOut) + offset_d) + wave_s = cuda.CUdeviceptr(int(self.waveOut) + offset_s) - checkCudaErrors(cuda.cuMemcpyPeerAsync(waveD, devD, waveS, devS, n_exch, self.streamHalo)) + check_cuda_errors(cuda.cuMemcpyPeerAsync(wave_d, dev_d, wave_s, dev_s, n_exch, self.streamHalo)) else: # exchange down - offsetS = self.params.lead + self.params.FD_ORDER * nstride - offsetD = propag.params.lead + (propag.params.nz - propag.params.FD_ORDER) * nstride + offset_s = self.Params.lead + self.Params.FD_ORDER * nstride + offset_d = propag.Params.lead + (propag.Params.nz - propag.Params.FD_ORDER) * nstride - offsetS *= np.dtype(np.float32).itemsize - offsetD *= np.dtype(np.float32).itemsize + offset_s *= np.dtype(np.float32).itemsize + offset_d *= np.dtype(np.float32).itemsize - waveD = cuda.CUdeviceptr(int(propag.waveOut) + offsetD) - waveS = cuda.CUdeviceptr(int(self.waveOut) + offsetS) + wave_d = cuda.CUdeviceptr(int(propag.waveOut) + offset_d) + wave_s = cuda.CUdeviceptr(int(self.waveOut) + offset_s) - checkCudaErrors(cuda.cuMemcpyPeerAsync(waveD, devD, waveS, devS, n_exch, self.streamHalo)) + check_cuda_errors(cuda.cuMemcpyPeerAsync(wave_d, dev_d, wave_s, dev_s, n_exch, self.streamHalo)) # # sync stream # - def syncStream(self, stream): - checkCudaErrors(cuda.cuCtxSetCurrent(self.context)) - checkCudaErrors(cuda.cuStreamSynchronize(stream)) + def sync_stream(self, stream): + check_cuda_errors(cuda.cuCtxSetCurrent(self.context)) + 
check_cuda_errors(cuda.cuStreamSynchronize(stream)) def main(): - checkCudaErrors(cuda.cuInit(0)) + check_cuda_errors(cuda.cuInit(0)) # Number of GPUs print("Checking for multiple GPUs...") - gpu_n = checkCudaErrors(cuda.cuDeviceGetCount()) + gpu_n = check_cuda_errors(cuda.cuDeviceGetCount()) print(f"CUDA-capable device count: {gpu_n}") if gpu_n < 2: print("Two or more GPUs with Peer-to-Peer access capability are required") return - prop = [checkCudaErrors(cudart.cudaGetDeviceProperties(i)) for i in range(gpu_n)] + prop = [check_cuda_errors(cudart.cudaGetDeviceProperties(i)) for i in range(gpu_n)] # Check possibility for peer access print("\nChecking GPU(s) for support of peer to peer memory access...") - p2pCapableGPUs = [-1, -1] + p2p_capable_gp_us = [-1, -1] for i in range(gpu_n): - p2pCapableGPUs[0] = i + p2p_capable_gp_us[0] = i for j in range(gpu_n): if i == j: continue - i_access_j = checkCudaErrors(cudart.cudaDeviceCanAccessPeer(i, j)) - j_access_i = checkCudaErrors(cudart.cudaDeviceCanAccessPeer(j, i)) + i_access_j = check_cuda_errors(cudart.cudaDeviceCanAccessPeer(i, j)) + j_access_i = check_cuda_errors(cudart.cudaDeviceCanAccessPeer(j, i)) print( "> Peer access from {} (GPU{}) -> {} (GPU{}) : {}\n".format( prop[i].name, i, prop[j].name, j, "Yes" if i_access_j else "No" @@ -646,23 +646,23 @@ def main(): ) ) if i_access_j and j_access_i: - p2pCapableGPUs[1] = j + p2p_capable_gp_us[1] = j break - if p2pCapableGPUs[1] != -1: + if p2p_capable_gp_us[1] != -1: break - if p2pCapableGPUs[0] == -1 or p2pCapableGPUs[1] == -1: + if p2p_capable_gp_us[0] == -1 or p2p_capable_gp_us[1] == -1: print("Two or more GPUs with Peer-to-Peer access capability are required.") print("Peer to Peer access is not available amongst GPUs in the system, waiving test.") return # Use first pair of p2p capable GPUs detected - gpuid = [p2pCapableGPUs[0], p2pCapableGPUs[1]] + gpuid = [p2p_capable_gp_us[0], p2p_capable_gp_us[1]] # # init device # - pars = params() + pars = Params() # # 
create propagators @@ -674,16 +674,16 @@ def main(): # create kernels and propagators that are going to be used on device # for i in gpuid: - p = propagator(pars, i) - k = cudaKernels(p.context) + p = Propagator(pars, i) + k = CudaKernels(p.context) propags.append(p) kerns.append(k) # allocate resources in device for propag, kern in zip(propags, kerns): propag.allocate() - propag.createSource(kern) - propag.createVelocity(kern) + propag.create_source(kern) + propag.create_velocity(kern) # # loop over time iterations @@ -691,26 +691,26 @@ def main(): start = time.time() for it in range(pars.nt): for propag in propags: - propag.syncStream(propag.streamHalo) + propag.sync_stream(propag.streamHalo) for propag, kern in zip(propags, kerns): - propag.injectSource(kern, it) + propag.inject_source(kern, it) for propag, kern in zip(propags, kerns): - propag.executeHalo(kern) + propag.execute_halo(kern) for propag in propags: - propag.syncStream(propag.streamHalo) + propag.sync_stream(propag.streamHalo) - propags[1].exchangeHalo(propags[0]) + propags[1].exchange_halo(propags[0]) - propags[0].exchangeHalo(propags[1]) + propags[0].exchange_halo(propags[1]) for propag, kern in zip(propags, kerns): - propag.executeCenter(kern) + propag.execute_center(kern) for propag in propags: - propag.syncStream(propag.streamCenter) + propag.sync_stream(propag.streamCenter) for propag in propags: propag.swap() @@ -727,19 +727,19 @@ def main(): # nz = 2 * (int)(pars.nz - 2 * pars.FD_ORDER) print(" nz= ", nz, " nx= ", pars.nx) - hOut = np.zeros((nz, pars.nx), dtype="float32") + h_out = np.zeros((nz, pars.nx), dtype="float32") istart = 0 for propag in propags: - checkCudaErrors(cuda.cuCtxSetCurrent(propag.context)) + check_cuda_errors(cuda.cuCtxSetCurrent(propag.context)) offset = pars.lead + pars.FD_ORDER * pars.nx * pars.ny + (int)(pars.ny / 2) * pars.nx for j in range(pars.nz - 2 * pars.FD_ORDER): ptr = cuda.CUdeviceptr(int(propag.waveOut) + offset * 4) - checkCudaErrors( + check_cuda_errors( 
cuda.cuMemcpyDtoH( - hOut[istart].ctypes.data, + h_out[istart].ctypes.data, ptr, pars.nx * np.dtype(np.float32).itemsize, ) @@ -756,7 +756,7 @@ def main(): if display_graph: nrows = nz ncols = pars.nx - dbz = hOut + dbz = h_out dbz = np.reshape(dbz, (nrows, ncols)) ## diff --git a/cuda_bindings/examples/extra/jit_program_test.py b/cuda_bindings/examples/extra/jit_program_test.py index be78deafc1..80e7e73376 100644 --- a/cuda_bindings/examples/extra/jit_program_test.py +++ b/cuda_bindings/examples/extra/jit_program_test.py @@ -9,7 +9,7 @@ from cuda.bindings import nvrtc -def ASSERT_DRV(err): +def assert_drv(err): if isinstance(err, cuda.CUresult): if err != cuda.CUresult.CUDA_SUCCESS: raise RuntimeError(f"Cuda Error: {err}") @@ -35,31 +35,31 @@ def ASSERT_DRV(err): def main(): # Init (err,) = cuda.cuInit(0) - ASSERT_DRV(err) + assert_drv(err) # Device - err, cuDevice = cuda.cuDeviceGet(0) - ASSERT_DRV(err) + err, cu_device = cuda.cuDeviceGet(0) + assert_drv(err) # Ctx - err, context = cuda.cuCtxCreate(None, 0, cuDevice) - ASSERT_DRV(err) + err, context = cuda.cuCtxCreate(None, 0, cu_device) + assert_drv(err) # Create program err, prog = nvrtc.nvrtcCreateProgram(str.encode(saxpy), b"saxpy.cu", 0, None, None) - ASSERT_DRV(err) + assert_drv(err) # Get target architecture err, major = cuda.cuDeviceGetAttribute( - cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice + cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cu_device ) - ASSERT_DRV(err) + assert_drv(err) err, minor = cuda.cuDeviceGetAttribute( - cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice + cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cu_device ) - ASSERT_DRV(err) + assert_drv(err) err, nvrtc_major, nvrtc_minor = nvrtc.nvrtcVersion() - ASSERT_DRV(err) + assert_drv(err) use_cubin = nvrtc_minor >= 1 prefix = "sm" if use_cubin else "compute" arch_arg = bytes(f"--gpu-architecture={prefix}_{major}{minor}", 
"ascii") @@ -67,82 +67,80 @@ def main(): # Compile program opts = [b"--fmad=false", arch_arg] (err,) = nvrtc.nvrtcCompileProgram(prog, len(opts), opts) - ASSERT_DRV(err) + assert_drv(err) # Get log from compilation - err, logSize = nvrtc.nvrtcGetProgramLogSize(prog) - ASSERT_DRV(err) - log = b" " * logSize + err, log_size = nvrtc.nvrtcGetProgramLogSize(prog) + assert_drv(err) + log = b" " * log_size (err,) = nvrtc.nvrtcGetProgramLog(prog, log) - ASSERT_DRV(err) + assert_drv(err) print(log.decode()) # Get data from compilation if use_cubin: - err, dataSize = nvrtc.nvrtcGetCUBINSize(prog) - ASSERT_DRV(err) - data = b" " * dataSize + err, data_size = nvrtc.nvrtcGetCUBINSize(prog) + assert_drv(err) + data = b" " * data_size (err,) = nvrtc.nvrtcGetCUBIN(prog, data) - ASSERT_DRV(err) + assert_drv(err) else: - err, dataSize = nvrtc.nvrtcGetPTXSize(prog) - ASSERT_DRV(err) - data = b" " * dataSize + err, data_size = nvrtc.nvrtcGetPTXSize(prog) + assert_drv(err) + data = b" " * data_size (err,) = nvrtc.nvrtcGetPTX(prog, data) - ASSERT_DRV(err) - (err,) = nvrtc.nvrtcDestroyProgram(prog) - ASSERT_DRV(err) + assert_drv(err) # Load data as module data and retrieve function data = np.char.array(data) err, module = cuda.cuModuleLoadData(data) - ASSERT_DRV(err) + assert_drv(err) err, kernel = cuda.cuModuleGetFunction(module, b"saxpy") - ASSERT_DRV(err) + assert_drv(err) # Test the kernel - NUM_THREADS = 128 - NUM_BLOCKS = 32 + num_threads = 128 + num_blocks = 32 a = np.float32(2.0) - n = np.array(NUM_THREADS * NUM_BLOCKS, dtype=np.uint32) - bufferSize = n * a.itemsize + n = np.array(num_threads * num_blocks, dtype=np.uint32) + buffer_size = n * a.itemsize - err, dX = cuda.cuMemAlloc(bufferSize) - ASSERT_DRV(err) - err, dY = cuda.cuMemAlloc(bufferSize) - ASSERT_DRV(err) - err, dOut = cuda.cuMemAlloc(bufferSize) - ASSERT_DRV(err) + err, d_x = cuda.cuMemAlloc(buffer_size) + assert_drv(err) + err, d_y = cuda.cuMemAlloc(buffer_size) + assert_drv(err) + err, d_out = 
cuda.cuMemAlloc(buffer_size) + assert_drv(err) - hX = np.random.rand(n).astype(dtype=np.float32) - hY = np.random.rand(n).astype(dtype=np.float32) - hOut = np.zeros(n).astype(dtype=np.float32) + h_x = np.random.rand(n).astype(dtype=np.float32) + h_y = np.random.rand(n).astype(dtype=np.float32) + h_out = np.zeros(n).astype(dtype=np.float32) err, stream = cuda.cuStreamCreate(0) - ASSERT_DRV(err) + assert_drv(err) - (err,) = cuda.cuMemcpyHtoDAsync(dX, hX, bufferSize, stream) - ASSERT_DRV(err) - (err,) = cuda.cuMemcpyHtoDAsync(dY, hY, bufferSize, stream) - ASSERT_DRV(err) + (err,) = cuda.cuMemcpyHtoDAsync(d_x, h_x, buffer_size, stream) + assert_drv(err) + (err,) = cuda.cuMemcpyHtoDAsync(d_y, h_y, buffer_size, stream) + assert_drv(err) (err,) = cuda.cuStreamSynchronize(stream) - ASSERT_DRV(err) + assert_drv(err) # Assert values are different before running kernel - hZ = a * hX + hY - if np.allclose(hOut, hZ): + h_z = a * h_x + h_y + if np.allclose(h_out, h_z): raise ValueError("Error inside tolerence for host-device vectors") - arg_values = (a, dX, dY, dOut, n) + arg_values = (a, d_x, d_y, d_out, n) arg_types = (ctypes.c_float, None, None, None, ctypes.c_size_t) (err,) = cuda.cuLaunchKernel( kernel, - NUM_BLOCKS, + num_blocks, 1, 1, # grid dim - NUM_THREADS, + num_threads, 1, 1, # block dim 0, @@ -150,32 +148,32 @@ def main(): (arg_values, arg_types), 0, ) # arguments - ASSERT_DRV(err) + assert_drv(err) - (err,) = cuda.cuMemcpyDtoHAsync(hOut, dOut, bufferSize, stream) - ASSERT_DRV(err) + (err,) = cuda.cuMemcpyDtoHAsync(h_out, d_out, buffer_size, stream) + assert_drv(err) (err,) = cuda.cuStreamSynchronize(stream) - ASSERT_DRV(err) + assert_drv(err) # Assert values are same after running kernel - hZ = a * hX + hY - if not np.allclose(hOut, hZ): + h_z = a * h_x + h_y + if not np.allclose(h_out, h_z): raise ValueError("Error outside tolerence for host-device vectors") (err,) = cuda.cuStreamDestroy(stream) - ASSERT_DRV(err) + assert_drv(err) - (err,) = cuda.cuMemFree(dX) - 
ASSERT_DRV(err) - (err,) = cuda.cuMemFree(dY) - ASSERT_DRV(err) - (err,) = cuda.cuMemFree(dOut) - ASSERT_DRV(err) + (err,) = cuda.cuMemFree(d_x) + assert_drv(err) + (err,) = cuda.cuMemFree(d_y) + assert_drv(err) + (err,) = cuda.cuMemFree(d_out) + assert_drv(err) (err,) = cuda.cuModuleUnload(module) - ASSERT_DRV(err) + assert_drv(err) (err,) = cuda.cuCtxDestroy(context) - ASSERT_DRV(err) + assert_drv(err) if __name__ == "__main__": diff --git a/cuda_core/examples/cuda_graphs.py b/cuda_core/examples/cuda_graphs.py index 02d1b59ec1..c6233dd5d9 100644 --- a/cuda_core/examples/cuda_graphs.py +++ b/cuda_core/examples/cuda_graphs.py @@ -84,9 +84,9 @@ def main(): result3 = cp.empty_like(a) # Prepare launch configuration - block_size = 256 - grid_size = (size + block_size - 1) // block_size - config = LaunchConfig(grid=grid_size, block=block_size) + block = 256 + grid = (size + block - 1) // block + config = LaunchConfig(grid=grid, block=block) # Sync before graph capture dev.sync() diff --git a/cuda_core/examples/gl_interop_plasma.py b/cuda_core/examples/gl_interop_plasma.py index 7b8b43cd8d..46fa59ee3f 100644 --- a/cuda_core/examples/gl_interop_plasma.py +++ b/cuda_core/examples/gl_interop_plasma.py @@ -94,8 +94,8 @@ def setup_cuda(kernel_source): dev.set_current() stream = dev.create_stream() - opts = ProgramOptions(std="c++11", arch=f"sm_{dev.arch}") - prog = Program(kernel_source, code_type="c++", options=opts) + program_options = ProgramOptions(std="c++11", arch=f"sm_{dev.arch}") + prog = Program(kernel_source, code_type="c++", options=program_options) mod = prog.compile("cubin") kernel = mod.get_kernel("plasma") diff --git a/cuda_core/examples/pytorch_example.py b/cuda_core/examples/pytorch_example.py index 3919953eab..4e3bfcceb5 100644 --- a/cuda_core/examples/pytorch_example.py +++ b/cuda_core/examples/pytorch_example.py @@ -48,7 +48,7 @@ def __cuda_stream__(self): return (0, stream_id) # Return format required by CUDA Python -s = 
dev.create_stream(PyTorchStreamWrapper(pt_stream)) +stream = dev.create_stream(PyTorchStreamWrapper(pt_stream)) try: # prepare program @@ -61,7 +61,7 @@ def __cuda_stream__(self): ) # Run in single precision - ker = mod.get_kernel("saxpy_kernel") + kernel = mod.get_kernel("saxpy_kernel") dtype = torch.float32 # prepare input/output @@ -76,16 +76,16 @@ def __cuda_stream__(self): block = 32 grid = int((size + block - 1) // block) config = LaunchConfig(grid=grid, block=block) - ker_args = (a.data_ptr(), x.data_ptr(), y.data_ptr(), out.data_ptr(), size) + kernel_args = (a.data_ptr(), x.data_ptr(), y.data_ptr(), out.data_ptr(), size) # launch kernel on our stream - launch(s, config, ker, *ker_args) + launch(stream, config, kernel, *kernel_args) # check result assert torch.allclose(out, a.item() * x + y) # let's repeat again with double precision - ker = mod.get_kernel("saxpy_kernel") + kernel = mod.get_kernel("saxpy_kernel") dtype = torch.float64 # prepare input @@ -102,12 +102,12 @@ def __cuda_stream__(self): block = 64 grid = int((size + block - 1) // block) config = LaunchConfig(grid=grid, block=block) - ker_args = (a.data_ptr(), x.data_ptr(), y.data_ptr(), out.data_ptr(), size) + kernel_args = (a.data_ptr(), x.data_ptr(), y.data_ptr(), out.data_ptr(), size) # launch kernel on PyTorch's stream - launch(s, config, ker, *ker_args) + launch(stream, config, kernel, *kernel_args) # check result assert torch.allclose(out, a * x + y) finally: - s.close() + stream.close() diff --git a/cuda_core/examples/saxpy.py b/cuda_core/examples/saxpy.py index d7eb401ac3..548af802be 100644 --- a/cuda_core/examples/saxpy.py +++ b/cuda_core/examples/saxpy.py @@ -35,7 +35,7 @@ dev = Device() dev.set_current() -s = dev.create_stream() +stream = dev.create_stream() buf = None try: @@ -53,7 +53,7 @@ ) # run in single precision - ker = mod.get_kernel("saxpy") + kernel = mod.get_kernel("saxpy") dtype = cp.float32 # prepare input/output @@ -63,24 +63,24 @@ x = rng.random(size, dtype=dtype) y = 
rng.random(size, dtype=dtype) out = cp.empty_like(x) - dev.sync() # cupy runs on a different stream from s, so sync before accessing + dev.sync() # cupy runs on a different stream from stream, so sync before accessing # prepare launch block = 32 grid = int((size + block - 1) // block) config = LaunchConfig(grid=grid, block=block) - ker_args = (a, x.data.ptr, y.data.ptr, out.data.ptr, size) + kernel_args = (a, x.data.ptr, y.data.ptr, out.data.ptr, size) - # launch kernel on stream s - launch(s, config, ker, *ker_args) - s.sync() + # launch kernel on stream + launch(stream, config, kernel, *kernel_args) + stream.sync() # check result assert cp.allclose(out, a * x + y) # let's repeat again, this time allocates our own out buffer instead of cupy's # run in double precision - ker = mod.get_kernel("saxpy") + kernel = mod.get_kernel("saxpy") dtype = cp.float64 # prepare input @@ -93,18 +93,18 @@ # prepare output buf = dev.allocate( size * 8, # = dtype.itemsize - stream=s, + stream=stream, ) # prepare launch block = 64 grid = int((size + block - 1) // block) config = LaunchConfig(grid=grid, block=block) - ker_args = (a, x.data.ptr, y.data.ptr, buf, size) + kernel_args = (a, x.data.ptr, y.data.ptr, buf, size) - # launch kernel on stream s - launch(s, config, ker, *ker_args) - s.sync() + # launch kernel on stream + launch(stream, config, kernel, *kernel_args) + stream.sync() # check result # we wrap output buffer as a cupy array for simplicity @@ -115,5 +115,5 @@ finally: # cupy cleans up automatically the rest if buf is not None: - buf.close(s) - s.close() + buf.close(stream) + stream.close() diff --git a/cuda_core/examples/simple_multi_gpu_example.py b/cuda_core/examples/simple_multi_gpu_example.py index 0fbb4466bb..882ce8bbb3 100644 --- a/cuda_core/examples/simple_multi_gpu_example.py +++ b/cuda_core/examples/simple_multi_gpu_example.py @@ -13,7 +13,7 @@ import cupy as cp -from cuda.core import Device, LaunchConfig, Program, launch, system +from cuda.core import Device, 
LaunchConfig, Program, ProgramOptions, launch, system if system.get_num_devices() < 2: print("this example requires at least 2 GPUs", file=sys.stderr) @@ -56,9 +56,9 @@ def __cuda_stream__(self): } } """ - prog_add = Program(code_add, code_type="c++", options={"std": "c++17", "arch": f"sm_{dev0.arch}"}) + prog_add = Program(code_add, code_type="c++", options=ProgramOptions(std="c++17", arch=f"sm_{dev0.arch}")) mod_add = prog_add.compile("cubin") - ker_add = mod_add.get_kernel("vector_add") + add_kernel = mod_add.get_kernel("vector_add") # Set GPU 1 dev1 = Device(1) @@ -78,9 +78,9 @@ def __cuda_stream__(self): } } """ - prog_sub = Program(code_sub, code_type="c++", options={"std": "c++17", "arch": f"sm_{dev1.arch}"}) + prog_sub = Program(code_sub, code_type="c++", options=ProgramOptions(std="c++17", arch=f"sm_{dev1.arch}")) mod_sub = prog_sub.compile("cubin") - ker_sub = mod_sub.get_kernel("vector_sub") + sub_kernel = mod_sub.get_kernel("vector_sub") # Create launch configs for each kernel that will be executed on the respective # CUDA streams. @@ -103,7 +103,7 @@ def __cuda_stream__(self): stream0.wait(cp_stream0) # Launch the add kernel on GPU 0 / stream 0 - launch(stream0, config0, ker_add, a.data.ptr, b.data.ptr, c.data.ptr, cp.uint64(size)) + launch(stream0, config0, add_kernel, a.data.ptr, b.data.ptr, c.data.ptr, cp.uint64(size)) # Allocate memory on GPU 1 # Note: This runs on CuPy's current stream for GPU 1. 
@@ -118,7 +118,7 @@ def __cuda_stream__(self): stream1.wait(cp_stream1) # Launch the subtract kernel on GPU 1 / stream 1 - launch(stream1, config1, ker_sub, x.data.ptr, y.data.ptr, z.data.ptr, cp.uint64(size)) + launch(stream1, config1, sub_kernel, x.data.ptr, y.data.ptr, z.data.ptr, cp.uint64(size)) # Synchronize both GPUs are validate the results dev0.set_current() diff --git a/cuda_core/examples/strided_memory_view_gpu.py b/cuda_core/examples/strided_memory_view_gpu.py index d53c4278b2..9d4e4aacff 100644 --- a/cuda_core/examples/strided_memory_view_gpu.py +++ b/cuda_core/examples/strided_memory_view_gpu.py @@ -57,7 +57,7 @@ # We assume the 0-th argument supports either DLPack or CUDA Array Interface (both # of which are supported by StridedMemoryView). @args_viewable_as_strided_memory((0,)) -def my_func(arr, work_stream, gpu_ker): +def my_func(arr, work_stream, kernel): # Create a memory view over arr (assumed to be a 1D array of int32). The stream # ordering is taken care of, so that arr can be safely accessed on our work # stream (ordered after a data stream on which arr is potentially prepared). @@ -73,7 +73,7 @@ def my_func(arr, work_stream, gpu_ker): block = 256 grid = (size + block - 1) // block config = LaunchConfig(grid=grid, block=block) - launch(work_stream, config, gpu_ker, view.ptr, np.uint64(size)) + launch(work_stream, config, kernel, view.ptr, np.uint64(size)) # Here we're being conservative and synchronize over our work stream, # assuming we do not know the data stream; if we know then we could # just order the data stream after the work stream here, e.g. @@ -101,24 +101,24 @@ def run(): # To know the GPU's compute capability, we need to identify which GPU to use. 
dev = Device(0) dev.set_current() - gpu_prog = Program(gpu_code, code_type="c++", options=ProgramOptions(arch=f"sm_{dev.arch}", std="c++11")) - mod = gpu_prog.compile(target_type="cubin") - gpu_ker = mod.get_kernel(func_name) + prog = Program(gpu_code, code_type="c++", options=ProgramOptions(arch=f"sm_{dev.arch}", std="c++11")) + mod = prog.compile(target_type="cubin") + kernel = mod.get_kernel(func_name) - s = dev.create_stream() + stream = dev.create_stream() try: # Create input array on GPU arr_gpu = cp.ones(1024, dtype=cp.int32) print(f"before: {arr_gpu[:10]=}") # Run the workload - my_func(arr_gpu, s, gpu_ker) + my_func(arr_gpu, stream, kernel) # Check the result print(f"after: {arr_gpu[:10]=}") assert cp.allclose(arr_gpu, 1 + cp.arange(1024, dtype=cp.int32)) finally: - s.close() + stream.close() if __name__ == "__main__": diff --git a/cuda_core/examples/thread_block_cluster.py b/cuda_core/examples/thread_block_cluster.py index 5e36270eab..a5f50d4189 100644 --- a/cuda_core/examples/thread_block_cluster.py +++ b/cuda_core/examples/thread_block_cluster.py @@ -94,7 +94,7 @@ options=ProgramOptions(arch=f"sm_{arch}", std="c++17", include_path=include_path), ) mod = prog.compile(target_type="cubin") -ker = mod.get_kernel("check_cluster_info") +kernel = mod.get_kernel("check_cluster_info") # prepare launch config grid = 4 @@ -126,7 +126,7 @@ block_dims[:] = 0 # launch kernel on the default stream - launch(dev.default_stream, config, ker, grid_buffer, cluster_buffer, block_buffer) + launch(dev.default_stream, config, kernel, grid_buffer, cluster_buffer, block_buffer) dev.sync() # verify results diff --git a/cuda_core/examples/vector_add.py b/cuda_core/examples/vector_add.py index 4c645fc7dd..e648a3846f 100644 --- a/cuda_core/examples/vector_add.py +++ b/cuda_core/examples/vector_add.py @@ -30,7 +30,7 @@ dev = Device() dev.set_current() -s = dev.create_stream() +stream = dev.create_stream() try: # prepare program @@ -39,7 +39,7 @@ mod = prog.compile("cubin", 
name_expressions=("vector_add",)) # run in single precision - ker = mod.get_kernel("vector_add") + kernel = mod.get_kernel("vector_add") dtype = cp.float32 # prepare input/output @@ -49,7 +49,7 @@ b = rng.random(size, dtype=dtype) c = cp.empty_like(a) - # cupy runs on a different stream from s, so sync before accessing + # cupy runs on a different stream from stream, so sync before accessing dev.sync() # prepare launch @@ -57,11 +57,11 @@ grid = (size + block - 1) // block config = LaunchConfig(grid=grid, block=block) - # launch kernel on stream s - launch(s, config, ker, a.data.ptr, b.data.ptr, c.data.ptr, cp.uint64(size)) - s.sync() + # launch kernel on stream + launch(stream, config, kernel, a.data.ptr, b.data.ptr, c.data.ptr, cp.uint64(size)) + stream.sync() # check result assert cp.allclose(c, a + b) finally: - s.close() + stream.close() diff --git a/ruff.toml b/ruff.toml index 7f3853529e..76f548848c 100644 --- a/ruff.toml +++ b/ruff.toml @@ -123,13 +123,15 @@ inline-quotes = "double" ] # CUDA bindings mirror C API naming conventions (CamelCase types, camelCase functions) -"cuda_bindings/**" = [ +# Keep examples opted-in to enforce naming conventions in example-local identifiers. +"cuda_bindings/{benchmarks,cuda,docs,tests}/**" = [ "N801", # invalid-class-name "N802", # invalid-function-name "N803", # invalid-argument-name "N806", # non-lowercase-variable-in-function "N816", # mixed-case-variable-in-global-scope ] +"cuda_bindings/{build_hooks.py,setup.py}" = ["N801", "N802", "N803", "N806", "N816"] # scripts and build tooling — print is the expected output method "toolshed/**" = ["T201"]