Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions cuda_core/cuda/core/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,3 +68,4 @@
Stream,
StreamOptions,
)
from cuda.core._tensor_map import TensorMapDescriptor
149 changes: 149 additions & 0 deletions cuda_core/cuda/core/_cpp/tensor_map.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,149 @@
// SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// SPDX-License-Identifier: Apache-2.0

#include "tensor_map_cccl.h"

#include <string.h>

#include <algorithm>
#include <exception>

#if defined(__has_include)
# if __has_include(<cuda/tma>)
# include <cuda/tma>
# define CUDA_CORE_HAS_CUDA_TMA 1
# else
# define CUDA_CORE_HAS_CUDA_TMA 0
# endif
# if __has_include(<dlpack/dlpack.h>)
# include <dlpack/dlpack.h>
# define CUDA_CORE_HAS_DLPACK_H 1
# else
# define CUDA_CORE_HAS_DLPACK_H 0
# endif
#else
# define CUDA_CORE_HAS_CUDA_TMA 0
# define CUDA_CORE_HAS_DLPACK_H 0
#endif

// Copy an error message into a caller-provided buffer, truncating so the
// result is always null-terminated.  A null `msg` clears the buffer; a null
// or zero-capacity buffer makes this a no-op.
static inline void cuda_core_write_err(char* err, size_t cap, const char* msg) noexcept
{
    // Nowhere to write the message.
    if (err == nullptr || cap == 0)
        return;
    // Null message means "clear any previous error text".
    if (msg == nullptr)
    {
        err[0] = '\0';
        return;
    }
    // Copy at most cap-1 bytes and terminate, so truncation is safe.
    const size_t limit = cap - 1;
    size_t len = ::strlen(msg);
    if (len > limit)
        len = limit;
    ::memcpy(err, msg, len);
    err[len] = '\0';
}

// Build a tiled CUtensorMap (TMA descriptor) from decomposed DLPack-style
// tensor metadata plus TMA tiling parameters.
//
// On success the encoded descriptor is copied into `out_tensor_map` (which
// must point to at least sizeof(CUtensorMap) bytes) and 0 is returned.  On
// failure a non-zero value is returned and a best-effort message is written
// into (err, err_cap).  The function never throws: all exceptions from CCCL
// are converted into the error-return contract.
int cuda_core_cccl_make_tma_descriptor_tiled(
    void* out_tensor_map,
    void* data,
    int device_type,
    int device_id,
    int ndim,
    const int64_t* shape,
    const int64_t* strides,
    uint8_t dtype_code,
    uint8_t dtype_bits,
    uint16_t dtype_lanes,
    const int* box_sizes,
    const int* elem_strides,
    int interleave_layout,
    int swizzle,
    int l2_fetch_size,
    int oob_fill,
    char* err,
    size_t err_cap) noexcept
{
#if !(CUDA_CORE_HAS_CUDA_TMA && CUDA_CORE_HAS_DLPACK_H)
    // Stub path: the required headers were not found at build time.  Mark
    // every parameter as used to keep -Wunused-parameter quiet, then report
    // a clear build-configuration error.
    (void)out_tensor_map;
    (void)data;
    (void)device_type;
    (void)device_id;
    (void)ndim;
    (void)shape;
    (void)strides;
    (void)dtype_code;
    (void)dtype_bits;
    (void)dtype_lanes;
    (void)box_sizes;
    (void)elem_strides;
    (void)interleave_layout;
    (void)swizzle;
    (void)l2_fetch_size;
    (void)oob_fill;
    cuda_core_write_err(err, err_cap, "CCCL <cuda/tma> and/or <dlpack/dlpack.h> not available at build time");
    return 1;
#else
    try
    {
        // Validate the pointers we dereference (or copy into) ourselves;
        // deeper validation is delegated to cuda::make_tma_descriptor.
        if (!out_tensor_map)
        {
            cuda_core_write_err(err, err_cap, "out_tensor_map is NULL");
            return 1;
        }
        if (!data)
        {
            cuda_core_write_err(err, err_cap, "tensor data pointer is NULL");
            return 1;
        }
        if (!shape || !box_sizes || ndim <= 0)
        {
            cuda_core_write_err(err, err_cap, "invalid rank/shape/box_sizes");
            return 1;
        }

        // Assemble a DLTensor view over the caller's metadata; no data is
        // copied here.  `strides` may be NULL (see header: contiguous case).
        DLTensor t{};
        t.data = data;
        t.device = {static_cast<DLDeviceType>(device_type), device_id};
        t.ndim = ndim;
        t.dtype.code = dtype_code;
        t.dtype.bits = dtype_bits;
        t.dtype.lanes = dtype_lanes;
        // CCCL promises not to mutate the arrays, but DLPack uses non-const pointers.
        t.shape = const_cast<int64_t*>(shape);
        t.strides = const_cast<int64_t*>(strides);
        t.byte_offset = 0;

        // The caller passes these as plain ints; the numeric values are
        // expected to match CCCL's tma_* enum values.
        const auto layout = static_cast<cuda::tma_interleave_layout>(interleave_layout);
        const auto swz = static_cast<cuda::tma_swizzle>(swizzle);
        const auto l2 = static_cast<cuda::tma_l2_fetch_size>(l2_fetch_size);
        const auto oob = static_cast<cuda::tma_oob_fill>(oob_fill);

        auto box = cuda::std::span<const int>(box_sizes, static_cast<size_t>(ndim));

        // Pick the CCCL overload: with explicit element strides when the
        // caller provided them, otherwise the all-ones-strides overload.
        CUtensorMap desc{};
        if (elem_strides)
        {
            auto es = cuda::std::span<const int>(elem_strides, static_cast<size_t>(ndim));
            desc = cuda::make_tma_descriptor(t, box, es, layout, swz, l2, oob);
        }
        else
        {
            desc = cuda::make_tma_descriptor(t, box, layout, swz, l2, oob);
        }

        // Publish the result and clear any stale error text.
        ::memcpy(out_tensor_map, &desc, sizeof(CUtensorMap));
        cuda_core_write_err(err, err_cap, nullptr);
        return 0;
    }
    catch (const std::exception& e)
    {
        // Surface CCCL's validation message verbatim to the caller.
        cuda_core_write_err(err, err_cap, e.what());
        return 1;
    }
    catch (...)
    {
        cuda_core_write_err(err, err_cap, "unknown error while building TMA descriptor");
        return 1;
    }
#endif
}
43 changes: 43 additions & 0 deletions cuda_core/cuda/core/_cpp/tensor_map_cccl.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
// SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// SPDX-License-Identifier: Apache-2.0

#ifndef CUDA_CORE_TENSOR_MAP_CCCL_H_
#define CUDA_CORE_TENSOR_MAP_CCCL_H_

#include <stddef.h>
#include <stdint.h>

#ifdef __cplusplus
extern "C" {
#endif

// Build a tiled CUtensorMap using CCCL's cuda::make_tma_descriptor (from <cuda/tma>).
//
// out_tensor_map must point to at least sizeof(CUtensorMap) bytes; on success
// the encoded descriptor is copied there.  (device_type, device_id) and
// (dtype_code, dtype_bits, dtype_lanes) follow DLPack's DLDevice / DLDataType
// field conventions.  interleave_layout, swizzle, l2_fetch_size and oob_fill
// are integer values of the corresponding cuda::tma_* enums (cast by the
// implementation).
//
// Returns 0 on success; on failure returns non-zero and writes a best-effort
// human-readable message into (err, err_cap) if provided.
int cuda_core_cccl_make_tma_descriptor_tiled(
    void* out_tensor_map,          // destination for the encoded CUtensorMap
    void* data,                    // tensor data pointer (DLTensor::data)
    int device_type,
    int device_id,
    int ndim,
    const int64_t* shape,          // length ndim
    const int64_t* strides,        // length ndim, or NULL for contiguous
    uint8_t dtype_code,
    uint8_t dtype_bits,
    uint16_t dtype_lanes,
    const int* box_sizes,          // length ndim
    const int* elem_strides,       // length ndim, or NULL for all-ones overload
    int interleave_layout,
    int swizzle,
    int l2_fetch_size,
    int oob_fill,
    char* err,                     // optional error buffer, may be NULL
    size_t err_cap) noexcept;      // capacity of err in bytes

#ifdef __cplusplus
}  // extern "C"
#endif

#endif  // CUDA_CORE_TENSOR_MAP_CCCL_H_
29 changes: 29 additions & 0 deletions cuda_core/cuda/core/_kernel_arg_handler.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ from cpython.mem cimport PyMem_Malloc, PyMem_Free
from libc.stdint cimport (intptr_t,
int8_t, int16_t, int32_t, int64_t,
uint8_t, uint16_t, uint32_t, uint64_t,)
from libc.string cimport memcpy
from libcpp cimport bool as cpp_bool
from libcpp.complex cimport complex as cpp_complex
from libcpp cimport nullptr
Expand All @@ -16,6 +17,8 @@ import ctypes
import numpy

from cuda.core._memory import Buffer
from cuda.core._tensor_map import TensorMapDescriptor as _TensorMapDescriptor_py
from cuda.core._tensor_map cimport TensorMapDescriptor
from cuda.core._utils.cuda_utils import driver
from cuda.bindings cimport cydriver

Expand Down Expand Up @@ -97,6 +100,9 @@ cdef object numpy_complex64 = numpy.complex64
cdef object numpy_complex128 = numpy.complex128


cdef object tensor_map_descriptor_type = _TensorMapDescriptor_py


# limitation due to cython/cython#534
ctypedef void* voidptr

Expand Down Expand Up @@ -124,6 +130,26 @@ cdef inline int prepare_arg(
return 0


cdef inline int prepare_tensor_map_arg(
        vector.vector[void*]& data,
        vector.vector[void*]& data_addresses,
        TensorMapDescriptor arg,
        const size_t idx) except -1:
    # Reject a descriptor that was created under an incompatible context
    # before doing any allocation work.
    arg._check_context_compat()
    # Copy the 128-byte CUtensorMap into a freshly allocated buffer rather
    # than aliasing arg's internal storage: ParamHolder owns and frees its
    # argument buffers on its own schedule, independent of `arg`'s lifetime.
    cdef void* buf = PyMem_Malloc(sizeof(cydriver.CUtensorMap))
    if buf is NULL:
        raise MemoryError("Failed to allocate memory for CUtensorMap")
    memcpy(buf, arg._get_data_ptr(), sizeof(cydriver.CUtensorMap))
    # The `data` slot is what ParamHolder.__dealloc__ frees; the
    # `data_addresses` slot is the pointer handed to cuLaunchKernel.
    data[idx] = buf
    data_addresses[idx] = buf
    return 0


cdef inline int prepare_ctypes_arg(
vector.vector[void*]& data,
vector.vector[void*]& data_addresses,
Expand Down Expand Up @@ -273,6 +299,9 @@ cdef class ParamHolder:
# it's a CUdeviceptr:
self.data_addresses[i] = <void*><intptr_t>(arg.handle.getPtr())
continue
elif arg_type is tensor_map_descriptor_type:
prepare_tensor_map_arg(self.data, self.data_addresses, <TensorMapDescriptor>arg, i)
continue
elif arg_type is bool:
prepare_arg[cpp_bool](self.data, self.data_addresses, arg, i)
continue
Expand Down
33 changes: 33 additions & 0 deletions cuda_core/cuda/core/_memoryview.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -316,6 +316,39 @@ cdef class StridedMemoryView:
view_buffer_strided(view, self.get_buffer(), layout, dtype, self.readonly)
return view

def as_tensor_map(
    self,
    box_dim,
    *,
    element_strides=None,
    data_type=None,
    interleave=None,
    swizzle=None,
    l2_promotion=None,
    oob_fill=None,
):
    """Create a tiled :obj:`TensorMapDescriptor` from this view.

    This is a convenience wrapper around
    :meth:`cuda.core._tensor_map.TensorMapDescriptor.from_tiled`.
    Keyword arguments left as ``None`` are omitted from the call so that
    ``from_tiled``'s own defaults apply.
    """
    from cuda.core._tensor_map import TensorMapDescriptor

    optional = {
        "element_strides": element_strides,
        "data_type": data_type,
        "interleave": interleave,
        "swizzle": swizzle,
        "l2_promotion": l2_promotion,
        "oob_fill": oob_fill,
    }
    # Forward only the options the caller actually supplied.
    kwargs = {name: value for name, value in optional.items() if value is not None}
    return TensorMapDescriptor.from_tiled(self, box_dim, **kwargs)

def copy_from(
self, other : StridedMemoryView, stream : Stream,
allocator = None,
Expand Down
18 changes: 18 additions & 0 deletions cuda_core/cuda/core/_tensor_map.pxd
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0

from cuda.bindings cimport cydriver
from libc.stdint cimport intptr_t


cdef class TensorMapDescriptor:
    # The encoded driver CUtensorMap struct, stored inline by value.
    cdef cydriver.CUtensorMap _tensor_map
    # Device ordinal recorded at creation time.
    cdef int _device_id
    # Address of the CUDA context captured at creation time, used by
    # _check_context_compat below.
    cdef intptr_t _context
    # Python references held to keep backing objects alive for the lifetime
    # of the descriptor.  NOTE(review): presumably the source buffer and the
    # strided-memory view it came from — confirm against the .pyx.
    cdef object _source_ref
    cdef object _view_ref
    # Opaque object used when building the repr — TODO confirm in the .pyx.
    cdef object _repr_info

    # Raises (propagating via the -1 sentinel) when the current CUDA context
    # is not compatible with the one captured in _context.
    cdef int _check_context_compat(self) except -1
    # Returns a raw pointer to the embedded CUtensorMap bytes.
    cdef void* _get_data_ptr(self)
Loading
Loading