diff --git a/backends/vulkan/runtime/gen_vulkan_spv.py b/backends/vulkan/runtime/gen_vulkan_spv.py index dab33fb3097..37aad2b9583 100644 --- a/backends/vulkan/runtime/gen_vulkan_spv.py +++ b/backends/vulkan/runtime/gen_vulkan_spv.py @@ -1091,6 +1091,7 @@ def compile_spirv(shader_paths_pair) -> Tuple[str, str]: return (spv_out_path, gen_out_path) vk_version = codegen_params.get("VK_VERSION", "1.1") + spv_version = codegen_params.get("SPV_VERSION", None) # Only proceed if a GLSL compiler was specified if self.glslc_path is not None: cmd_base = [ @@ -1104,6 +1105,9 @@ def compile_spirv(shader_paths_pair) -> Tuple[str, str]: "-I", output_dir, ] + # Add explicit SPIR-V version if specified (for extensions like GL_NV_cooperative_matrix2) + if spv_version is not None: + cmd_base.append("--target-spv=spv{}".format(spv_version)) cmd = cmd_base + self.glslc_flags try: diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_tiled_nv_cm2.glsl b/backends/vulkan/runtime/graph/ops/glsl/linear_tiled_nv_cm2.glsl new file mode 100644 index 00000000000..b17a10ee553 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/linear_tiled_nv_cm2.glsl @@ -0,0 +1,160 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +/* + * Floating-point matrix multiplication shader using GL_NV_cooperative_matrix2 + * extension for optimized performance on NVIDIA GPUs with tensor cores. + * + * RTX 4080 supported configuration: + * - Scope: Subgroup (NOT Workgroup!) + * - A, B, C types: all float16 + * - Tile sizes: M=16, N=16, K=16 + * + * Computes: output = input @ weight^T + bias + */ + +#version 450 core + +#extension GL_EXT_control_flow_attributes : enable +#extension GL_EXT_shader_16bit_storage : require +#extension GL_EXT_shader_explicit_arithmetic_types_float16 : require +#extension GL_KHR_memory_scope_semantics : enable +#extension GL_KHR_shader_subgroup_basic : enable +#extension GL_KHR_cooperative_matrix : enable +#extension GL_NV_cooperative_matrix2 : enable + +${define_required_extensions("buffer", DTYPE)} + +#define PRECISION ${PRECISION} + +#define T ${buffer_scalar_type(DTYPE)} +#define VEC4_T ${buffer_gvec_type(DTYPE, 4)} + +#define TILE_ROWS ${TILE_ROWS} +#define TILE_COLS ${TILE_COLS} + +// Block sizes for cooperative matrix - matching RTX 4080 supported config +#define BM 16 +#define BN 16 +#define BK 16 + +layout(std430) buffer; + +${layout_declare_tensor(B, "w", "t_output", DTYPE, "buffer", is_scalar_array=True)} +${layout_declare_tensor(B, "r", "t_input", DTYPE, "buffer", is_scalar_array=True)} +${layout_declare_tensor(B, "r", "t_weight", DTYPE, "buffer", is_scalar_array=True)} +${layout_declare_tensor(B, "r", "t_bias", DTYPE, "buffer", is_scalar_array=True)} + +${layout_declare_ubo(B, "ivec4", "output_sizes")} +${layout_declare_ubo(B, "ivec4", "input_sizes")} + +// Workgroup size: 1 threads (subgroup/warp size for Subgroup scope) +layout(local_size_x = 32, local_size_y = 1, local_size_z = 1) in; + +${layout_declare_spec_const(C, "int", "apply_bias", "0")} + +// Matrix type for float16 computation with SUBGROUP scope (RTX 4080 supported) + +#define MAT_TYPE float16_t +#define ACC_TYPE float16_t + +void main() { + // Get tile indices + const uint M = uint(output_sizes.y); // batch/rows + const uint N = uint(output_sizes.x); // output features + const uint K = uint(input_sizes.x); // input features + + // Calculate N4 for 
prepacked weight stride + const uint N4 = (N + 3) / 4; + const uint weight_stride = N4 * 4; + + const uint blocks_m = (M + BM - 1) / BM; + const uint ir = gl_WorkGroupID.x % blocks_m; // row tile index + const uint ic = gl_WorkGroupID.y; // column tile index + + // Early exit if out of bounds + if (ir * BM >= M || ic * BN >= N) { + return; + } + + // Create tensor layouts with clamping for boundary handling + tensorLayoutNV<2, gl_CooperativeMatrixClampModeConstantNV> tensorLayoutA = + createTensorLayoutNV(2, gl_CooperativeMatrixClampModeConstantNV); + tensorLayoutNV<2, gl_CooperativeMatrixClampModeConstantNV> tensorLayoutB = + createTensorLayoutNV(2, gl_CooperativeMatrixClampModeConstantNV); + tensorLayoutNV<2, gl_CooperativeMatrixClampModeConstantNV> tensorLayoutD = + createTensorLayoutNV(2, gl_CooperativeMatrixClampModeConstantNV); + + // Set dimensions and strides + // Input A: M x K, row-major (stride = K) + tensorLayoutA = setTensorLayoutDimensionNV(tensorLayoutA, M, K); + tensorLayoutA = setTensorLayoutStrideNV(tensorLayoutA, K, 1); + + // Weight B: K x N (prepacked), row-major + // Each row k has N elements: weight[k, 0:N] + tensorLayoutB = setTensorLayoutDimensionNV(tensorLayoutB, K, N); + tensorLayoutB = setTensorLayoutStrideNV(tensorLayoutB, N, 1); + + // Output D: M x N, row-major (stride = N) + // Output layout matches expected: [batch, out_features] + tensorLayoutD = setTensorLayoutDimensionNV(tensorLayoutD, M, N); + tensorLayoutD = setTensorLayoutStrideNV(tensorLayoutD, N, 1); + + // Transpose view for B matrix (weight is stored transposed) + tensorViewNV<2, false, 1, 0> tensorViewTranspose = createTensorViewNV(2, false, 1, 0); + + // Initialize accumulator - either with zeros or with bias (broadcast across rows) + coopmat sum; + + if (apply_bias != 0) { + // Create tensor layout for bias with broadcast (stride 0 in row dimension) + // This makes all rows read the same bias values + // Bias is 1D array of size N, we want to load it as BM x BN matrix + // where each row has the same values: bias[ic*BN], bias[ic*BN+1], ..., bias[ic*BN+BN-1] + tensorLayoutNV<2, gl_CooperativeMatrixClampModeConstantNV> tensorLayoutBias = + createTensorLayoutNV(2, gl_CooperativeMatrixClampModeConstantNV); + + // Dimension: BM rows x N columns (full bias width) + // Stride: 0 for rows (broadcast same values), 1 for columns + tensorLayoutBias = setTensorLayoutDimensionNV(tensorLayoutBias, BM, N); + tensorLayoutBias = setTensorLayoutStrideNV(tensorLayoutBias, 0, 1); // stride 0 = broadcast rows + + // Load bias into accumulator (slice the column range for this tile) + coopMatLoadTensorNV(sum, t_bias, 0, + sliceTensorLayoutNV(tensorLayoutBias, 0, BM, ic * BN, BN)); + } else { + // Initialize to zeros + sum = coopmat(ACC_TYPE(0.0)); + } + + // Loop over K dimension + const uint k_iters = (K + BK - 1) / BK; + + [[dont_unroll]] + for (uint block_k = 0, i = 0; i < k_iters; block_k += BK, ++i) { + // Use SUBGROUP scope for cooperative matrices (RTX 4080 supported) + coopmat mat_a; + coopmat mat_b; + + // Load A tile: input[ir*BM : ir*BM+BM, block_k : block_k+BK] + coopMatLoadTensorNV(mat_a, t_input, 0, + sliceTensorLayoutNV(tensorLayoutA, ir * BM, BM, block_k, BK)); + + // Load B tile: weight[block_k : block_k+BK, ic*BN : ic*BN+BN] + // Weight is prepacked in [K, N] layout, load directly without transpose + coopMatLoadTensorNV(mat_b, t_weight, 0, + sliceTensorLayoutNV(tensorLayoutB, block_k, BK, ic * BN, BN)); + + // Multiply and accumulate + sum = coopMatMulAdd(mat_a, mat_b, sum); + } + + // Store result 
directly without transpose (row-major output) + coopMatStoreTensorNV(sum, t_output, 0, + sliceTensorLayoutNV(tensorLayoutD, ir * BM, BM, ic * BN, BN)); +} diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_tiled_nv_cm2.yaml b/backends/vulkan/runtime/graph/ops/glsl/linear_tiled_nv_cm2.yaml new file mode 100644 index 00000000000..ae83385f568 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/linear_tiled_nv_cm2.yaml @@ -0,0 +1,31 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# Floating-point matrix multiplication shader using GL_NV_cooperative_matrix2 +# extension for optimized performance on NVIDIA GPUs with tensor cores. +# +# This shader computes: output = input @ weight^T + bias +# +# Tile sizes should be tuned for specific NVIDIA architectures: +# - SM80 (Ampere): 16x16x16 or 32x8x16 +# - SM90 (Hopper): Up to 64x32x16 + +linear_tiled_nv_cm2: + parameter_names_with_default_values: + DTYPE: float + # Tile sizes for cooperative matrix operations + # TILE_ROWS: rows of tile + # TILE_COLS: columns of tile + TILE_ROWS: 16 + TILE_COLS: 16 + # Use Vulkan 1.3 and SPIR-V 1.6 for GL_NV_cooperative_matrix2 support + VK_VERSION: "1.3" + SPV_VERSION: "1.6" + generate_variant_forall: + DTYPE: + - VALUE: half + shader_variants: + - NAME: linear_tiled_nv_cm2 diff --git a/backends/vulkan/runtime/graph/ops/glsl/pack_fp_linear_weight.glsl b/backends/vulkan/runtime/graph/ops/glsl/pack_fp_linear_weight.glsl new file mode 100644 index 00000000000..03dc7f6a10e --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/pack_fp_linear_weight.glsl @@ -0,0 +1,80 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#version 450 core + +${define_required_extensions(OUTPUT_STORAGE, DTYPE)} +${define_required_extensions("buffer", DTYPE)} + +#define PRECISION ${PRECISION} +#define VEC4_T ${texel_load_type(DTYPE, OUTPUT_STORAGE)} +#define T ${texel_load_component_type(DTYPE, OUTPUT_STORAGE)} + +$if OUTPUT_STORAGE == "buffer": + #define OUTPUT_BUFFER + +layout(std430) buffer; + +${layout_declare_tensor(B, "w", "t_packed_weight", DTYPE, OUTPUT_STORAGE, is_scalar_array=False)} +${layout_declare_tensor(B, "r", "t_weight", DTYPE, "buffer", is_scalar_array=True)} + +layout(push_constant) uniform restrict Block { + // Original weight sizes: [N, K] (out_features, in_features) + ivec2 orig_sizes; +}; + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +#include "common.glslh" + +void main() { + // The source weight tensor has size [W=K, H=N] in WHCN format. + // Each shader invocation processes one vec4 of the output. + // The thread position (n4, k) corresponds to the output block index. + // + // Output layout: [K, N4] where each element is a vec4 containing 4 + // consecutive N values for the same K position. + // This layout is optimized for tiled matrix multiplication where we + // iterate over K and accumulate into N. 
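+  //
+  // For example, with hypothetical sizes N = 6 and K = 2 (so N4 = 2), the
+  // packed buffer holds, in order:
+  //   k = 0: vec4(W[0,0], W[1,0], W[2,0], W[3,0]), vec4(W[4,0], W[5,0], 0, 0)
+  //   k = 1: vec4(W[0,1], W[1,1], W[2,1], W[3,1]), vec4(W[4,1], W[5,1], 0, 0)
+  // with out-of-range N lanes zero-filled by the bounds check below.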
+ // + // w_tile.data[k][n4] = vec4(W[n4*4+0, k], W[n4*4+1, k], W[n4*4+2, k], W[n4*4+3, k]) + + const int n4 = int(gl_GlobalInvocationID.x); + const int k = int(gl_GlobalInvocationID.y); + + const int K = orig_sizes.x; // in_features + const int N = orig_sizes.y; // out_features + + const int N4 = div_up_4(N); + + if (n4 >= N4 || k >= K) { + return; + } + + // Each output vec4 contains 4 consecutive N values for position k + // Input layout is [N, K] row-major, so element [n, k] is at index n*K + k + const int n_base = mul_4(n4); + + VEC4_T packed_data = VEC4_T(0); + + // Load 4 consecutive N values for position k + for (int ni = 0; ni < 4; ++ni) { + const int n = n_base + ni; + if (n < N) { + packed_data[ni] = T(t_weight[n * K + k]); + } + } + + // Write to output + // Output is [K, N4] where each vec4 has 4 N values for one K position +#ifdef OUTPUT_BUFFER + t_packed_weight[k * N4 + n4] = packed_data; +#else + imageStore(t_packed_weight, ivec3(n4, k, 0), packed_data); +#endif +} diff --git a/backends/vulkan/runtime/graph/ops/glsl/pack_fp_linear_weight.yaml b/backends/vulkan/runtime/graph/ops/glsl/pack_fp_linear_weight.yaml new file mode 100644 index 00000000000..de45ee84fff --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/pack_fp_linear_weight.yaml @@ -0,0 +1,19 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +pack_fp_linear_weight: + parameter_names_with_default_values: + DTYPE: float + OUTPUT_STORAGE: buffer + generate_variant_forall: + OUTPUT_STORAGE: + - VALUE: buffer + - VALUE: texture3d + DTYPE: + - VALUE: float + - VALUE: half + shader_variants: + - NAME: pack_fp_linear_weight diff --git a/backends/vulkan/runtime/graph/ops/impl/LinearNVCoopMat.cpp b/backends/vulkan/runtime/graph/ops/impl/LinearNVCoopMat.cpp new file mode 100644 index 00000000000..0756af366e4 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/impl/LinearNVCoopMat.cpp @@ -0,0 +1,291 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include + +#include +#include + +#include + +namespace vkcompute { + +// +// Shader dispatch utilities +// + +void resize_linear_tiled_node( + ComputeGraph* graph, + const std::vector& args, + const std::vector& extra_args) { + (void)extra_args; + + const ValueRef output = args.at(0).refs.at(0); + const ValueRef input = args.at(1).refs.at(0); + const ValueRef weight_data = extra_args.at(0); + + std::vector input_sizes = graph->sizes_of(input); + std::vector weight_sizes = graph->sizes_of(weight_data); + + // input: [M, K], weight: [N, K] -> output: [M, N] + const int64_t M = utils::val_at(-2, input_sizes); + const int64_t N = utils::val_at(-2, weight_sizes); + + std::vector new_out_sizes(input_sizes.size()); + if (input_sizes.size() == 2) { + new_out_sizes.at(0) = M; + new_out_sizes.at(1) = N; + } else { + new_out_sizes.at(0) = input_sizes.at(0); + new_out_sizes.at(1) = M; + new_out_sizes.at(2) = N; + } + + graph->virtual_resize(output, new_out_sizes); +} + +utils::uvec3 linear_tiled_nv_cm2_global_wg_size( + ComputeGraph* graph, + const vkapi::ShaderInfo& shader, + const std::vector& args, + const std::vector& resize_args) { + (void)shader; + (void)resize_args; + + const ValueRef output = args.at(0).refs.at(0); + + std::vector out_sizes = graph->sizes_of(output); + // Width dimension (N = out_features) + const uint32_t N = utils::val_at(-1, out_sizes); + // Height dimension (M = batch size) + const uint32_t M = utils::val_at(-2, out_sizes); + + // NV cooperative matrix 2 shader uses BM=16 x BN=16 tiles + // Following ggml's dispatch pattern: x = blocks_m * k_split, y = blocks_n + const uint32_t BM = 16; + const uint32_t BN = 16; + + const uint32_t blocks_m = utils::div_up(M, BM); + const uint32_t blocks_n = utils::div_up(N, BN); + + // x = blocks_m (row tiles), y = blocks_n (column tiles) + return {blocks_m * 32, blocks_n, 1}; +} + +// Fixed local workgroup size for NV cooperative matrix 2 linear shader +// Must match the shader's layout(local_size_x = 32) +utils::uvec3 linear_tiled_nv_cm2_local_wg_size( + ComputeGraph* graph, + const vkapi::ShaderInfo& shader, + const utils::uvec3& global_workgroup_size, + const std::vector& args, + const std::vector& resize_args) { + (void)graph; + (void)shader; + (void)global_workgroup_size; + (void)args; + (void)resize_args; + + // NV cooperative matrix 2 with Subgroup scope always uses 32 threads (subgroup size) + // This matches the shader's layout(local_size_x = 32, local_size_y = 1, local_size_z = 1) + return {32, 1, 1}; +} + +// +// Prepacking +// + +ValueRef prepack_fp_linear_weight( + ComputeGraph& graph, + const ValueRef weight_data, + const utils::StorageType output_storage_type) { + std::vector weight_sizes = graph.sizes_of(weight_data); + const int64_t ndim = graph.dim_of(weight_data); + + // Weight tensor has shape [N, K] (out_features, in_features) + const int64_t K = weight_sizes.at(ndim - 1); + const int64_t N = weight_sizes.at(ndim - 2); + + // Calculate output sizes + // Output layout: [K, N4] where each element is a vec4 containing 4 + // consecutive N values for one K position + const int64_t N4 = utils::div_up(N, int64_t(4)); + + // Determine if we need to fall back to buffer storage + utils::StorageType storage_type = output_storage_type; + if (storage_type == utils::kTexture3D) { + uint32_t max_extent = graph.context()->adapter_ptr()->max_texture2d_dim(); + if (N4 > max_extent || K > max_extent) { + storage_type = utils::kBuffer; + } + } + + // Output tensor shape: [K, N4 * 4] for the prepacked weights + // The width is 
N4 * 4 because each vec4 access reads 4 consecutive elements + // (matching the standard convention where sizes are in terms of individual + // elements, not vec4s) + std::vector packed_weight_sizes = {K, N4 * 4}; + + ValueRef packed_weight = graph.add_tensor( + packed_weight_sizes, + graph.dtype_of(weight_data), + storage_type, + utils::kWidthPacked); + + // Store original sizes for the shader + utils::ivec2 orig_sizes = { + utils::safe_downcast(K), utils::safe_downcast(N)}; + + utils::uvec3 global_wg_size = { + utils::safe_downcast(N4), + utils::safe_downcast(K), + 1u}; + + std::string kernel_name = "pack_fp_linear_weight"; + add_storage_type_suffix(kernel_name, storage_type); + add_dtype_suffix(kernel_name, graph.dtype_of(weight_data)); + + graph.prepack_nodes().emplace_back(new PrepackNode( + graph, + VK_KERNEL_FROM_STR(kernel_name), + global_wg_size, + graph.create_local_wg_size(global_wg_size), + // Inputs and Outputs + weight_data, + packed_weight, + // UBOs + {}, + // Specialization Constants + {}, + // Push Constants + {PushConstantDataInfo(&orig_sizes, sizeof(utils::ivec2))})); + + return packed_weight; +} + +// +// Linear Dispatch +// + +void add_linear_tiled_node( + ComputeGraph& graph, + const ValueRef input, + const ValueRef weight_data, + const ValueRef packed_weight, + const ValueRef bias_data, + const ValueRef packed_bias, + const ValueRef output) { + // Use CM2 kernel for buffer storage (GL_NV_cooperative_matrix2) + std::string kernel_name = "linear_tiled_nv_cm2"; + add_dtype_suffix(kernel_name, graph.dtype_of(output)); + + vkapi::ParamsBindList param_buffers = { + graph.sizes_ubo(output), graph.sizes_ubo(input)}; + + int32_t apply_bias = graph.val_is_not_none(bias_data) ? 1 : 0; + + graph.execute_nodes().emplace_back(new DynamicDispatchNode( + graph, + VK_KERNEL_FROM_STR(kernel_name), + linear_tiled_nv_cm2_global_wg_size, + linear_tiled_nv_cm2_local_wg_size, + // Inputs and Outputs + {{output, vkapi::kWrite}, + {{input, packed_weight, packed_bias}, vkapi::kRead}}, + // Shader params buffers + param_buffers, + // Push Constants + {}, + // Specialization Constants + {apply_bias}, + // Resize args + {weight_data}, + // Resizing Logic + resize_linear_tiled_node)); +} + +// +// High-level operator implementation +// + +void linear_nv_cm2_impl( + ComputeGraph& graph, + const ValueRef input, + const ValueRef weight_data, + const ValueRef bias_data, + const ValueRef output) { + // Check that VK_NV_cooperative_matrix2 extension is available + // This is required for the linear_tiled_nv_cm2 shader + VK_CHECK_COND( + graph.context()->adapter_ptr()->supports_nv_cooperative_matrix2(), + "linear_nv_cm2 requires VK_NV_cooperative_matrix2 extension which is " + "not available on this device. 
Please use a device that supports " + "VK_NV_cooperative_matrix2 or use a different linear implementation."); + + // Check input dimensions + std::vector input_sizes = graph.sizes_of(input); + VK_CHECK_COND( + input_sizes.size() == 2 || input_sizes.size() == 3, + "Input must be 2D or 3D tensor"); + + // Determine storage type based on output + utils::StorageType storage_type = graph.storage_type_of(output); + + // For the tiled implementation, we need the input to be width-packed + // (i.e., K is along the width/x dimension) + ValueRef input_W_packed = input; + if (graph.estimate_memory_layout_of(input) != utils::kWidthPacked) { + input_W_packed = graph.add_tensor_like(input, utils::kWidthPacked); + auto viewFn = VK_GET_OP_FN("aten.view_copy.default"); + viewFn(graph, {input, graph.add_none(), input_W_packed}); + } + + // Prepack weight + ValueRef packed_weight = + prepack_fp_linear_weight(graph, weight_data, storage_type); + + // Create dummy bias tensor if bias is not provided + TmpTensor dummy_bias( + &graph, {}, graph.dtype_of(output), utils::kBuffer, utils::kWidthPacked); + + ValueRef packed_bias = dummy_bias.vref; + if (graph.val_is_not_none(bias_data)) { + packed_bias = + prepack_standard(graph, bias_data, utils::kBuffer, utils::kWidthPacked); + } + + add_linear_tiled_node( + graph, + input_W_packed, + weight_data, + packed_weight, + bias_data, + packed_bias, + output); +} + +// +// Registered operator entry point +// + +void linear_nv_cm2( + ComputeGraph& graph, + const std::vector& args) { + const ValueRef input = args.at(0); + const ValueRef weight_data = args.at(1); + const ValueRef bias_data = args.at(2); + const ValueRef output = args.at(3); + + linear_nv_cm2_impl(graph, input, weight_data, bias_data, output); +} + +REGISTER_OPERATORS { + VK_REGISTER_OP(etvk.linear_nv_cm2.default, linear_nv_cm2); +} + +} // namespace vkcompute diff --git a/backends/vulkan/test/custom_ops/impl/TestLinear.cpp b/backends/vulkan/test/custom_ops/impl/TestLinear.cpp index bdef4d7bafb..a3421e8c439 100644 --- a/backends/vulkan/test/custom_ops/impl/TestLinear.cpp +++ b/backends/vulkan/test/custom_ops/impl/TestLinear.cpp @@ -18,7 +18,7 @@ namespace vkcompute { // Implementation selector values: // 0 = default (use standard aten.linear.default dispatch) -// 1 = alternative path (also uses aten.linear.default for correctness) +// 1 = experimental tiled linear implementation void test_fp_linear( ComputeGraph& graph, @@ -33,11 +33,14 @@ void test_fp_linear( // Extract the impl_selector flag int32_t impl_selector = graph.extract_scalar(impl_selector_ref); - if (impl_selector == 0 || impl_selector == 1) { - // Both paths use the standard linear operator dispatch - // impl_selector=1 is provided as a hook for future alternative implementations + if (impl_selector == 0) { + // Use standard linear operator dispatch std::vector linear_args = {input, weight_data, bias_data, output}; VK_GET_OP_FN("aten.linear.default")(graph, linear_args); + } else if (impl_selector == 1) { + // Use experimental tiled linear implementation + std::vector linear_args = {input, weight_data, bias_data, output}; + VK_GET_OP_FN("etvk.linear_nv_cm2.default")(graph, linear_args); } else { VK_THROW("Invalid impl_selector value: ", impl_selector); } diff --git a/backends/vulkan/test/custom_ops/test_fp_linear.cpp b/backends/vulkan/test/custom_ops/test_fp_linear.cpp index 97178f91f77..d66a05f0dcc 100644 --- a/backends/vulkan/test/custom_ops/test_fp_linear.cpp +++ b/backends/vulkan/test/custom_ops/test_fp_linear.cpp @@ -7,20 +7,22 @@ #include 
 #include
+#include
 #include
 #include
 #include

+#include "nv_utils.h"
 #include "utils.h"

-// #define DEBUG_MODE
+#define DEBUG_MODE

 using namespace executorch::vulkan::prototyping;
 using namespace vkcompute;

-static constexpr int64_t kRefDimSizeLimit = 512;
+static constexpr int64_t kRefDimSizeLimit = 2048;

 // Configuration for linear layer test cases
 struct LinearConfig {
@@ -56,7 +58,7 @@ TestCase create_test_case_from_config(
       " O=" + std::to_string(config.out_features) + " " + storage_str + " " +
       dtype_str + bias_str;
   if (impl_selector == 1) {
-    test_name += " L"; // Legacy/alternative implementation
+    test_name += " Experimental"; // NV cooperative matrix implementation
   }
   test_case.set_name(test_name);

@@ -118,7 +120,8 @@ TestCase create_test_case_from_config(
   if (dtype == vkapi::kFloat) {
     test_case.set_abs_tolerance(1e-4f);
   } else {
-    test_case.set_abs_tolerance(1e-2f);
+    // FP16 cooperative matrix operations have slightly more numerical variance
+    test_case.set_abs_tolerance(2e-1f);
   }

   return test_case;
@@ -128,25 +131,26 @@ std::vector<TestCase> generate_linear_easy_cases() {
   std::vector<TestCase> test_cases;

-  // Simple configuration for debugging
+  // Configuration sized against the 16x16 cooperative matrix tiles
   LinearConfig config = {
-      4, // batch_size
-      64, // in_features
-      32, // out_features
+      4, // batch_size (fits within a single 16-row M tile)
+      256, // in_features
+      256, // out_features (16 column tiles of BN = 16)
       true, // has_bias
       "ACCU",
   };

   std::vector<utils::StorageType> storage_types = {utils::kBuffer};
-  std::vector<vkapi::ScalarType> dtypes = {vkapi::kFloat, vkapi::kHalf};
+  // Use FP16 for cooperative matrix shader
+  std::vector<vkapi::ScalarType> dtypes = {vkapi::kHalf};

   for (const utils::StorageType storage_type : storage_types) {
     for (const vkapi::ScalarType dtype : dtypes) {
-      config.test_case_name = "ACCU";
+      config.test_case_name = "PERF";

       // Test with impl_selector = 0 (default)
       test_cases.push_back(
           create_test_case_from_config(config, dtype, storage_type, 0));

-      // Test with impl_selector = 1 (alternative)
+      // Test with impl_selector = 1 (experimental NV cooperative matrix implementation)
       test_cases.push_back(
           create_test_case_from_config(config, dtype, storage_type, 1));
     }
@@ -176,9 +180,8 @@ std::vector<TestCase> generate_linear_test_cases() {
       {64, 768, 768, true, "PERF"},
   };

-  std::vector<utils::StorageType> storage_types = {
-      utils::kTexture3D, utils::kBuffer};
-  std::vector<vkapi::ScalarType> dtypes = {vkapi::kFloat, vkapi::kHalf};
+  std::vector<utils::StorageType> storage_types = {utils::kBuffer};
+  std::vector<vkapi::ScalarType> dtypes = {vkapi::kHalf};

   for (auto& config : configs) {
     bool is_performance = config.batch_size > kRefDimSizeLimit ||
@@ -260,22 +263,64 @@ void linear_reference_impl(TestCase& test_case) {
   auto& input_data = input_spec.get_half_data();
   auto& weight_data = weight_spec.get_half_data();

+  // IEEE 754 FP16 to float conversion helper
+  auto half_to_float = [](uint16_t h) -> float {
+    uint32_t sign = (h >> 15) & 0x1;
+    uint32_t exponent = (h >> 10) & 0x1F;
+    uint32_t mantissa = h & 0x3FF;
+
+    uint32_t f_sign = sign << 31;
+    uint32_t f_exp;
+    uint32_t f_mant;
+
+    if (exponent == 0) {
+      if (mantissa == 0) {
+        f_exp = 0;
+        f_mant = 0;
+      } else {
+        // Denormalized
+        uint32_t exp_adj = 1;
+        uint32_t mant_temp = mantissa;
+        while ((mant_temp & 0x400) == 0) {
+          mant_temp <<= 1;
+          exp_adj--;
+        }
+        mant_temp &= 0x3FF;
+        f_exp = (127 - 15 + exp_adj) << 23;
+        f_mant = mant_temp << 13;
+      }
+    } else if (exponent == 31) {
+      f_exp = 0xFF << 23;
+      f_mant = mantissa << 13;
+    } else {
+      f_exp = (exponent + 127 - 15) << 23;
+      f_mant = mantissa << 13;
+    }
+
+    uint32_t bits = f_sign | f_exp | f_mant;
+    float result;
+    std::memcpy(&result, &bits,
sizeof(result)); + return result; + }; + // Perform linear operation: output = input @ weight^T + bias for (int64_t b = 0; b < batch_size; ++b) { for (int64_t o = 0; o < out_features; ++o) { float sum = 0.0f; for (int64_t i = 0; i < in_features; ++i) { // input[b, i] * weight[o, i] + // Convert from IEEE 754 FP16 to float int64_t input_idx = b * in_features + i; int64_t weight_idx = o * in_features + i; - sum += static_cast(input_data[input_idx]) * - static_cast(weight_data[weight_idx]); + float input_val = half_to_float(input_data[input_idx]); + float weight_val = half_to_float(weight_data[weight_idx]); + sum += input_val * weight_val; } // Add bias if present if (has_bias) { auto& bias_data = bias_spec.get_half_data(); - sum += static_cast(bias_data[o]); + sum += half_to_float(bias_data[o]); } int64_t output_idx = b * out_features + o; @@ -310,16 +355,19 @@ int64_t linear_flop_calculator(const TestCase& test_case) { return flop; } -int main(int argc, char* argv[]) { +int main(int /* argc */, char* /* argv */[]) { set_debugging(false); set_print_output(false); - set_print_latencies(false); + set_print_latencies(true); set_use_gpu_timestamps(false); print_performance_header(); std::cout << "FP32/FP16 Linear Layer Benchmark" << std::endl; print_separator(); + // Query cooperative matrix properties to understand what's supported + queryCooperativeMatrixProperties(); + ReferenceComputeFunc ref_fn = reference_impl; // Execute test cases using the framework with custom FLOP calculator
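The tile and packing arithmetic above can be checked on the host with a small standalone program. A minimal sketch follows, assuming only values visible in this diff (BM = BN = 16, a 32-wide subgroup, the {blocks_m * 32, blocks_n, 1} global size with a {32, 1, 1} local size, and the [K, N4] vec4 weight layout); the file name check_dispatch.cpp and the example element indices are illustrative and not part of the runtime.

// Standalone sanity check for the dispatch sizing used by
// linear_tiled_nv_cm2 and the index mapping used by pack_fp_linear_weight.
// Build with: g++ -std=c++17 check_dispatch.cpp && ./a.out
#include <cassert>
#include <cstdint>
#include <cstdio>

constexpr uint32_t BM = 16;       // rows per cooperative-matrix tile
constexpr uint32_t BN = 16;       // columns per cooperative-matrix tile
constexpr uint32_t SUBGROUP = 32; // local_size_x in the shader

constexpr uint32_t div_up(uint32_t a, uint32_t b) {
  return (a + b - 1) / b;
}

int main() {
  // Mirror linear_tiled_nv_cm2_global_wg_size: one 32-thread workgroup
  // (one subgroup) per BM x BN output tile.
  const uint32_t M = 64, N = 768, K = 768; // one of the PERF configs above
  const uint32_t blocks_m = div_up(M, BM);
  const uint32_t blocks_n = div_up(N, BN);
  const uint32_t global_x = blocks_m * SUBGROUP;
  const uint32_t global_y = blocks_n;
  // With local size {32, 1, 1}, the dispatch is blocks_m x blocks_n workgroups.
  assert(global_x / SUBGROUP == blocks_m);
  std::printf("workgroups: %u x %u (tiles over a %ux%u output)\n",
              global_x / SUBGROUP, global_y, M, N);

  // Mirror pack_fp_linear_weight: element W[n, k] of the [N, K] weight ends up
  // in lane (n % 4) of the vec4 stored at flat index k * N4 + n / 4.
  const uint32_t N4 = div_up(N, 4);
  const uint32_t n = 5, k = 3;              // arbitrary in-range element
  const uint32_t src_index = n * K + k;     // row-major [N, K] input
  const uint32_t dst_vec4 = k * N4 + n / 4; // packed [K, N4] output
  const uint32_t dst_lane = n % 4;
  std::printf("W[%u,%u]: src %u -> vec4 %u lane %u\n",
              n, k, src_index, dst_vec4, dst_lane);
  return 0;
}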