From 0bb3839d8ec1dce105d2265be27c5b04a42dd428 Mon Sep 17 00:00:00 2001
From: hongbinghu
Date: Thu, 19 Feb 2026 22:00:40 -0800
Subject: [PATCH] [ET-VK] FP Linear benchmark + test op

Adds a floating-point linear test op and a benchmark harness for
exercising new linear layer implementations.

Differential Revision: [D91945036](https://our.internmc.facebook.com/intern/diff/D91945036/)

[ghstack-poisoned]
---
 .../test/custom_ops/impl/TestLinear.cpp       |  50 +++
 backends/vulkan/test/custom_ops/targets.bzl   |   2 +
 .../vulkan/test/custom_ops/test_fp_linear.cpp | 344 ++++++++++++++++++
 3 files changed, 396 insertions(+)
 create mode 100644 backends/vulkan/test/custom_ops/impl/TestLinear.cpp
 create mode 100644 backends/vulkan/test/custom_ops/test_fp_linear.cpp

diff --git a/backends/vulkan/test/custom_ops/impl/TestLinear.cpp b/backends/vulkan/test/custom_ops/impl/TestLinear.cpp
new file mode 100644
index 00000000000..bdef4d7bafb
--- /dev/null
+++ b/backends/vulkan/test/custom_ops/impl/TestLinear.cpp
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/backends/vulkan/runtime/graph/ops/OperatorRegistry.h>
+
+#include <executorch/backends/vulkan/runtime/api/api.h>
+#include <executorch/backends/vulkan/runtime/graph/ComputeGraph.h>
+#include <executorch/backends/vulkan/runtime/graph/Logging.h>
+
+#include <vector>
+
+namespace vkcompute {
+
+// Implementation selector values:
+// 0 = default (use standard aten.linear.default dispatch)
+// 1 = alternative path (also uses aten.linear.default for correctness)
+
+void test_fp_linear(
+    ComputeGraph& graph,
+    const std::vector<ValueRef>& args) {
+  int32_t idx = 0;
+  const ValueRef input = args.at(idx++);
+  const ValueRef weight_data = args.at(idx++);
+  const ValueRef bias_data = args.at(idx++);
+  const ValueRef impl_selector_ref = args.at(idx++);
+  const ValueRef output = args.at(idx++);
+
+  // Extract the impl_selector flag
+  int32_t impl_selector = graph.extract_scalar<int32_t>(impl_selector_ref);
+
+  if (impl_selector == 0 || impl_selector == 1) {
+    // Both paths currently use the standard linear operator dispatch;
+    // impl_selector=1 is a hook for future alternative implementations.
+    std::vector<ValueRef> linear_args = {input, weight_data, bias_data, output};
+    VK_GET_OP_FN("aten.linear.default")(graph, linear_args);
+  } else {
+    VK_THROW("Invalid impl_selector value: ", impl_selector);
+  }
+}
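+
+// A hypothetical sketch of how impl_selector == 1 could branch to a separate
+// implementation in the future (the op name below is illustrative and is not
+// a registered operator):
+//
+//   if (impl_selector == 1) {
+//     std::vector<ValueRef> alt_args = {input, weight_data, bias_data, output};
+//     VK_GET_OP_FN("test_etvk.fp_linear_alt.default")(graph, alt_args);
+//     return;
+//   }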
+
+REGISTER_OPERATORS {
+  VK_REGISTER_OP(test_etvk.test_fp_linear.default, test_fp_linear);
+}
+
+} // namespace vkcompute
diff --git a/backends/vulkan/test/custom_ops/targets.bzl b/backends/vulkan/test/custom_ops/targets.bzl
index 73b1e343bbe..46038af314c 100644
--- a/backends/vulkan/test/custom_ops/targets.bzl
+++ b/backends/vulkan/test/custom_ops/targets.bzl
@@ -97,3 +97,5 @@ def define_common_targets(is_fbcode = False):
     define_custom_op_test_binary("test_q8ta_conv2d")
     define_custom_op_test_binary("test_q8ta_conv2d_pw")
     define_custom_op_test_binary("test_q8ta_conv2d_dw")
+    define_custom_op_test_binary("q8ta_q8ta_q8to_add")
+    define_custom_op_test_binary("test_fp_linear")
diff --git a/backends/vulkan/test/custom_ops/test_fp_linear.cpp b/backends/vulkan/test/custom_ops/test_fp_linear.cpp
new file mode 100644
index 00000000000..97178f91f77
--- /dev/null
+++ b/backends/vulkan/test/custom_ops/test_fp_linear.cpp
@@ -0,0 +1,344 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+// All rights reserved.
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <iostream>
+#include <string>
+
+#include <stdexcept>
+#include <vector>
+
+#include <executorch/backends/vulkan/runtime/api/api.h>
+
+#include "utils.h"
+
+// #define DEBUG_MODE
+
+using namespace executorch::vulkan::prototyping;
+
+using namespace vkcompute;
+
+static constexpr int64_t kRefDimSizeLimit = 512;
+
+// Configuration for linear layer test cases
+struct LinearConfig {
+  int64_t batch_size;
+  int64_t in_features;
+  int64_t out_features;
+  bool has_bias;
+  std::string test_case_name;
+};
+
+// Utility function to create a test case from a LinearConfig
+TestCase create_test_case_from_config(
+    const LinearConfig& config,
+    vkapi::ScalarType dtype,
+    utils::StorageType storage_type,
+    int32_t impl_selector = 0) {
+  TestCase test_case;
+
+  utils::GPUMemoryLayout memory_layout = storage_type == utils::kBuffer
+      ? utils::kWidthPacked
+      : utils::kChannelsPacked;
+
+  // Create the test case name
+  // Format: ACCU/PERF B=batch I=in_features O=out_features Tex/Buf
+  std::string prefix = config.test_case_name.substr(0, 4); // "ACCU" or "PERF"
+  std::string storage_str = storage_type == utils::kBuffer ? "Buf" : "Tex";
+  std::string dtype_str = dtype == vkapi::kFloat ? "fp32" : "fp16";
+  std::string bias_str = config.has_bias ? "+bias" : "";
+
+  std::string test_name = prefix + " " + "B=" +
+      std::to_string(config.batch_size) +
+      " I=" + std::to_string(config.in_features) +
+      " O=" + std::to_string(config.out_features) + " " + storage_str + " " +
+      dtype_str + bias_str;
+  if (impl_selector == 1) {
+    test_name += " L"; // Legacy/alternative implementation
+  }
+  test_case.set_name(test_name);
+
+  // Use the custom test operator for this test case
+  std::string operator_name = "test_etvk.test_fp_linear.default";
+  test_case.set_operator_name(operator_name);
+
+  // Input tensor - [batch_size, in_features]
+  std::vector<int64_t> input_size = {config.batch_size, config.in_features};
+  ValueSpec input_tensor(
+      input_size, dtype, storage_type, memory_layout, DataGenType::RANDOM);
+
+  if (debugging()) {
+    print_valuespec_data(input_tensor, "input_tensor", false, 64);
+  }
+
+  // Weight tensor - [out_features, in_features]
+  std::vector<int64_t> weight_size = {config.out_features, config.in_features};
+  ValueSpec weight_tensor(
+      weight_size, dtype, storage_type, memory_layout, DataGenType::RANDOM);
+  weight_tensor.set_constant(true);
+
+  if (debugging()) {
+    print_valuespec_data(weight_tensor, "weight_tensor", false, 64);
+  }
+
+  // Bias tensor (optional) - [out_features]
+  ValueSpec bias_tensor;
+  if (config.has_bias) {
+    std::vector<int64_t> bias_size = {config.out_features};
+    bias_tensor = ValueSpec(
+        bias_size, dtype, storage_type, utils::kWidthPacked, DataGenType::RANDOM);
+    bias_tensor.set_constant(true);
+
+    if (debugging()) {
+      print_valuespec_data(bias_tensor, "bias_tensor", false, 64);
+    }
+  } else {
+    bias_tensor = ValueSpec();
+    bias_tensor.set_none(true);
+  }
+
+  // Output tensor - [batch_size, out_features]
+  std::vector<int64_t> output_size = {config.batch_size, config.out_features};
+  ValueSpec output_tensor(
+      output_size, dtype, storage_type, memory_layout, DataGenType::ZEROS);
+
+  // Add the impl_selector parameter
+  ValueSpec impl_selector_spec(impl_selector);
+
+  // Add all specs to the test case
+  test_case.add_input_spec(input_tensor);
+  test_case.add_input_spec(weight_tensor);
+  test_case.add_input_spec(bias_tensor);
+  test_case.add_input_spec(impl_selector_spec);
+  test_case.add_output_spec(output_tensor);
+
+  // Set tolerance based on dtype
+  if (dtype == vkapi::kFloat) {
+    test_case.set_abs_tolerance(1e-4f);
+  } else {
+    test_case.set_abs_tolerance(1e-2f);
+  }
+
+  return test_case;
+}
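+
+// For example, config {4, 128, 64, true, "ACCU"} with fp32 buffer storage and
+// impl_selector=0 produces a test case named "ACCU B=4 I=128 O=64 Buf fp32+bias";
+// impl_selector=1 appends " L" to the name.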
+
+// Generate easy test cases for debugging
+std::vector<TestCase> generate_linear_easy_cases() {
+  std::vector<TestCase> test_cases;
+
+  // Simple configuration for debugging
+  LinearConfig config = {
+      4, // batch_size
+      64, // in_features
+      32, // out_features
+      true, // has_bias
+      "ACCU",
+  };
+
+  std::vector<utils::StorageType> storage_types = {utils::kBuffer};
+  std::vector<vkapi::ScalarType> dtypes = {vkapi::kFloat, vkapi::kHalf};
+
+  for (const utils::StorageType storage_type : storage_types) {
+    for (const vkapi::ScalarType dtype : dtypes) {
+      config.test_case_name = "ACCU";
+      // Test with impl_selector = 0 (default)
+      test_cases.push_back(
+          create_test_case_from_config(config, dtype, storage_type, 0));
+      // Test with impl_selector = 1 (alternative)
+      test_cases.push_back(
+          create_test_case_from_config(config, dtype, storage_type, 1));
+    }
+  }
+
+  return test_cases;
+}
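+
+// When DEBUG_MODE is defined at the top of this file, main() below selects
+// these easy cases and the reduced run parameters (0, 1) instead of the full
+// sweep with (5, 40).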
"PERF" : "ACCU"; + // Test with impl_selector = 0 (default) + test_cases.push_back( + create_test_case_from_config(config, dtype, storage_type, 0)); + // Test with impl_selector = 1 (alternative) + test_cases.push_back( + create_test_case_from_config(config, dtype, storage_type, 1)); + } + } + } + + return test_cases; +} + +// Reference implementation for fp32/fp16 linear layer +void linear_reference_impl(TestCase& test_case) { + // Extract input specifications + const ValueSpec& input_spec = test_case.inputs()[0]; + const ValueSpec& weight_spec = test_case.inputs()[1]; + const ValueSpec& bias_spec = test_case.inputs()[2]; + ValueSpec& output_spec = test_case.outputs()[0]; + + // Get tensor dimensions + auto input_sizes = input_spec.get_tensor_sizes(); // [batch, in_features] + auto weight_sizes = weight_spec.get_tensor_sizes(); // [out_features, in_features] + auto output_sizes = output_spec.get_tensor_sizes(); // [batch, out_features] + + int64_t batch_size = input_sizes[0]; + int64_t in_features = input_sizes[1]; + int64_t out_features = weight_sizes[0]; + + // Skip for large tensors since computation time will be extremely slow + if (batch_size > kRefDimSizeLimit || in_features > kRefDimSizeLimit || + out_features > kRefDimSizeLimit) { + throw std::invalid_argument( + "One or more dimensions exceed the allowed limit for reference implementation."); + } + + bool has_bias = !bias_spec.is_none(); + + // Get raw data pointers based on dtype + auto& ref_data = output_spec.get_ref_float_data(); + ref_data.resize(batch_size * out_features); + + if (input_spec.dtype == vkapi::kFloat) { + auto& input_data = input_spec.get_float_data(); + auto& weight_data = weight_spec.get_float_data(); + + // Perform linear operation: output = input @ weight^T + bias + for (int64_t b = 0; b < batch_size; ++b) { + for (int64_t o = 0; o < out_features; ++o) { + float sum = 0.0f; + for (int64_t i = 0; i < in_features; ++i) { + // input[b, i] * weight[o, i] + int64_t input_idx = b * in_features + i; + int64_t weight_idx = o * in_features + i; + sum += input_data[input_idx] * weight_data[weight_idx]; + } + + // Add bias if present + if (has_bias) { + auto& bias_data = bias_spec.get_float_data(); + sum += bias_data[o]; + } + + int64_t output_idx = b * out_features + o; + ref_data[output_idx] = sum; + } + } + } else if (input_spec.dtype == vkapi::kHalf) { + auto& input_data = input_spec.get_half_data(); + auto& weight_data = weight_spec.get_half_data(); + + // Perform linear operation: output = input @ weight^T + bias + for (int64_t b = 0; b < batch_size; ++b) { + for (int64_t o = 0; o < out_features; ++o) { + float sum = 0.0f; + for (int64_t i = 0; i < in_features; ++i) { + // input[b, i] * weight[o, i] + int64_t input_idx = b * in_features + i; + int64_t weight_idx = o * in_features + i; + sum += static_cast(input_data[input_idx]) * + static_cast(weight_data[weight_idx]); + } + + // Add bias if present + if (has_bias) { + auto& bias_data = bias_spec.get_half_data(); + sum += static_cast(bias_data[o]); + } + + int64_t output_idx = b * out_features + o; + ref_data[output_idx] = sum; + } + } + } else { + throw std::invalid_argument("Unsupported dtype for linear reference impl"); + } +} + +void reference_impl(TestCase& test_case) { + linear_reference_impl(test_case); +} + +// FLOP calculator for linear operation +int64_t linear_flop_calculator(const TestCase& test_case) { + const auto& input_sizes = test_case.inputs()[0].get_tensor_sizes(); + const auto& weight_sizes = test_case.inputs()[1].get_tensor_sizes(); + + 
+
+  return flop;
+}
+
+int main(int argc, char* argv[]) {
+  set_debugging(false);
+  set_print_output(false);
+  set_print_latencies(false);
+  set_use_gpu_timestamps(false);
+
+  print_performance_header();
+  std::cout << "FP32/FP16 Linear Layer Benchmark" << std::endl;
+  print_separator();
+
+  ReferenceComputeFunc ref_fn = reference_impl;
+
+  // Execute the test cases using the framework with the custom FLOP calculator
+  auto results = execute_test_cases(
+#ifdef DEBUG_MODE
+      generate_linear_easy_cases,
+#else
+      generate_linear_test_cases,
+#endif
+      linear_flop_calculator,
+      "FPLinear",
+#ifdef DEBUG_MODE
+      0,
+      1,
+#else
+      5,
+      40,
+#endif
+      ref_fn);
+
+  return 0;
+}