From eccc975b53c00e005234964a11d36b7d21f0b3fc Mon Sep 17 00:00:00 2001
From: ssjia <ssjia@devvm26340.ftw0.facebook.com>
Date: Tue, 17 Feb 2026 12:19:27 -0800
Subject: [PATCH] [ET-VK][qconv] Fix depthwise weight_sums sum dimension

The weight_sums tensor stores per-output-channel sums of quantized weight values, used to apply activation zero point correction during integer accumulation. For depthwise convolutions, the weight tensor is reshaped to (H, W, OC), but the sum was unconditionally computed along dim=1 (the W dimension). This produced a tensor of shape (H, OC) instead of (OC,), causing incorrect zero point correction and corrupted depthwise conv output.

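Fix by branching on is_depthwise_conv: depthwise weights in the (H, W, OC) layout are now summed over dims (0, 1), while the non-depthwise (OC, H*W*IC_per_group) layout keeps summing over dim 1, so both paths yield a tensor of shape (OC,).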

Differential Revision: [D93511635](https://our.internmc.facebook.com/intern/diff/D93511635/)

[ghstack-poisoned]
---
 backends/vulkan/patterns/quantized_convolution.py | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/backends/vulkan/patterns/quantized_convolution.py b/backends/vulkan/patterns/quantized_convolution.py
index b89dfe9aaab..93140e15341 100644
--- a/backends/vulkan/patterns/quantized_convolution.py
+++ b/backends/vulkan/patterns/quantized_convolution.py
@@ -215,9 +215,17 @@ def make_q8ta_conv2d_custom_op(
     with graph_module.graph.inserting_before(first_graph_node):
         qweight_tensor_name = utils.get_tensor_name(ep, match.weight_node)
         # Pre-compute the weight sums which are needed to apply activation zero point
-        # when using integer accumulation. For the reshaped 2D weight matrix (IC_per_group * H * W, OC),
-        # sum over dimension 0 to get sums per output channel
-        sum_per_output_channel = weight_tensor.sum(dim=1).to(torch.int32).contiguous()
+        # when using integer accumulation. Sum all weight elements per output channel.
+        if is_depthwise_conv:
+            # weight_tensor shape is (H, W, OC); sum over spatial dims (H, W)
+            sum_per_output_channel = (
+                weight_tensor.sum(dim=(0, 1)).to(torch.int32).contiguous()
+            )
+        else:
+            # weight_tensor shape is (OC, H*W*IC_per_group); sum over dim 1
+            sum_per_output_channel = (
+                weight_tensor.sum(dim=1).to(torch.int32).contiguous()
+            )
         sums_name = qweight_tensor_name + "_sums"
         # Sanitize the name
         sums_name = sums_name.replace(".", "_")