microsoft · hariharans29 · Mar 21, 2026 · Mar 21, 2026
diff --git a/onnxruntime/core/mlas/lib/snchwc.cpp b/onnxruntime/core/mlas/lib/snchwc.cpp
@@ -698,33 +698,41 @@ struct MLAS_NCHWC_CONV_NCHWC_ALGORITHM : MLAS_NCHWC_GROUPED_CONV_ALGORITHM
             size_t WorkThisIteration = std::min(WorkRemaining, OutputHeight - ph);
 
             //
-            // Walk over each input image organized as a set of NCHWc blocks.
+            // Apply the convolution kernel to each row of the output batch.
             //
 
-            for (size_t ic = 0; ic < InputChannels; ic += BlockSize) {
+            for (size_t work = 0; work < WorkThisIteration; work++) {
 
-                unsigned KernelFlags = ComputeKernelFlags(ic, BlockSize);
+                //
+                // Constrain the effective kernel parameters once per output row.
+                // The input row and effective kernel height are shared across all
+                // input-channel blocks for this row.
+                //
+
+                size_t ih;
+                size_t EffectiveKernelHeight;
+                const float* EffectiveFilterBase = Filter;
+
+                ComputeEffectiveKernel(ph + work, BlockSize * BlockSize * KernelWidth,
+                    &EffectiveFilterBase, &ih, &EffectiveKernelHeight);
+
+                float* output = Output + (ph + work) * BlockedOutputWidth;
 
                 //
-                // Apply the convolution kernel to each row of the output batch.
+                // Walk over each input image organized as a set of NCHWc blocks.
                 //
 
-                const float* input = Input + ic * InputSize;
-                float* output = Output + ph * BlockedOutputWidth;
+                for (size_t ic = 0; ic < InputChannels; ic += BlockSize) {
 
-                for (size_t work = 0; work < WorkThisIteration; work++) {
+                    unsigned KernelFlags = ComputeKernelFlags(ic, BlockSize);
+                    const float* input = Input + ic * InputSize;
 
                     //
-                    // Constrain the effective kernel parameters if the output row
-                    // uses one or more input padding rows.
+                    // Reuse the row's ih and EffectiveKernelHeight; only the filter
+                    // pointer is offset here to reach this input channel block.
                     //
 
-                    const float* filter = Filter + BlockSize * ic * KernelSize;
-                    size_t ih;
-                    size_t EffectiveKernelHeight;
-
-                    ComputeEffectiveKernel(ph + work, BlockSize * BlockSize * KernelWidth,
-                        &filter, &ih, &EffectiveKernelHeight);
+                    const float* filter = EffectiveFilterBase + BlockSize * ic * KernelSize;
 
                     //
                     // Invoke the convolution kernel.
@@ -745,8 +753,6 @@ struct MLAS_NCHWC_CONV_NCHWC_ALGORITHM : MLAS_NCHWC_GROUPED_CONV_ALGORITHM
                     if ((KernelFlags & MLAS_CONV_KERNEL_FLAG_OTHER_ACTIVATION) != 0) {
                         DoActivation(output, FilterCount, BlockedOutputWidth);
                     }
-
-                    output += BlockedOutputWidth;
                 }
             }