Skip to content

Commit 4b20342

Browse files
authored
Make quantized_conv1d_depthwise kernels call the 2d optimized versions on HiFi (pytorch#18883)
Differential Revision: D99731352 Pull Request resolved: pytorch#18883
1 parent 1fc75d0 commit 4b20342

4 files changed

Lines changed: 335 additions & 0 deletions

File tree

backends/cadence/aot/functions_hifi.yaml

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -578,3 +578,13 @@
578578
kernels:
579579
- arg_meta: null
580580
kernel_name: impl::HiFi::quantized_conv1d_nlc_per_tensor_out
581+
582+
- func: cadence::quantized_depthwise_conv1d_ncl.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!)
583+
kernels:
584+
- arg_meta: null
585+
kernel_name: impl::HiFi::quantized_depthwise_conv1d_ncl_per_tensor_out
586+
587+
- func: cadence::quantized_depthwise_conv1d_nlc.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!)
588+
kernels:
589+
- arg_meta: null
590+
kernel_name: impl::HiFi::quantized_depthwise_conv1d_nlc_per_tensor_out
Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
/*
2+
* Copyright (c) Meta Platforms, Inc. and affiliates.
3+
* All rights reserved.
4+
*
5+
* This source code is licensed under the BSD-style license found in the
6+
* LICENSE file in the root directory of this source tree.
7+
*/
8+
9+
#include <executorch/backends/cadence/generic/operators/op_quantized_conv1d_ncl.h>
10+
#include <executorch/runtime/kernel/kernel_includes.h>
11+
12+
using Tensor = executorch::aten::Tensor;
13+
using KernelRuntimeContext = torch::executor::KernelRuntimeContext;
14+
using ::executorch::aten::IntArrayRef;
15+
16+
namespace impl {
17+
namespace HiFi {
18+
namespace native {
19+
20+
// Depthwise conv1d NCL for HiFi: falls back to the generic implementation.
21+
// In practice this op is always converted to the NLC variant by
22+
// ReplaceConvWithChannelLastConvPass before reaching C++ kernels,
23+
// so no NNLib optimization is needed here.
24+
void quantized_depthwise_conv1d_ncl_per_tensor_out(
25+
KernelRuntimeContext& ctx,
26+
const Tensor& input,
27+
const Tensor& weight,
28+
const Tensor& bias,
29+
IntArrayRef stride,
30+
IntArrayRef padding,
31+
IntArrayRef dilation,
32+
int64_t groups,
33+
int64_t in_zero_point,
34+
int64_t weight_zero_point,
35+
double bias_scale,
36+
double output_scale,
37+
int64_t output_zero_point,
38+
int64_t out_multiplier,
39+
int64_t out_shift,
40+
Tensor& out) {
41+
impl::generic::native::quantized_conv1d_ncl_per_tensor_out(
42+
ctx,
43+
input,
44+
weight,
45+
bias,
46+
stride,
47+
padding,
48+
dilation,
49+
groups,
50+
in_zero_point,
51+
weight_zero_point,
52+
bias_scale,
53+
output_scale,
54+
output_zero_point,
55+
out_multiplier,
56+
out_shift,
57+
out);
58+
}
59+
60+
} // namespace native
61+
} // namespace HiFi
62+
} // namespace impl
Lines changed: 241 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,241 @@
1+
/*
2+
* Copyright (c) Meta Platforms, Inc. and affiliates.
3+
* All rights reserved.
4+
*
5+
* This source code is licensed under the BSD-style license found in the
6+
* LICENSE file in the root directory of this source tree.
7+
*/
8+
9+
#include <executorch/backends/cadence/generic/operators/op_quantized_conv1d_nlc.h>
10+
#include <executorch/backends/cadence/hifi/kernels/kernels.h>
11+
#include <executorch/runtime/kernel/kernel_includes.h>
12+
13+
#define ALIGN_PTR(x, bytes) ((((unsigned)(x)) + (bytes - 1)) & (~(bytes - 1)))
14+
15+
using Tensor = executorch::aten::Tensor;
16+
using KernelRuntimeContext = torch::executor::KernelRuntimeContext;
17+
using ScalarType = executorch::aten::ScalarType;
18+
using ::executorch::aten::IntArrayRef;
19+
20+
namespace impl {
21+
namespace HiFi {
22+
namespace native {
23+
24+
namespace {
25+
26+
// Optimized depthwise conv1d NLC using NNLib's conv2d depthwise kernel
27+
// with kernel_height=1. Handles both int8 and uint8 via the same
28+
// xa_nn_conv2d_depthwise_per_chan_sym8sxasym8s function (uint8 is cast).
29+
//
30+
// Input: [N, L, C] (NLC format)
31+
// Weight: [OC, K, 1] (NLC depthwise, IC/groups == 1)
32+
// Output: [N, OL, OC]
33+
//
34+
// NNLib expects depthwise weight in [K, OC] format, so we transpose
35+
// the weight from [OC, K] (squeezed from [OC, K, 1]) to [K, OC].
36+
void xa_opt_quantized_depthwise_conv1d_nlc(
37+
KernelRuntimeContext& ctx,
38+
const Tensor& input,
39+
const Tensor& weight,
40+
const Tensor& bias,
41+
IntArrayRef stride,
42+
IntArrayRef padding,
43+
int32_t in_zero_point,
44+
int32_t weight_zero_point,
45+
float bias_scale,
46+
float output_scale,
47+
int32_t output_zero_point,
48+
Tensor& out) {
49+
WORD8* __restrict__ p_out =
50+
(WORD8* __restrict__)out.mutable_data_ptr<int8_t>();
51+
WORD8* __restrict__ p_inp =
52+
(WORD8* __restrict__)input.const_data_ptr<int8_t>();
53+
WORD8* __restrict__ p_kernel =
54+
(WORD8* __restrict__)weight.const_data_ptr<int8_t>();
55+
WORD32* __restrict__ p_bias =
56+
(WORD32* __restrict__)bias.const_data_ptr<int32_t>();
57+
58+
// NLC format: [N, L, C]
59+
WORD32 batches = input.size(0);
60+
WORD32 input_width = input.size(1);
61+
WORD32 input_channels = input.size(2);
62+
WORD32 input_height = 1;
63+
64+
// Weight: [OC, K, IC/groups] where IC/groups == 1 for depthwise
65+
WORD32 out_channels = weight.size(0);
66+
WORD32 kernel_width = weight.size(1);
67+
WORD32 kernel_height = 1;
68+
69+
WORD32 out_width = out.size(1);
70+
WORD32 out_height = 1;
71+
72+
// For 1D conv: stride/padding are 1-element arrays
73+
WORD32 x_stride = stride[stride.size() - 1];
74+
WORD32 y_stride = 1;
75+
WORD32 x_padding = padding[padding.size() - 1];
76+
WORD32 y_padding = 0;
77+
78+
WORD32 input_zero_bias = -in_zero_point;
79+
WORD32 out_zero_bias = output_zero_point;
80+
WORD32 inp_precision = 8;
81+
82+
WORD32 channels_multiplier = out_channels / input_channels;
83+
84+
// Per-channel output multiplier/shift (uniform for per-tensor quantization)
85+
WORD32 out_multiplier32[out_channels];
86+
WORD32 out_shift32[out_channels];
87+
88+
float out_scale = 1. / output_scale;
89+
90+
for (int i = 0; i < out_channels; i++) {
91+
out_multiplier32[i] = bias_scale * out_scale * 2147483648;
92+
out_shift32[i] = 0;
93+
}
94+
95+
// Transpose weight from [OC, K, 1] (effectively [OC, K]) to [K, OC]
96+
// which is the format NNLib depthwise expects.
97+
constexpr int kNnlibMaxDim = 5;
98+
99+
WORD8* ptr_weight = (WORD8*)kernels::allocate_temp_memory(
100+
ctx, ((out_channels * kernel_width) + 8) * sizeof(WORD8));
101+
WORD8* p_transposed_kernel = (WORD8*)ALIGN_PTR(ptr_weight, 8);
102+
103+
WORD32 p_kernel_shape[kNnlibMaxDim] = {1, 1, 1, out_channels, kernel_width};
104+
WORD32 p_kernel_out_shape[kNnlibMaxDim] = {
105+
1, 1, 1, kernel_width, out_channels};
106+
WORD32 p_permute_vec[kNnlibMaxDim] = {0, 1, 2, 4, 3};
107+
108+
xa_nn_transpose_8_8(
109+
p_transposed_kernel,
110+
p_kernel_out_shape,
111+
p_kernel,
112+
p_kernel_shape,
113+
p_permute_vec,
114+
kNnlibMaxDim,
115+
kNnlibMaxDim);
116+
117+
// Get scratch buffer for depthwise conv
118+
WORD32 scratch_size = xa_nn_conv2d_depthwise_getsize(
119+
input_height,
120+
input_width,
121+
input_channels,
122+
kernel_height,
123+
kernel_width,
124+
channels_multiplier,
125+
x_stride,
126+
y_stride,
127+
x_padding,
128+
y_padding,
129+
out_height,
130+
out_width,
131+
inp_precision,
132+
0); // NHWC
133+
134+
scratch_size = scratch_size < 0 ? 0 : scratch_size;
135+
136+
WORD32* ptr_scratch =
137+
(WORD32*)kernels::allocate_temp_memory(ctx, scratch_size);
138+
pVOID p_scratch = (pVOID)ALIGN_PTR(ptr_scratch, 8);
139+
140+
for (int _n = 0; _n < batches; _n++) {
141+
WORD8* in_batch = p_inp + _n * input_channels * input_height * input_width;
142+
WORD8* out_batch = p_out + _n * out_channels * out_height * out_width;
143+
144+
xa_nn_conv2d_depthwise_per_chan_sym8sxasym8s(
145+
out_batch,
146+
p_transposed_kernel,
147+
in_batch,
148+
p_bias,
149+
input_height,
150+
input_width,
151+
input_channels,
152+
kernel_height,
153+
kernel_width,
154+
channels_multiplier,
155+
x_stride,
156+
y_stride,
157+
x_padding,
158+
y_padding,
159+
out_height,
160+
out_width,
161+
input_zero_bias,
162+
out_multiplier32,
163+
out_shift32,
164+
out_zero_bias,
165+
0, // inp_data_format = NHWC
166+
0, // out_data_format = NHWC
167+
p_scratch);
168+
}
169+
}
170+
171+
} // namespace
172+
173+
// HiFi entry point for the NLC-layout quantized depthwise conv1d
// (per-tensor variant).
//
// Dispatch:
//   * dilation != 1 (last element of `dilation`), or a non-zero
//     weight_zero_point: forwarded unchanged to the generic
//     quantized_conv1d_nlc_per_tensor_out kernel.
//   * otherwise, int8 (Char) or uint8 (Byte) output: NNLib-backed fast path.
//   * any other output dtype: ET_DCHECK_MSG failure.
//
// On the fast path `dilation`, `groups`, `out_multiplier` and `out_shift`
// are not forwarded; the requantization multiplier is derived from
// bias_scale and output_scale inside the optimized helper.
void quantized_depthwise_conv1d_nlc_per_tensor_out(
    KernelRuntimeContext& ctx,
    const Tensor& input,
    const Tensor& weight,
    const Tensor& bias,
    IntArrayRef stride,
    IntArrayRef padding,
    IntArrayRef dilation,
    int64_t groups,
    int64_t in_zero_point,
    int64_t weight_zero_point,
    double bias_scale,
    double output_scale,
    int64_t output_zero_point,
    int64_t out_multiplier,
    int64_t out_shift,
    Tensor& out) {
  // NNLib depthwise does not support dilation.
  const bool has_dilation = dilation[dilation.size() - 1] != 1;
  // The optimized helper calls a symmetric-weight NNLib kernel and never
  // reads weight_zero_point, so a non-zero value would be silently dropped
  // there. Route that case to the generic kernel, which receives it.
  const bool has_weight_zero_point = weight_zero_point != 0;

  if (has_dilation || has_weight_zero_point) {
    impl::generic::native::quantized_conv1d_nlc_per_tensor_out(
        ctx,
        input,
        weight,
        bias,
        stride,
        padding,
        dilation,
        groups,
        in_zero_point,
        weight_zero_point,
        bias_scale,
        output_scale,
        output_zero_point,
        out_multiplier,
        out_shift,
        out);
    return;
  }

  ScalarType dtype = out.scalar_type();

  if (dtype == ScalarType::Char || dtype == ScalarType::Byte) {
    // Both int8 and uint8 go through the same optimized helper, which
    // accesses the tensor data through int8_t pointers.
    xa_opt_quantized_depthwise_conv1d_nlc(
        ctx,
        input,
        weight,
        bias,
        stride,
        padding,
        static_cast<int32_t>(in_zero_point),
        static_cast<int32_t>(weight_zero_point),
        static_cast<float>(bias_scale),
        static_cast<float>(output_scale),
        static_cast<int32_t>(output_zero_point),
        out);
  } else {
    ET_DCHECK_MSG(
        false,
        "Unhandled dtype %s for quantized_depthwise_conv1d_nlc",
        torch::executor::toString(dtype));
  }
}
238+
239+
} // namespace native
240+
} // namespace HiFi
241+
} // namespace impl

backends/cadence/hifi/operators/targets.bzl

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -388,6 +388,28 @@ def define_common_targets():
388388
compatible_with = ["ovr_config//cpu:xtensa"],
389389
)
390390

391+
runtime.cxx_library(
392+
name = "op_quantized_depthwise_conv1d_ncl",
393+
srcs = ["op_quantized_depthwise_conv1d_ncl.cpp"],
394+
platforms = CXX,
395+
deps = COMMON_DEPS + [
396+
"//executorch/backends/cadence/generic/operators:op_quantized_conv1d_ncl",
397+
],
398+
visibility = ["PUBLIC"],
399+
compatible_with = ["ovr_config//cpu:xtensa"],
400+
)
401+
402+
runtime.cxx_library(
403+
name = "op_quantized_depthwise_conv1d_nlc",
404+
srcs = ["op_quantized_depthwise_conv1d_nlc.cpp"],
405+
platforms = CXX,
406+
deps = COMMON_DEPS + [
407+
"//executorch/backends/cadence/generic/operators:op_quantized_conv1d_nlc",
408+
],
409+
visibility = ["PUBLIC"],
410+
compatible_with = ["ovr_config//cpu:xtensa"],
411+
)
412+
391413
runtime.cxx_library(
392414
name = "op_quantized_conv2d_nchw_asym8sxsym8s_asym8s_per_tensor_out",
393415
srcs = ["op_quantized_conv2d_nchw_asym8sxsym8s_asym8s_per_tensor_out.cpp"],

0 commit comments

Comments
 (0)