diff --git a/backends/cadence/aot/BUCK b/backends/cadence/aot/BUCK index a17ef6cb49a..c85dc23c4bd 100644 --- a/backends/cadence/aot/BUCK +++ b/backends/cadence/aot/BUCK @@ -154,7 +154,8 @@ fbcode_target(_kind = executorch_generated_lib, "//executorch/backends/cadence/generic/operators:op_quantize_per_tensor", "//executorch/backends/cadence/generic/operators:op_quantized_add", "//executorch/backends/cadence/generic/operators:op_quantized_conv2d", - "//executorch/backends/cadence/generic/operators:op_quantized_conv1d", + "//executorch/backends/cadence/generic/operators:op_quantized_conv1d_ncl", + "//executorch/backends/cadence/generic/operators:op_quantized_conv1d_nlc", "//executorch/backends/cadence/generic/operators:op_quantized_fully_connected", "//executorch/backends/cadence/generic/operators:op_quantized_layer_norm", "//executorch/backends/cadence/generic/operators:op_quantized_linear", diff --git a/backends/cadence/aot/functions.yaml b/backends/cadence/aot/functions.yaml index 3ba6f4700b1..528ceadaf19 100644 --- a/backends/cadence/aot/functions.yaml +++ b/backends/cadence/aot/functions.yaml @@ -359,6 +359,16 @@ - arg_meta: null kernel_name: impl::generic::quantized_conv2d_nhwc_per_tensor_out +- func: cadence::quantized_conv1d_ncl.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) + kernels: + - arg_meta: null + kernel_name: impl::generic::quantized_conv1d_ncl_per_tensor_out + +- func: cadence::quantized_conv1d_nlc.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) 
+ kernels: + - arg_meta: null + kernel_name: impl::generic::quantized_conv1d_nlc_per_tensor_out + - func: cadence::quantized_conv2d_nchw_asym8sxsym8s_asym8s.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) kernels: - arg_meta: null @@ -419,26 +429,6 @@ - arg_meta: null kernel_name: impl::generic::quantized_conv2d_nhwc_depthwise_asym8uxsym8u_asym8u_per_tensor_out -- func: cadence::quantized_conv1d_ncl_asym8sxsym8s_asym8s.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) - kernels: - - arg_meta: null - kernel_name: impl::generic::quantized_conv1d_ncl_asym8sxsym8s_asym8s_per_tensor_out - -- func: cadence::quantized_conv1d_ncl_asym8uxsym8u_asym8u.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) - kernels: - - arg_meta: null - kernel_name: impl::generic::quantized_conv1d_ncl_asym8uxsym8u_asym8u_per_tensor_out - -- func: cadence::quantized_conv1d_nlc_asym8sxsym8s_asym8s.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) 
- kernels: - - arg_meta: null - kernel_name: impl::generic::quantized_conv1d_nlc_asym8sxsym8s_asym8s_per_tensor_out - -- func: cadence::quantized_conv1d_nlc_asym8uxsym8u_asym8u.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) - kernels: - - arg_meta: null - kernel_name: impl::generic::quantized_conv1d_nlc_asym8uxsym8u_asym8u_per_tensor_out - - func: cadence::quantized_fully_connected.out(Tensor src, Tensor weight, Tensor bias, int src_zero_point, Tensor weight_zero_point, Tensor out_multiplier, Tensor out_shift, int out_zero_point, Tensor? offset, *, Tensor(a!) out) -> Tensor(a!) kernels: - arg_meta: null diff --git a/backends/cadence/aot/functions_hifi.yaml b/backends/cadence/aot/functions_hifi.yaml index f713d0a3227..e7ae2fcaeeb 100644 --- a/backends/cadence/aot/functions_hifi.yaml +++ b/backends/cadence/aot/functions_hifi.yaml @@ -445,26 +445,6 @@ - arg_meta: null kernel_name: impl::HiFi::quantized_conv2d_nhwc_depthwise_asym8uxsym8u_asym8u_per_tensor_out -- func: cadence::quantized_conv1d_ncl_asym8sxsym8s_asym8s.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) - kernels: - - arg_meta: null - kernel_name: impl::HiFi::quantized_conv1d_ncl_asym8sxsym8s_asym8s_per_tensor_out - -- func: cadence::quantized_conv1d_ncl_asym8uxsym8u_asym8u.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) 
out) -> Tensor(a!) - kernels: - - arg_meta: null - kernel_name: impl::HiFi::quantized_conv1d_ncl_asym8uxsym8u_asym8u_per_tensor_out - -- func: cadence::quantized_conv1d_nlc_asym8sxsym8s_asym8s.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) - kernels: - - arg_meta: null - kernel_name: impl::HiFi::quantized_conv1d_nlc_asym8sxsym8s_asym8s_per_tensor_out - -- func: cadence::quantized_conv1d_nlc_asym8uxsym8u_asym8u.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) - kernels: - - arg_meta: null - kernel_name: impl::HiFi::quantized_conv1d_nlc_asym8uxsym8u_asym8u_per_tensor_out - - func: cadence::quantized_layer_norm.out(Tensor input, Tensor in_scale, Tensor in_zero_point, int[] normalized_shape, Tensor weight, Tensor bias, float eps, float output_scale, int output_zero_point, *, Tensor(a!) out) -> Tensor(a!) kernels: - arg_meta: null @@ -563,3 +543,13 @@ kernels: - arg_meta: null kernel_name: impl::HiFi::quantized_fully_connected_asym8uxasym8u_asym8u_per_tensor_out + +- func: cadence::quantized_conv1d_ncl.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) 
+ kernels: + - arg_meta: null + kernel_name: impl::HiFi::native::quantized_conv1d_ncl_per_tensor_out + +- func: cadence::quantized_conv1d_nlc.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) + kernels: + - arg_meta: null + kernel_name: impl::HiFi::native::quantized_conv1d_nlc_per_tensor_out diff --git a/backends/cadence/aot/ops_registrations.py b/backends/cadence/aot/ops_registrations.py index d805426737b..cbc179e05d2 100644 --- a/backends/cadence/aot/ops_registrations.py +++ b/backends/cadence/aot/ops_registrations.py @@ -225,6 +225,30 @@ def register_fake( lib.define( "quantized_conv2d_nhwc.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!)" ) +lib.define( + "quantized_conv1d_ncl(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, Tensor weight_zero_point, Tensor bias_scale, float out_scale, int out_zero_point, Tensor out_multiplier, Tensor out_shift) -> (Tensor Z)" +) +lib.define( + "quantized_conv1d_ncl.out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, Tensor weight_zero_point, Tensor bias_scale, float out_scale, int out_zero_point, Tensor out_multiplier, Tensor out_shift, *, Tensor(a!) 
out) -> Tensor(a!)" +) +lib.define( + "quantized_conv1d_ncl.per_tensor(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift) -> (Tensor Z)" +) +lib.define( + "quantized_conv1d_ncl.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!)" +) +lib.define( + "quantized_conv1d_nlc(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, Tensor weight_zero_point, Tensor bias_scale, float out_scale, int out_zero_point, Tensor out_multiplier, Tensor out_shift) -> (Tensor Z)" +) +lib.define( + "quantized_conv1d_nlc.out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, Tensor weight_zero_point, Tensor bias_scale, float out_scale, int out_zero_point, Tensor out_multiplier, Tensor out_shift, *, Tensor(a!) out) -> Tensor(a!)" +) +lib.define( + "quantized_conv1d_nlc.per_tensor(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift) -> (Tensor Z)" +) +lib.define( + "quantized_conv1d_nlc.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) 
out) -> Tensor(a!)" +) lib.define( "quantized_conv2d_nchw(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, Tensor weight_zero_point, Tensor bias_scale, float out_scale, int out_zero_point, Tensor out_multiplier, Tensor out_shift) -> (Tensor Z)" ) @@ -297,30 +321,6 @@ def register_fake( lib.define( "quantized_conv2d_nhwc_dilated_asym8uxsym8u_asym8u.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!)" ) -lib.define( - "quantized_conv1d_ncl_asym8sxsym8s_asym8s.per_tensor(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift) -> (Tensor Z)" -) -lib.define( - "quantized_conv1d_ncl_asym8sxsym8s_asym8s.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) 
out) -> Tensor(a!)" -) -lib.define( - "quantized_conv1d_ncl_asym8uxsym8u_asym8u.per_tensor(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift) -> (Tensor Z)" -) -lib.define( - "quantized_conv1d_ncl_asym8uxsym8u_asym8u.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!)" -) -lib.define( - "quantized_conv1d_nlc_asym8sxsym8s_asym8s.per_tensor(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift) -> (Tensor Z)" -) -lib.define( - "quantized_conv1d_nlc_asym8sxsym8s_asym8s.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) 
out) -> Tensor(a!)" -) -lib.define( - "quantized_conv1d_nlc_asym8uxsym8u_asym8u.per_tensor(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift) -> (Tensor Z)" -) -lib.define( - "quantized_conv1d_nlc_asym8uxsym8u_asym8u.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!)" -) lib.define( "quantized_conv2d_nchw_depthwise_asym8sxsym8s_asym8s.per_tensor(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift) -> (Tensor Z)" ) @@ -1071,6 +1071,154 @@ def quantized_conv2d_nhwc_meta( return input.new_empty(output_size, dtype=input.dtype) +@register_fake("cadence::quantized_conv1d_ncl") +def quantized_conv1d_ncl_meta( + input: torch.Tensor, + weight: torch.Tensor, + bias: torch.Tensor, + stride: Tuple[int], + padding: Tuple[int], + dilation: Tuple[int], + groups: int, + in_zero_point: int, + weight_zero_point: torch.Tensor, + bias_scale: torch.Tensor, + output_scale: float, + output_zero_point: int, + out_multiplier: torch.Tensor, + out_shift: torch.Tensor, +) -> torch.Tensor: + # NCL format: input is [N, C, L], weight is [OC, IC/groups, K] + out_channels, _, kernel_size = weight.shape + + in_size = input.shape + assert len(in_size) == 3 + + # Compute the output tensor size for 1D conv (NCL format, channel_last=False) + output_size = get_conv1d_output_size( + in_size, + out_channels, + stride[-1], + padding[-1], + dilation[-1], + kernel_size, + False, + ) + + return 
input.new_empty(output_size, dtype=input.dtype) + + +@register_fake("cadence::quantized_conv1d_ncl.per_tensor") +def quantized_conv1d_ncl_per_tensor_meta( + input: torch.Tensor, + weight: torch.Tensor, + bias: torch.Tensor, + stride: Tuple[int], + padding: Tuple[int], + dilation: Tuple[int], + groups: int, + in_zero_point: int, + weight_zero_point: int, + bias_scale: float, + output_scale: float, + output_zero_point: int, + out_multiplier: int, + out_shift: int, +) -> torch.Tensor: + # NCL format: input is [N, C, L], weight is [OC, IC/groups, K] + out_channels, _, kernel_size = weight.shape + + in_size = input.shape + assert len(in_size) == 3 + + # Compute the output tensor size for 1D conv (NCL format, channel_last=False) + output_size = get_conv1d_output_size( + in_size, + out_channels, + stride[-1], + padding[-1], + dilation[-1], + kernel_size, + False, + ) + + return input.new_empty(output_size, dtype=input.dtype) + + +@register_fake("cadence::quantized_conv1d_nlc") +def quantized_conv1d_nlc_meta( + input: torch.Tensor, + weight: torch.Tensor, + bias: torch.Tensor, + stride: Tuple[int], + padding: Tuple[int], + dilation: Tuple[int], + groups: int, + in_zero_point: int, + weight_zero_point: torch.Tensor, + bias_scale: torch.Tensor, + output_scale: float, + output_zero_point: int, + out_multiplier: torch.Tensor, + out_shift: torch.Tensor, +) -> torch.Tensor: + # NLC format: input is [N, L, C], weight is [OC, K, IC/groups] + out_channels, kernel_size, _ = weight.shape + + in_size = input.shape + assert len(in_size) == 3 + + # Compute the output tensor size for 1D conv (NLC format, channel_last=True) + output_size = get_conv1d_output_size( + in_size, + out_channels, + stride[-1], + padding[-1], + dilation[-1], + kernel_size, + True, + ) + + return input.new_empty(output_size, dtype=input.dtype) + + +@register_fake("cadence::quantized_conv1d_nlc.per_tensor") +def quantized_conv1d_nlc_per_tensor_meta( + input: torch.Tensor, + weight: torch.Tensor, + bias: 
torch.Tensor, + stride: Tuple[int], + padding: Tuple[int], + dilation: Tuple[int], + groups: int, + in_zero_point: int, + weight_zero_point: int, + bias_scale: float, + output_scale: float, + output_zero_point: int, + out_multiplier: int, + out_shift: int, +) -> torch.Tensor: + # NLC format: input is [N, L, C], weight is [OC, K, IC/groups] + out_channels, kernel_size, _ = weight.shape + + in_size = input.shape + assert len(in_size) == 3 + + # Compute the output tensor size for 1D conv (NLC format, channel_last=True) + output_size = get_conv1d_output_size( + in_size, + out_channels, + stride[-1], + padding[-1], + dilation[-1], + kernel_size, + True, + ) + + return input.new_empty(output_size, dtype=input.dtype) + + @register_fake("cadence::quantized_conv2d_nchw") def quantized_conv2d_nchw_meta( input: torch.Tensor, @@ -2674,150 +2822,6 @@ def roi_align_box_processor_meta( return rois.new_empty((rois.shape[0], 80), dtype=torch.uint8) -@register_fake("cadence::quantized_conv1d_ncl_asym8sxsym8s_asym8s.per_tensor") -def quantized_conv1d_ncl_asym8sxsym8s_asym8s_per_tensor_meta( - input: torch.Tensor, - weight: torch.Tensor, - bias: torch.Tensor, - stride: Tuple[int], - padding: Tuple[int], - dilation: Tuple[int], - groups: int, - in_zero_point: int, - weight_zero_point: int, - bias_scale: float, - output_scale: float, - output_zero_point: int, - out_multiplier: int, - out_shift: int, -) -> torch.Tensor: - assert input.dim() == 3 and weight.dim() == 3 - assert ( - input.dtype == torch.int8 - and weight.dtype == torch.int8 - and bias.dtype == torch.int32 - ) - out_channels, _, kernel_size = weight.shape - output_size = get_conv1d_output_size( - input.shape, - out_channels, - stride[1], - padding[1], - dilation[1], - kernel_size, - False, - ) - return input.new_empty(output_size, dtype=input.dtype) - - -@register_fake("cadence::quantized_conv1d_ncl_asym8uxsym8u_asym8u.per_tensor") -def quantized_conv1d_ncl_asym8uxsym8u_asym8u_per_tensor_meta( - input: torch.Tensor, - 
weight: torch.Tensor, - bias: torch.Tensor, - stride: Tuple[int], - padding: Tuple[int], - dilation: Tuple[int], - groups: int, - in_zero_point: int, - weight_zero_point: int, - bias_scale: float, - output_scale: float, - output_zero_point: int, - out_multiplier: int, - out_shift: int, -) -> torch.Tensor: - assert input.dim() == 3 and weight.dim() == 3 - assert ( - input.dtype == torch.uint8 - and weight.dtype == torch.uint8 - and bias.dtype == torch.int32 - ) - out_channels, _, kernel_size = weight.shape - output_size = get_conv1d_output_size( - input.shape, - out_channels, - stride[1], - padding[1], - dilation[1], - kernel_size, - False, - ) - return input.new_empty(output_size, dtype=input.dtype) - - -@register_fake("cadence::quantized_conv1d_nlc_asym8sxsym8s_asym8s.per_tensor") -def quantized_conv1d_nlc_asym8sxsym8s_asym8s_per_tensor_meta( - input: torch.Tensor, - weight: torch.Tensor, - bias: torch.Tensor, - stride: Tuple[int], - padding: Tuple[int], - dilation: Tuple[int], - groups: int, - in_zero_point: int, - weight_zero_point: int, - bias_scale: float, - output_scale: float, - output_zero_point: int, - out_multiplier: int, - out_shift: int, -) -> torch.Tensor: - assert input.dim() == 3 and weight.dim() == 3 - assert ( - input.dtype == torch.int8 - and weight.dtype == torch.int8 - and bias.dtype == torch.int32 - ) - out_channels, kernel_size, _ = weight.shape - output_size = get_conv1d_output_size( - input.shape, - out_channels, - stride[1], - padding[1], - dilation[1], - kernel_size, - True, - ) - return input.new_empty(output_size, dtype=input.dtype) - - -@register_fake("cadence::quantized_conv1d_nlc_asym8uxsym8u_asym8u.per_tensor") -def quantized_conv1d_nlc_asym8uxsym8u_asym8u_per_tensor_meta( - input: torch.Tensor, - weight: torch.Tensor, - bias: torch.Tensor, - stride: Tuple[int], - padding: Tuple[int], - dilation: Tuple[int], - groups: int, - in_zero_point: int, - weight_zero_point: int, - bias_scale: float, - output_scale: float, - output_zero_point: 
int, - out_multiplier: int, - out_shift: int, -) -> torch.Tensor: - assert input.dim() == 3 and weight.dim() == 3 - assert ( - input.dtype == torch.uint8 - and weight.dtype == torch.uint8 - and bias.dtype == torch.int32 - ) - out_channels, kernel_size, _ = weight.shape - output_size = get_conv1d_output_size( - input.shape, - out_channels, - stride[1], - padding[1], - dilation[1], - kernel_size, - True, - ) - return input.new_empty(output_size, dtype=input.dtype) - - @register_fake("cadence::_softmax_f32_f32") def softmax_f32_f32_meta( input_tensor: torch.Tensor, diff --git a/backends/cadence/aot/ref_implementations.py b/backends/cadence/aot/ref_implementations.py index a37233fd492..44cae6e55ea 100644 --- a/backends/cadence/aot/ref_implementations.py +++ b/backends/cadence/aot/ref_implementations.py @@ -766,9 +766,9 @@ def quantized_conv_per_tensor( input_tensor: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor, - stride: tuple[int, int], - padding: tuple[int, int], - dilation: tuple[int, int], + stride: tuple[int, ...], + padding: tuple[int, ...], + dilation: tuple[int, ...], groups: int, in_zero_point: int, weight_zero_point: int, @@ -887,6 +887,194 @@ def quantized_conv2d_nchw_per_tensor( ) +@impl_tracked(m, "quantized_conv1d_ncl.per_tensor") +def quantized_conv1d_ncl_per_tensor( + input_tensor: torch.Tensor, + weight: torch.Tensor, + bias: torch.Tensor, + stride: tuple[int], + padding: tuple[int], + dilation: tuple[int], + groups: int, + in_zero_point: int, + weight_zero_point: int, + bias_scale: float, + output_scale: float, + output_zero_point: int, + out_multiplier: int, + out_shift: int, +) -> torch.Tensor: + """ + Quantized 1D convolution operation in NCL (channels-first) format. 
+ + Args: + - input_tensor (Tensor): The activations tensor in [N, C, L] format + - weight (Tensor): The weight tensor in [OC, IC/groups, K] format + - bias (Tensor): The bias tensor + - stride (Tuple[int]): The stride of the convolution + - padding (Tuple[int]): The padding of the convolution + - dilation (Tuple[int]): The dilation of the convolution + - groups (int): The number of groups + - in_zero_point (int): The quantized mapping of zero for the input + - weight_zero_point (int): The quantized mapping of zero for the weight + - bias_scale (float): The quantized bias scale + - output_scale (float): The scale of the output + - output_zero_point (int): The zero point of the output + - out_multiplier (int): Unused + - out_shift (int): Unused + """ + if not input_tensor.is_contiguous(memory_format=torch.contiguous_format): + raise ValueError("Input tensor must be in NCL format") + return quantized_conv_per_tensor( + input_tensor, + weight, + bias, + stride, + padding, + dilation, + groups, + in_zero_point, + weight_zero_point, + bias_scale, + output_scale, + output_zero_point, + out_multiplier, + out_shift, + ) + + +@impl_tracked(m, "quantized_conv1d_ncl") +def quantized_conv1d_ncl( + input_tensor: torch.Tensor, + weight: torch.Tensor, + bias: torch.Tensor, + stride: tuple[int], + padding: tuple[int], + dilation: tuple[int], + groups: int, + in_zero_point: int, + weight_zero_point: torch.Tensor, + bias_scale: torch.Tensor, + output_scale: float, + output_zero_point: int, + out_multiplier: torch.Tensor, + out_shift: torch.Tensor, +) -> torch.Tensor: + return quantized_conv1d_ncl_per_tensor( + input_tensor, + weight, + bias, + stride, + padding, + dilation, + groups, + in_zero_point, + int(weight_zero_point.item()), + float(bias_scale.item()), + output_scale, + output_zero_point, + int(out_multiplier.item()), + int(out_shift.item()), + ) + + +@impl_tracked(m, "quantized_conv1d_nlc.per_tensor") +def quantized_conv1d_nlc_per_tensor( + input_tensor: torch.Tensor, + 
weight: torch.Tensor, + bias: torch.Tensor, + stride: tuple[int], + padding: tuple[int], + dilation: tuple[int], + groups: int, + in_zero_point: int, + weight_zero_point: int, + bias_scale: float, + output_scale: float, + output_zero_point: int, + out_multiplier: int, + out_shift: int, +) -> torch.Tensor: + """ + Quantized 1D convolution operation in NLC (channels-last) format. + + Args: + - input_tensor (Tensor): The activations tensor in [N, L, C] format + - weight (Tensor): The weight tensor in [OC, K, IC/groups] format + - bias (Tensor): The bias tensor + - stride (Tuple[int]): The stride of the convolution + - padding (Tuple[int]): The padding of the convolution + - dilation (Tuple[int]): The dilation of the convolution + - groups (int): The number of groups + - in_zero_point (int): The quantized mapping of zero for the input + - weight_zero_point (int): The quantized mapping of zero for the weight + - bias_scale (float): The quantized bias scale + - output_scale (float): The scale of the output + - output_zero_point (int): The zero point of the output + - out_multiplier (int): Unused + - out_shift (int): Unused + """ + # Convert NLC to NCL for processing + input_ncl = input_tensor.permute(0, 2, 1).contiguous() + # Convert weight from [OC, K, IC/groups] to [OC, IC/groups, K] + weight_ncl = weight.permute(0, 2, 1).contiguous() + + result_ncl = quantized_conv_per_tensor( + input_ncl, + weight_ncl, + bias, + stride, + padding, + dilation, + groups, + in_zero_point, + weight_zero_point, + bias_scale, + output_scale, + output_zero_point, + out_multiplier, + out_shift, + ) + + # Convert result back to NLC format + return result_ncl.permute(0, 2, 1).contiguous() + + +@impl_tracked(m, "quantized_conv1d_nlc") +def quantized_conv1d_nlc( + input_tensor: torch.Tensor, + weight: torch.Tensor, + bias: torch.Tensor, + stride: tuple[int], + padding: tuple[int], + dilation: tuple[int], + groups: int, + in_zero_point: int, + weight_zero_point: torch.Tensor, + bias_scale: 
torch.Tensor, + output_scale: float, + output_zero_point: int, + out_multiplier: torch.Tensor, + out_shift: torch.Tensor, +) -> torch.Tensor: + return quantized_conv1d_nlc_per_tensor( + input_tensor, + weight, + bias, + stride, + padding, + dilation, + groups, + in_zero_point, + int(weight_zero_point.item()), + float(bias_scale.item()), + output_scale, + output_zero_point, + int(out_multiplier.item()), + int(out_shift.item()), + ) + + @impl_tracked(m, "quantized_conv2d_nchw") def quantized_conv2d_nchw( input_tensor: torch.Tensor, @@ -1343,26 +1531,6 @@ def quantized_conv2d_nhwc_depthwise_asym8uxsym8u_asym8u_per_tensor() -> ( ): ... -@impl_tracked(m, "quantized_conv1d_ncl_asym8sxsym8s_asym8s.per_tensor") -@quantized_conv_variant("nchw", torch.int8, torch.int8, is_1d=True) -def quantized_conv1d_ncl_asym8sxsym8s_asym8s_per_tensor() -> torch.Tensor: ... - - -@impl_tracked(m, "quantized_conv1d_ncl_asym8uxsym8u_asym8u.per_tensor") -@quantized_conv_variant("nchw", torch.uint8, torch.uint8, is_1d=True) -def quantized_conv1d_ncl_asym8uxsym8u_asym8u_per_tensor() -> torch.Tensor: ... - - -@impl_tracked(m, "quantized_conv1d_nlc_asym8sxsym8s_asym8s.per_tensor") -@quantized_conv_variant("nhwc", torch.int8, torch.int8, is_1d=True) -def quantized_conv1d_nlc_asym8sxsym8s_asym8s_per_tensor() -> torch.Tensor: ... - - -@impl_tracked(m, "quantized_conv1d_nlc_asym8uxsym8u_asym8u.per_tensor") -@quantized_conv_variant("nhwc", torch.uint8, torch.uint8, is_1d=True) -def quantized_conv1d_nlc_asym8uxsym8u_asym8u_per_tensor() -> torch.Tensor: ... 
- - @impl_tracked(m, "conv1d") def conv1d( input_tensor: torch.Tensor, diff --git a/backends/cadence/aot/tests/test_ref_implementations.py b/backends/cadence/aot/tests/test_ref_implementations.py index bf9e4d39250..936aad4e585 100644 --- a/backends/cadence/aot/tests/test_ref_implementations.py +++ b/backends/cadence/aot/tests/test_ref_implementations.py @@ -3237,3 +3237,275 @@ def test_slice_scatter_with_none_start_end(self) -> None: torch.equal(self_tensor, expected), f"Values don't match: got {self_tensor}, expected {expected}", ) + + def test_quantized_conv1d_ncl_per_tensor_basic(self) -> None: + """Test quantized_conv1d_ncl.per_tensor with basic NCL format input.""" + # NCL format: input is [N, C, L] + # Create simple 1D convolution input + batch_size = 1 + length = 4 + out_channels = 1 + kernel_size = 2 + + input_tensor = torch.tensor([[[1, 2, 3, 4], [5, 6, 7, 8]]], dtype=torch.int8) + # Weight shape: [OC, IC/groups, K] + weight = torch.tensor([[[1, 1], [1, 1]]], dtype=torch.int8) + bias = torch.tensor([0], dtype=torch.int32) + + stride = (1,) + padding = (0,) + dilation = (1,) + groups = 1 + in_zero_point = 0 + weight_zero_point = 0 + bias_scale = 1.0 + output_scale = 1.0 + output_zero_point = 0 + out_multiplier = 0 + out_shift = 0 + + output = torch.ops.cadence.quantized_conv1d_ncl.per_tensor( + input_tensor, + weight, + bias, + stride, + padding, + dilation, + groups, + in_zero_point, + weight_zero_point, + bias_scale, + output_scale, + output_zero_point, + out_multiplier, + out_shift, + ) + + # Verify output shape: [N, OC, OL] where OL = (L + 2*padding - kernel) / stride + 1 + expected_length = (length + 2 * padding[0] - kernel_size) // stride[0] + 1 + self.assertEqual(output.shape, (batch_size, out_channels, expected_length)) + self.assertEqual(output.dtype, torch.int8) + + def test_quantized_conv1d_ncl_default_variant(self) -> None: + """Test quantized_conv1d_ncl (default variant with tensor params).""" + input_tensor = torch.tensor([[[1, 2, 3, 4]]], 
dtype=torch.int8) + weight = torch.tensor([[[1, 1]]], dtype=torch.int8) + bias = torch.tensor([0], dtype=torch.int32) + + stride = (1,) + padding = (0,) + dilation = (1,) + groups = 1 + in_zero_point = 0 + weight_zero_point = torch.tensor([0], dtype=torch.int32) + bias_scale = torch.tensor([1.0], dtype=torch.float32) + output_scale = 1.0 + output_zero_point = 0 + out_multiplier = torch.tensor([1073741824], dtype=torch.int32) + out_shift = torch.tensor([0], dtype=torch.int32) + + output = torch.ops.cadence.quantized_conv1d_ncl( + input_tensor, + weight, + bias, + stride, + padding, + dilation, + groups, + in_zero_point, + weight_zero_point, + bias_scale, + output_scale, + output_zero_point, + out_multiplier, + out_shift, + ) + + self.assertEqual(output.shape, (1, 1, 3)) + self.assertEqual(output.dtype, torch.int8) + + def test_quantized_conv1d_nlc_per_tensor_basic(self) -> None: + """Test quantized_conv1d_nlc.per_tensor with basic NLC format input.""" + # NLC format: input is [N, L, C] + batch_size = 1 + length = 4 + out_channels = 1 + kernel_size = 2 + + # Input in NLC format + input_tensor = torch.tensor( + [[[1, 5], [2, 6], [3, 7], [4, 8]]], dtype=torch.int8 + ) + # Weight shape: [OC, K, IC/groups] + weight = torch.tensor([[[1, 1], [1, 1]]], dtype=torch.int8) + bias = torch.tensor([0], dtype=torch.int32) + + stride = (1,) + padding = (0,) + dilation = (1,) + groups = 1 + in_zero_point = 0 + weight_zero_point = 0 + bias_scale = 1.0 + output_scale = 1.0 + output_zero_point = 0 + out_multiplier = 0 + out_shift = 0 + + output = torch.ops.cadence.quantized_conv1d_nlc.per_tensor( + input_tensor, + weight, + bias, + stride, + padding, + dilation, + groups, + in_zero_point, + weight_zero_point, + bias_scale, + output_scale, + output_zero_point, + out_multiplier, + out_shift, + ) + + # Verify output shape: [N, OL, OC] + expected_length = (length + 2 * padding[0] - kernel_size) // stride[0] + 1 + self.assertEqual(output.shape, (batch_size, expected_length, out_channels)) + 
self.assertEqual(output.dtype, torch.int8) + + def test_quantized_conv1d_nlc_default_variant(self) -> None: + """Test quantized_conv1d_nlc (default variant with tensor params).""" + # Input in NLC format: [N, L, C] + input_tensor = torch.tensor([[[1], [2], [3], [4]]], dtype=torch.int8) + # Weight shape: [OC, K, IC/groups] + weight = torch.tensor([[[1], [1]]], dtype=torch.int8) + bias = torch.tensor([0], dtype=torch.int32) + + stride = (1,) + padding = (0,) + dilation = (1,) + groups = 1 + in_zero_point = 0 + weight_zero_point = torch.tensor([0], dtype=torch.int32) + bias_scale = torch.tensor([1.0], dtype=torch.float32) + output_scale = 1.0 + output_zero_point = 0 + out_multiplier = torch.tensor([1073741824], dtype=torch.int32) + out_shift = torch.tensor([0], dtype=torch.int32) + + output = torch.ops.cadence.quantized_conv1d_nlc( + input_tensor, + weight, + bias, + stride, + padding, + dilation, + groups, + in_zero_point, + weight_zero_point, + bias_scale, + output_scale, + output_zero_point, + out_multiplier, + out_shift, + ) + + # Output should be [N, OL, OC] = [1, 3, 1] + self.assertEqual(output.shape, (1, 3, 1)) + self.assertEqual(output.dtype, torch.int8) + + def test_quantized_conv1d_ncl_with_groups(self) -> None: + """Test quantized_conv1d_ncl.per_tensor with groups > 1.""" + batch_size = 1 + in_channels = 4 + length = 4 + out_channels = 4 + kernel_size = 2 + groups = 2 + + input_tensor = torch.randint( + -5, 5, (batch_size, in_channels, length), dtype=torch.int8 + ) + # Weight shape: [OC, IC/groups, K] + weight = torch.randint( + -2, 2, (out_channels, in_channels // groups, kernel_size), dtype=torch.int8 + ) + bias = torch.zeros(out_channels, dtype=torch.int32) + + stride = (1,) + padding = (0,) + dilation = (1,) + in_zero_point = 0 + weight_zero_point = 0 + bias_scale = 1.0 + output_scale = 0.1 + output_zero_point = 0 + out_multiplier = 0 + out_shift = 0 + + output = torch.ops.cadence.quantized_conv1d_ncl.per_tensor( + input_tensor, + weight, + bias, + 
stride, + padding, + dilation, + groups, + in_zero_point, + weight_zero_point, + bias_scale, + output_scale, + output_zero_point, + out_multiplier, + out_shift, + ) + + expected_length = (length + 2 * padding[0] - kernel_size) // stride[0] + 1 + self.assertEqual(output.shape, (batch_size, out_channels, expected_length)) + self.assertEqual(output.dtype, torch.int8) + + def test_quantized_conv1d_nlc_with_padding(self) -> None: + """Test quantized_conv1d_nlc.per_tensor with padding.""" + batch_size = 1 + length = 3 + out_channels = 1 + + # Input in NLC format: [N, L, C] + input_tensor = torch.tensor([[[1], [2], [3]]], dtype=torch.int8) + # Weight shape: [OC, K, IC/groups] + weight = torch.tensor([[[1], [1], [1]]], dtype=torch.int8) + bias = torch.tensor([0], dtype=torch.int32) + + stride = (1,) + padding = (1,) # Add padding + dilation = (1,) + groups = 1 + in_zero_point = 0 + weight_zero_point = 0 + bias_scale = 1.0 + output_scale = 1.0 + output_zero_point = 0 + out_multiplier = 0 + out_shift = 0 + + output = torch.ops.cadence.quantized_conv1d_nlc.per_tensor( + input_tensor, + weight, + bias, + stride, + padding, + dilation, + groups, + in_zero_point, + weight_zero_point, + bias_scale, + output_scale, + output_zero_point, + out_multiplier, + out_shift, + ) + + # With padding=1, output length = (3 + 2*1 - 3) / 1 + 1 = 3 + self.assertEqual(output.shape, (batch_size, length, out_channels)) + self.assertEqual(output.dtype, torch.int8) diff --git a/backends/cadence/aot/tests/test_type_dispatch_passes.py b/backends/cadence/aot/tests/test_type_dispatch_passes.py index 870735aad1a..f0847e8ca77 100644 --- a/backends/cadence/aot/tests/test_type_dispatch_passes.py +++ b/backends/cadence/aot/tests/test_type_dispatch_passes.py @@ -307,63 +307,6 @@ def test_dispatch_quantized_conv_2d_dilated( # Should be replaced with dtype-specific variant self.assertEqual(count_node(gm, expected_op), 1) - @expand( - [ - ( - "int8_nchw_1d", - torch.int8, - (1, 3, 8), # x_shape - 
exir_ops.edge.cadence.quantized_conv2d_nchw.per_tensor, - exir_ops.edge.cadence.quantized_conv1d_ncl_asym8sxsym8s_asym8s.per_tensor, - ), - ( - "uint8_nchw_1d", - torch.uint8, - (1, 3, 8), # x_shape - exir_ops.edge.cadence.quantized_conv2d_nchw.per_tensor, - exir_ops.edge.cadence.quantized_conv1d_ncl_asym8uxsym8u_asym8u.per_tensor, - ), - ( - "int8_nhwc_1d", - torch.int8, - (1, 8, 3), # x_shape - exir_ops.edge.cadence.quantized_conv2d_nhwc.per_tensor, - exir_ops.edge.cadence.quantized_conv1d_nlc_asym8sxsym8s_asym8s.per_tensor, - ), - ( - "uint8_nhwc_1d", - torch.uint8, - (1, 8, 3), # x_shape - exir_ops.edge.cadence.quantized_conv2d_nhwc.per_tensor, - exir_ops.edge.cadence.quantized_conv1d_nlc_asym8uxsym8u_asym8u.per_tensor, - ), - ] - ) - def test_dispatch_quantized_conv_1d( - self, - _: str, - dtype: torch.dtype, - x_shape: tuple[int, ...], - original_op: torch._ops.OpOverload, - expected_op: torch._ops.OpOverload, - ) -> None: - """Test quantized_conv_1d (nchw/nhwc) dispatches to correct dtype-specific variant""" - min_val, max_val = torch.iinfo(dtype).min, torch.iinfo(dtype).max - x = torch.randint(min_val, max_val, x_shape, dtype=dtype) - w = torch.randint(min_val, max_val, (16, 3, 3), dtype=dtype) - b = torch.randint(-2147483648, 2147483647, (16,), dtype=torch.int32) - gm = single_op_builder( - placeholders=(x, w, b), - op=original_op, - args=(x, w, b, [1, 1], [0, 0], [1, 1], 1, 0, 0, 1.0, 1.0, 0, 1, 1), - ) - p = CompileTimeTypeDispatchPass() - gm = cast(PassResult, p(gm)).graph_module - # Original op should be replaced - self.assertEqual(count_node(gm, original_op), 0) - # Should be replaced with dtype-specific variant - self.assertEqual(count_node(gm, expected_op), 1) - @expand( [ ( diff --git a/backends/cadence/aot/type_dispatch.py b/backends/cadence/aot/type_dispatch.py index 69fd721e4e3..b809cc90978 100644 --- a/backends/cadence/aot/type_dispatch.py +++ b/backends/cadence/aot/type_dispatch.py @@ -171,18 +171,11 @@ def call_operator( is_depthwise = 
is_depthwise_conv(groups, input_channels) # pyre-ignore[16]: None has no attribute '__iter__'. is_dilated = any(d > 1 for d in args[5]) - is_1d = len(args[0].to_tensor().shape) == 3 if is_depthwise: typed_op_name = f"{base_name}_depthwise_{type_suffix}" elif is_dilated: typed_op_name = f"{base_name}_dilated_{type_suffix}" - elif is_1d and groups == 1: - if "nchw" in base_name: - layout_suffix = "ncl" - else: - layout_suffix = "nlc" - typed_op_name = f"quantized_conv1d_{layout_suffix}_{type_suffix}" typed_op = getattr( getattr(exir_ops.edge.cadence, typed_op_name), config.variant diff --git a/backends/cadence/generic/operators/op_quantized_conv1d.cpp b/backends/cadence/generic/operators/op_quantized_conv1d.cpp index 6ae3a6613fb..5beafd9e1fe 100644 --- a/backends/cadence/generic/operators/op_quantized_conv1d.cpp +++ b/backends/cadence/generic/operators/op_quantized_conv1d.cpp @@ -6,8 +6,6 @@ * LICENSE file in the root directory of this source tree. */ -#include - #include #include #include @@ -373,142 +371,6 @@ void quantized_conv1d_nlc( } // namespace -Tensor& quantized_conv1d_ncl_asym8sxsym8s_asym8s_per_tensor_out( - ET_UNUSED KernelRuntimeContext& ctx, - const Tensor& input, - const Tensor& weight, - const Tensor& bias, - IntArrayRef stride, - IntArrayRef padding, - IntArrayRef dilation, - int64_t groups, - int64_t in_zero_point, - int64_t weight_zero_point, - double bias_scale, - double output_scale, - int64_t output_zero_point, - ET_UNUSED int64_t out_multiplier, - ET_UNUSED int64_t out_shift, - Tensor& out) { - quantized_conv1d_ncl( - input, - weight, - bias, - stride, - padding, - dilation, - static_cast(groups), - static_cast(in_zero_point), - static_cast(weight_zero_point), - static_cast(bias_scale), - static_cast(output_scale), - static_cast(output_zero_point), - out); - return out; -} - -Tensor& quantized_conv1d_ncl_asym8uxsym8u_asym8u_per_tensor_out( - ET_UNUSED KernelRuntimeContext& ctx, - const Tensor& input, - const Tensor& weight, - const Tensor& 
bias, - IntArrayRef stride, - IntArrayRef padding, - IntArrayRef dilation, - int64_t groups, - int64_t in_zero_point, - int64_t weight_zero_point, - double bias_scale, - double output_scale, - int64_t output_zero_point, - ET_UNUSED int64_t out_multiplier, - ET_UNUSED int64_t out_shift, - Tensor& out) { - quantized_conv1d_ncl( - input, - weight, - bias, - stride, - padding, - dilation, - static_cast(groups), - static_cast(in_zero_point), - static_cast(weight_zero_point), - static_cast(bias_scale), - static_cast(output_scale), - static_cast(output_zero_point), - out); - return out; -} - -Tensor& quantized_conv1d_nlc_asym8sxsym8s_asym8s_per_tensor_out( - ET_UNUSED KernelRuntimeContext& ctx, - const Tensor& input, - const Tensor& weight, - const Tensor& bias, - IntArrayRef stride, - IntArrayRef padding, - IntArrayRef dilation, - int64_t groups, - int64_t in_zero_point, - int64_t weight_zero_point, - double bias_scale, - double output_scale, - int64_t output_zero_point, - ET_UNUSED int64_t out_multiplier, - ET_UNUSED int64_t out_shift, - Tensor& out) { - quantized_conv1d_nlc( - input, - weight, - bias, - stride, - padding, - dilation, - static_cast(groups), - static_cast(in_zero_point), - static_cast(weight_zero_point), - static_cast(bias_scale), - static_cast(output_scale), - static_cast(output_zero_point), - out); - return out; -} - -Tensor& quantized_conv1d_nlc_asym8uxsym8u_asym8u_per_tensor_out( - ET_UNUSED KernelRuntimeContext& ctx, - const Tensor& input, - const Tensor& weight, - const Tensor& bias, - IntArrayRef stride, - IntArrayRef padding, - IntArrayRef dilation, - int64_t groups, - int64_t in_zero_point, - int64_t weight_zero_point, - double bias_scale, - double output_scale, - int64_t output_zero_point, - ET_UNUSED int64_t out_multiplier, - ET_UNUSED int64_t out_shift, - Tensor& out) { - quantized_conv1d_nlc( - input, - weight, - bias, - stride, - padding, - dilation, - static_cast(groups), - static_cast(in_zero_point), - static_cast(weight_zero_point), - 
static_cast(bias_scale), - static_cast(output_scale), - static_cast(output_zero_point), - out); - return out; -} - } // namespace native } // namespace generic } // namespace impl diff --git a/backends/cadence/generic/operators/op_quantized_conv1d.h b/backends/cadence/generic/operators/op_quantized_conv1d.h deleted file mode 100644 index 5cb79ab09fa..00000000000 --- a/backends/cadence/generic/operators/op_quantized_conv1d.h +++ /dev/null @@ -1,96 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -#include -#include - -namespace impl { -namespace generic { -namespace native { - -executorch::aten::Tensor& -quantized_conv1d_ncl_asym8sxsym8s_asym8s_per_tensor_out( - executorch::runtime::KernelRuntimeContext& ctx, - const executorch::aten::Tensor& input, - const executorch::aten::Tensor& weight, - const executorch::aten::Tensor& bias, - executorch::aten::IntArrayRef stride, - executorch::aten::IntArrayRef padding, - executorch::aten::IntArrayRef dilation, - int64_t groups, - int64_t in_zero_point, - int64_t weight_zero_point, - double bias_scale, - double output_scale, - int64_t output_zero_point, - int64_t out_multiplier, - int64_t out_shift, - executorch::aten::Tensor& out); - -executorch::aten::Tensor& -quantized_conv1d_ncl_asym8uxsym8u_asym8u_per_tensor_out( - executorch::runtime::KernelRuntimeContext& ctx, - const executorch::aten::Tensor& input, - const executorch::aten::Tensor& weight, - const executorch::aten::Tensor& bias, - executorch::aten::IntArrayRef stride, - executorch::aten::IntArrayRef padding, - executorch::aten::IntArrayRef dilation, - int64_t groups, - int64_t in_zero_point, - int64_t weight_zero_point, - double bias_scale, - double output_scale, - int64_t output_zero_point, - int64_t out_multiplier, - int64_t out_shift, - executorch::aten::Tensor& 
out); - -executorch::aten::Tensor& -quantized_conv1d_nlc_asym8sxsym8s_asym8s_per_tensor_out( - executorch::runtime::KernelRuntimeContext& ctx, - const executorch::aten::Tensor& input, - const executorch::aten::Tensor& weight, - const executorch::aten::Tensor& bias, - executorch::aten::IntArrayRef stride, - executorch::aten::IntArrayRef padding, - executorch::aten::IntArrayRef dilation, - int64_t groups, - int64_t in_zero_point, - int64_t weight_zero_point, - double bias_scale, - double output_scale, - int64_t output_zero_point, - int64_t out_multiplier, - int64_t out_shift, - executorch::aten::Tensor& out); - -executorch::aten::Tensor& -quantized_conv1d_nlc_asym8uxsym8u_asym8u_per_tensor_out( - executorch::runtime::KernelRuntimeContext& ctx, - const executorch::aten::Tensor& input, - const executorch::aten::Tensor& weight, - const executorch::aten::Tensor& bias, - executorch::aten::IntArrayRef stride, - executorch::aten::IntArrayRef padding, - executorch::aten::IntArrayRef dilation, - int64_t groups, - int64_t in_zero_point, - int64_t weight_zero_point, - double bias_scale, - double output_scale, - int64_t output_zero_point, - int64_t out_multiplier, - int64_t out_shift, - executorch::aten::Tensor& out); - -} // namespace native -} // namespace generic -} // namespace impl diff --git a/backends/cadence/generic/operators/op_quantized_conv1d_ncl.cpp b/backends/cadence/generic/operators/op_quantized_conv1d_ncl.cpp new file mode 100644 index 00000000000..c013b2f7da0 --- /dev/null +++ b/backends/cadence/generic/operators/op_quantized_conv1d_ncl.cpp @@ -0,0 +1,285 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include + +#include +#include +#include +#include +#include + +namespace impl { +namespace generic { +namespace native { + +namespace { +using ::executorch::aten::IntArrayRef; +using ::executorch::aten::ScalarType; +using ::executorch::aten::Tensor; +using ::executorch::runtime::KernelRuntimeContext; +using ::impl::generic::kernels::quantize; + +// This implements a generic 1d conv kernel that operates on raw pointers. +// The quantized version handles quantized convolutions for 1D inputs. +// The input is of shape [n x c x w] +// The weight is of shape [oc x wc x ww], where wc == c +// The output is of shape [n x oc x ow] +// The bias is of shape [oc] + +template < + typename IT = float, + typename WT = IT, + typename BT = IT, + typename OT = IT, + bool quantized = false> +__attribute__((noinline)) void conv1d_ncl_core_generic( + // All the arrays + const IT* __restrict__ p_in, + const WT* __restrict__ p_weight, + const BT* __restrict__ p_bias, + OT* __restrict__ p_out, + // The array sizes + int32_t n, + int32_t c, + int32_t w, + int32_t oc, + int32_t wc, + int32_t ww, + int32_t ow, + // Stride + int16_t s, + // Padding + int16_t p, + // Dilation + int16_t d, + // Group for depthwise conv + int16_t groups, + // Optional args that are only relevant for quantized convolution + // input zero point + IT in_zero_point = 0, + // weight zero point + int32_t weight_zero_point = 0, + float bias_scale = 1, + float out_scale = 1, + OT out_zero_point = 0) { + float inv_out_scale = 1. 
/ out_scale; + bool zero_pad_unit_dilation = d == 1 && p == 0; + + // Compute the number of in and out channels per group + const int ocpg = oc / groups; + const int icpg = c / groups; + + // Iterate over all the output batches (i.e., n) + for (int _n = 0; _n < n; ++_n) { + const IT* in_batch = p_in + _n * c * w; + OT* out_batch = p_out + _n * oc * ow; + // Compute separable convolution for each group + for (int _g = 0; _g < groups; ++_g) { + // Identify the input and output channels involved in the computation + // of this group + int sic = _g * icpg; + int soc = _g * ocpg; + // Populate all the output channels in the group + for (int _oc = soc; _oc < soc + ocpg; ++_oc) { + OT* out_plane = out_batch + _oc * ow; + const WT* weight_batch = p_weight + _oc * wc * ww; + // We compute one output channel at a time. The computation can be + // thought of as a stencil computation: we iterate over an input of size + // icpg x w, with a stencil of size icpg x ww, to compute an + // output channel of size 1 x ow. + for (int _w = 0, _ow = 0; _ow < ow; _w += s, ++_ow) { + float acc = p_bias[_oc]; + // Below is the stencil computation that performs the hadamard + // product+accumulation of each input channel (contributing to the + // output channel being computed) with the corresponding weight + // channel. + // If the padding is 0, and dilation is 1, then we can remove the + // unnecessary checks, and simplify the code so that it can be + // vectorized by Tensilica compiler. + if (zero_pad_unit_dilation) { + for (int _ic = sic; _ic < sic + icpg; ++_ic) { + const IT* in_plane = in_batch + _ic * w; + const WT* weight_plane = weight_batch + (_ic - sic) * ww; + for (int _ww = 0; _ww < ww; ++_ww) { + int ioff = _w + _ww; + int woff = _ww; + float lhs = in_plane[ioff] - in_zero_point; + float rhs = + weight_plane[woff] - (quantized ? 
weight_zero_point : 0); + acc += lhs * rhs; + } + } + } else { + for (int _ic = sic; _ic < sic + icpg; ++_ic) { + const IT* in_plane = in_batch + _ic * w; + const WT* weight_plane = weight_batch + (_ic - sic) * ww; + for (int _ww = 0; _ww < ww; ++_ww) { + if (((_w + d * _ww - p) >= 0) && ((_w + d * _ww - p) < w)) { + int ioff = _w + d * _ww - p; + int woff = _ww; + float lhs = in_plane[ioff] - in_zero_point; + float rhs = + weight_plane[woff] - (quantized ? weight_zero_point : 0); + acc += lhs * rhs; + } + } + } + } + if (quantized) { + float val = bias_scale * acc; + out_plane[_ow] = quantize(val, inv_out_scale, out_zero_point); + } else { + out_plane[_ow] = acc; + } + } + } + } + } +} + +void quantized_conv1d_ncl( + const Tensor& input, + const Tensor& weight, + const Tensor& bias, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dilation, + int16_t groups, + int32_t in_zero_point, + int32_t weight_zero_point, + float bias_scale, + float output_scale, + int32_t output_zero_point, + Tensor& out) { + // input = [n, c, w] + const int n = input.size(0); + const int c = input.size(1); + const int w = input.size(2); + // weight = [oc, wc, ww] + const int oc = weight.size(0); + const int wc = weight.size(1); + const int ww = weight.size(2); + // output = [n, oc, ow] + const int ow = out.size(2); + +#define typed_quantized_conv1d_ncl(ctype, dtype) \ + case ScalarType::dtype: { \ + conv1d_ncl_core_generic( \ + input.const_data_ptr(), \ + weight.const_data_ptr(), \ + bias.const_data_ptr(), \ + out.mutable_data_ptr(), \ + n, \ + c, \ + w, \ + oc, \ + wc, \ + ww, \ + ow, \ + stride[0], \ + padding[0], \ + dilation[0], \ + groups, \ + in_zero_point, \ + weight_zero_point, \ + bias_scale, \ + output_scale, \ + (ctype)output_zero_point); \ + break; \ + } + ScalarType dtype = out.scalar_type(); + switch (dtype) { + ET_FORALL_CADENCE_QUANTIZED_TYPES(typed_quantized_conv1d_ncl); + default: + ET_DCHECK_MSG( + false, "Unhandled dtype %s", torch::executor::toString(dtype)); 
+ } + +#undef typed_quantized_conv1d_ncl +} + +} // namespace + +// Public exported kernel functions + +::executorch::aten::Tensor& quantized_conv1d_ncl_out( + KernelRuntimeContext& ctx, + const Tensor& input, + const Tensor& weight, + const Tensor& bias, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dilation, + int64_t groups, + int64_t input_zero_point, + const Tensor& weight_zero_point, + const Tensor& bias_scale, + double output_scale, + int64_t output_zero_point, + const Tensor& out_multiplier, + const Tensor& out_shift, + Tensor& out) { + (void)ctx; + (void)out_multiplier; + (void)out_shift; + quantized_conv1d_ncl( + input, + weight, + bias, + stride, + padding, + dilation, + static_cast(groups), + static_cast(input_zero_point), + weight_zero_point.const_data_ptr()[0], + bias_scale.const_data_ptr()[0], + static_cast(output_scale), + static_cast(output_zero_point), + out); + return out; +} + +::executorch::aten::Tensor& quantized_conv1d_ncl_per_tensor_out( + KernelRuntimeContext& ctx, + const Tensor& input, + const Tensor& weight, + const Tensor& bias, + IntArrayRef stride, + IntArrayRef padding, + __ET_UNUSED IntArrayRef dilation, + int64_t groups, + int64_t input_zero_point, + int64_t weight_zero_point, + double bias_scale, + double output_scale, + int64_t output_zero_point, + __ET_UNUSED int64_t out_multiplier, + __ET_UNUSED int64_t out_shift, + Tensor& out) { + (void)ctx; + quantized_conv1d_ncl( + input, + weight, + bias, + stride, + padding, + {1}, + static_cast(groups), + static_cast(input_zero_point), + static_cast(weight_zero_point), + static_cast(bias_scale), + static_cast(output_scale), + static_cast(output_zero_point), + out); + return out; +} + +} // namespace native +} // namespace generic +} // namespace impl diff --git a/backends/cadence/generic/operators/op_quantized_conv1d_ncl.h b/backends/cadence/generic/operators/op_quantized_conv1d_ncl.h new file mode 100644 index 00000000000..f6854beff12 --- /dev/null +++ 
b/backends/cadence/generic/operators/op_quantized_conv1d_ncl.h @@ -0,0 +1,61 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include + +namespace impl { +namespace generic { +namespace native { + +using ::executorch::aten::IntArrayRef; +using ::executorch::aten::Tensor; +using ::executorch::runtime::KernelRuntimeContext; + +// NCL format (N=batch, C=channels, L=length) +::executorch::aten::Tensor& quantized_conv1d_ncl_out( + KernelRuntimeContext& ctx, + const Tensor& input, + const Tensor& weight, + const Tensor& bias, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dilation, + int64_t groups, + int64_t input_zero_point, + const Tensor& weight_zero_point, + const Tensor& bias_scale, + double output_scale, + int64_t output_zero_point, + const Tensor& out_multiplier, + const Tensor& out_shift, + Tensor& out); + +::executorch::aten::Tensor& quantized_conv1d_ncl_per_tensor_out( + KernelRuntimeContext& ctx, + const Tensor& input, + const Tensor& weight, + const Tensor& bias, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dilation, + int64_t groups, + int64_t input_zero_point, + int64_t weight_zero_point, + double bias_scale, + double output_scale, + int64_t output_zero_point, + int64_t out_multiplier, + int64_t out_shift, + Tensor& out); + +} // namespace native +} // namespace generic +} // namespace impl diff --git a/backends/cadence/generic/operators/op_quantized_conv1d_nlc.cpp b/backends/cadence/generic/operators/op_quantized_conv1d_nlc.cpp new file mode 100644 index 00000000000..b19ac059563 --- /dev/null +++ b/backends/cadence/generic/operators/op_quantized_conv1d_nlc.cpp @@ -0,0 +1,280 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +#include +#include +#include +#include +#include + +namespace impl { +namespace generic { +namespace native { + +namespace { +using ::executorch::aten::IntArrayRef; +using ::executorch::aten::ScalarType; +using ::executorch::aten::Tensor; +using ::executorch::runtime::KernelRuntimeContext; +using ::impl::generic::kernels::quantize; + +// This implements a generic 1d conv kernel that operates on raw pointers. +// The quantized version handles quantized convolutions for 1D inputs. +// The input is of shape [n x w x c] (NLC format) +// The weight is of shape [oc x ww x wc], where wc == c +// The output is of shape [n x ow x oc] +// The bias is of shape [oc] + +template < + typename IT = float, + typename WT = IT, + typename BT = IT, + typename OT = IT, + bool quantized = false> +__attribute__((noinline)) void conv1d_nlc_core_generic( + // All the arrays + const IT* __restrict__ p_in, + const WT* __restrict__ p_weight, + const BT* __restrict__ p_bias, + OT* __restrict__ p_out, + // The array sizes + int32_t n, + int32_t w, + int32_t c, + int32_t oc, + int32_t ww, + int32_t wc, + int32_t ow, + // Stride + int16_t s, + // Padding + int16_t p, + // Dilation + int16_t d, + // Group for depthwise conv + int16_t groups, + // Optional args that are only relevant for quantized convolution + // input zero point + IT in_zero_point = 0, + // weight zero point + int32_t weight_zero_point = 0, + float bias_scale = 1, + float out_scale = 1, + OT out_zero_point = 0) { + float inv_out_scale = 1. 
/ out_scale; + bool zero_pad_unit_dilation = d == 1 && p == 0; + + // Compute the number of in and out channels per group + const int ocpg = oc / groups; + const int icpg = c / groups; + + // Iterate over all the output batches (i.e., n) + for (int _n = 0; _n < n; ++_n) { + const IT* in_batch = p_in + _n * w * c; + OT* out_batch = p_out + _n * ow * oc; + for (int _w = 0, _ow = 0; _ow < ow; _w += s, ++_ow) { + OT* out_line = out_batch + _ow * oc; + // Compute separable convolution for each group + for (int _g = 0; _g < groups; ++_g) { + // Identify the input and output channels involved in the computation + // of this group + int sic = _g * icpg; + int soc = _g * ocpg; + // Populate all the output channels in the group + for (int _oc = soc; _oc < soc + ocpg; ++_oc) { + const WT* weight_batch = p_weight + _oc * ww * wc; + // We compute one output channel at a time. The computation can be + // thought of as a stencil computation: we iterate over an input of + // size w x icpg, with a stencil of size ww x icpg, to + // compute an output channel of size ow x 1. + float acc = p_bias[_oc]; + // Below is the stencil computation that performs the hadamard + // product+accumulation of each input channel (contributing to + // the output channel being computed) with the corresponding + // weight channel. If the padding is 0, and dilation is 1, then + // we can remove the unnecessary checks, and simplify the code + // so that it can be vectorized by Tensilica compiler. + if (zero_pad_unit_dilation) { + for (int _ww = 0; _ww < ww; ++_ww) { + const IT* in_line = in_batch + (_w + _ww) * c; + const WT* weight_line = weight_batch + _ww * wc; + for (int _ic = sic; _ic < sic + icpg; ++_ic) { + float lhs = in_line[_ic] - in_zero_point; + float rhs = weight_line[_ic - sic] - + (quantized ? 
weight_zero_point : 0); + acc += lhs * rhs; + } + } + } else { + for (int _ww = 0; _ww < ww; ++_ww) { + if (((_w + d * _ww - p) >= 0) && ((_w + d * _ww - p) < w)) { + const IT* in_line = in_batch + (_w + d * _ww - p) * c; + const WT* weight_line = weight_batch + _ww * wc; + for (int _ic = sic; _ic < sic + icpg; ++_ic) { + float lhs = in_line[_ic] - in_zero_point; + float rhs = weight_line[_ic - sic] - + (quantized ? weight_zero_point : 0); + acc += lhs * rhs; + } + } + } + } + if (quantized) { + float val = bias_scale * acc; + out_line[_oc] = quantize(val, inv_out_scale, out_zero_point); + } else { + out_line[_oc] = acc; + } + } + } + } + } +} + +void quantized_conv1d_nlc( + const Tensor& input, + const Tensor& weight, + const Tensor& bias, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dilation, + int16_t groups, + int32_t in_zero_point, + int32_t weight_zero_point, + float bias_scale, + float output_scale, + int32_t output_zero_point, + Tensor& out) { + // input = [n, w, c] + const int n = input.size(0); + const int w = input.size(1); + const int c = input.size(2); + // weight = [oc, ww, wc] + const int oc = weight.size(0); + const int ww = weight.size(1); + const int wc = weight.size(2); + // output = [n, ow, oc] + const int ow = out.size(1); + +#define typed_quantized_conv1d_nlc(ctype, dtype) \ + case ScalarType::dtype: { \ + conv1d_nlc_core_generic( \ + input.const_data_ptr(), \ + weight.const_data_ptr(), \ + bias.const_data_ptr(), \ + out.mutable_data_ptr(), \ + n, \ + w, \ + c, \ + oc, \ + ww, \ + wc, \ + ow, \ + stride[0], \ + padding[0], \ + dilation[0], \ + groups, \ + in_zero_point, \ + weight_zero_point, \ + bias_scale, \ + output_scale, \ + (ctype)output_zero_point); \ + break; \ + } + ScalarType dtype = out.scalar_type(); + switch (dtype) { + ET_FORALL_CADENCE_QUANTIZED_TYPES(typed_quantized_conv1d_nlc); + default: + ET_DCHECK_MSG( + false, "Unhandled dtype %s", torch::executor::toString(dtype)); + } + +#undef typed_quantized_conv1d_nlc +} 
+ +} // namespace + +// Public exported kernel functions + +::executorch::aten::Tensor& quantized_conv1d_nlc_out( + KernelRuntimeContext& ctx, + const Tensor& input, + const Tensor& weight, + const Tensor& bias, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dilation, + int64_t groups, + int64_t input_zero_point, + const Tensor& weight_zero_point, + const Tensor& bias_scale, + double output_scale, + int64_t output_zero_point, + const Tensor& out_multiplier, + const Tensor& out_shift, + Tensor& out) { + (void)ctx; + (void)out_multiplier; + (void)out_shift; + quantized_conv1d_nlc( + input, + weight, + bias, + stride, + padding, + dilation, + static_cast(groups), + static_cast(input_zero_point), + weight_zero_point.const_data_ptr()[0], + bias_scale.const_data_ptr()[0], + static_cast(output_scale), + static_cast(output_zero_point), + out); + return out; +} + +::executorch::aten::Tensor& quantized_conv1d_nlc_per_tensor_out( + KernelRuntimeContext& ctx, + const Tensor& input, + const Tensor& weight, + const Tensor& bias, + IntArrayRef stride, + IntArrayRef padding, + __ET_UNUSED IntArrayRef dilation, + int64_t groups, + int64_t input_zero_point, + int64_t weight_zero_point, + double bias_scale, + double output_scale, + int64_t output_zero_point, + __ET_UNUSED int64_t out_multiplier, + __ET_UNUSED int64_t out_shift, + Tensor& out) { + (void)ctx; + quantized_conv1d_nlc( + input, + weight, + bias, + stride, + padding, + {1}, + static_cast(groups), + static_cast(input_zero_point), + static_cast(weight_zero_point), + static_cast(bias_scale), + static_cast(output_scale), + static_cast(output_zero_point), + out); + return out; +} + +} // namespace native +} // namespace generic +} // namespace impl diff --git a/backends/cadence/generic/operators/op_quantized_conv1d_nlc.h b/backends/cadence/generic/operators/op_quantized_conv1d_nlc.h new file mode 100644 index 00000000000..7713121cf97 --- /dev/null +++ b/backends/cadence/generic/operators/op_quantized_conv1d_nlc.h @@ 
-0,0 +1,61 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include + +namespace impl { +namespace generic { +namespace native { + +using ::executorch::aten::IntArrayRef; +using ::executorch::aten::Tensor; +using ::executorch::runtime::KernelRuntimeContext; + +// NLC format (N=batch, L=length, C=channels) +::executorch::aten::Tensor& quantized_conv1d_nlc_out( + KernelRuntimeContext& ctx, + const Tensor& input, + const Tensor& weight, + const Tensor& bias, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dilation, + int64_t groups, + int64_t input_zero_point, + const Tensor& weight_zero_point, + const Tensor& bias_scale, + double output_scale, + int64_t output_zero_point, + const Tensor& out_multiplier, + const Tensor& out_shift, + Tensor& out); + +::executorch::aten::Tensor& quantized_conv1d_nlc_per_tensor_out( + KernelRuntimeContext& ctx, + const Tensor& input, + const Tensor& weight, + const Tensor& bias, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dilation, + int64_t groups, + int64_t input_zero_point, + int64_t weight_zero_point, + double bias_scale, + double output_scale, + int64_t output_zero_point, + int64_t out_multiplier, + int64_t out_shift, + Tensor& out); + +} // namespace native +} // namespace generic +} // namespace impl diff --git a/backends/cadence/generic/operators/targets.bzl b/backends/cadence/generic/operators/targets.bzl index 77535466d46..faa63e4f46f 100644 --- a/backends/cadence/generic/operators/targets.bzl +++ b/backends/cadence/generic/operators/targets.bzl @@ -121,9 +121,22 @@ def define_common_targets(): ) runtime.cxx_library( - name = "op_quantized_conv1d", - srcs = ["op_quantized_conv1d.cpp"], - exported_headers = ["op_quantized_conv1d.h"], + name = "op_quantized_conv1d_ncl", + srcs = 
["op_quantized_conv1d_ncl.cpp"], + exported_headers = ["op_quantized_conv1d_ncl.h"], + platforms = CXX, + deps = [ + ":cadence_type_util", + "//executorch/backends/cadence/generic/kernels:cadence_kernels", + "//executorch/runtime/kernel:kernel_includes", + ], + visibility = ["PUBLIC"], + ) + + runtime.cxx_library( + name = "op_quantized_conv1d_nlc", + srcs = ["op_quantized_conv1d_nlc.cpp"], + exported_headers = ["op_quantized_conv1d_nlc.h"], platforms = CXX, deps = [ ":cadence_type_util", diff --git a/backends/cadence/hifi/operators/CMakeLists.txt b/backends/cadence/hifi/operators/CMakeLists.txt index 185ea390c0e..626033a5a43 100644 --- a/backends/cadence/hifi/operators/CMakeLists.txt +++ b/backends/cadence/hifi/operators/CMakeLists.txt @@ -124,11 +124,9 @@ add_library( "op_quantized_relu_asym8u_asym8u_per_tensor_out.cpp" "op_dequantize_per_tensor.cpp" "op_dequantize_per_tensor_asym8s.cpp" - "op_quantized_conv1d_ncl_asym8sxsym8s_asym8s_per_tensor_out.cpp" - "op_quantized_conv1d_ncl_asym8uxsym8u_asym8u_per_tensor_out.cpp" - "op_quantized_conv1d_nlc_asym8sxsym8s_asym8s_per_tensor_out.cpp" - "op_quantized_conv1d_nlc_asym8uxsym8u_asym8u_per_tensor_out.cpp" "op_quantized_conv2d_nchw_out.cpp" + "op_quantized_conv1d_ncl.cpp" + "op_quantized_conv1d_nlc.cpp" "op_quantized_conv2d_nchw_asym8sxsym8s_asym8s_per_tensor_out.cpp" "op_quantized_conv2d_nchw_asym8uxsym8u_asym8u_per_tensor_out.cpp" "op_quantized_conv2d_nchw_depthwise_asym8sxsym8s_asym8s_per_tensor_out.cpp" diff --git a/backends/cadence/hifi/operators/op_quantized_conv1d_ncl_asym8sxsym8s_asym8s_per_tensor_out.cpp b/backends/cadence/hifi/operators/op_quantized_conv1d_ncl.cpp similarity index 54% rename from backends/cadence/hifi/operators/op_quantized_conv1d_ncl_asym8sxsym8s_asym8s_per_tensor_out.cpp rename to backends/cadence/hifi/operators/op_quantized_conv1d_ncl.cpp index f543f4633cf..9b0ccf4ea25 100644 --- a/backends/cadence/hifi/operators/op_quantized_conv1d_ncl_asym8sxsym8s_asym8s_per_tensor_out.cpp +++ 
b/backends/cadence/hifi/operators/op_quantized_conv1d_ncl.cpp @@ -21,6 +21,8 @@ namespace impl { namespace HiFi { namespace native { +namespace { + // Optimized NCHW 1D convolution for int8 x int8 -> int8 void xa_opt_quantized_conv1d_ncl_asym8sxsym8s_asym8s( KernelRuntimeContext& ctx, @@ -199,7 +201,144 @@ void xa_opt_quantized_conv1d_ncl_asym8sxsym8s_asym8s( } } -void quantized_conv1d_ncl_asym8sxsym8s_asym8s_per_tensor_out( +// Optimized NCHW 1D convolution for uint8 x uint8 -> uint8 +void xa_opt_quantized_conv1d_ncl_asym8uxsym8u_asym8u( + KernelRuntimeContext& ctx, + const Tensor& input, + const Tensor& weight, + const Tensor& bias, + IntArrayRef stride, + IntArrayRef padding, + int32_t in_zero_point, + int32_t weight_zero_point, + float bias_scale, + float output_scale, + int32_t output_zero_point, + Tensor& out) { + constexpr int kNnlibMaxDim = 5; + + UWORD8* __restrict__ p_out = + (UWORD8* __restrict__)out.mutable_data_ptr(); + UWORD8* __restrict__ p_inp = + (UWORD8* __restrict__)input.const_data_ptr(); + UWORD8* __restrict__ p_kernel = + (UWORD8* __restrict__)weight.const_data_ptr(); + WORD32* __restrict__ p_bias = + (WORD32* __restrict__)bias.const_data_ptr(); + + WORD32 batches = input.size(0); + WORD32 input_channels = input.size(1); + WORD32 input_width = input.size(2); + WORD32 input_height = 1; + WORD32 kernel_height = 1; + WORD32 out_channels = weight.size(0); + WORD32 kernel_channels = weight.size(1); + WORD32 kernel_width = weight.size(2); + WORD32 out_width = out.size(2); + WORD32 out_height = 1; + WORD32 x_stride = stride[1]; + WORD32 x_padding = padding[1]; + WORD32 input_zero_bias = -in_zero_point; + WORD32 out_multiplier32 = bias_scale * (1. 
/ output_scale) * 2147483648; + WORD32 out_shift32 = 0; + WORD32 kernel_zero_bias = -weight_zero_point; + + WORD32 out_zero_bias = output_zero_point; + WORD32 out_data_format = 1; + + WORD32 scratch_size = + xa_nn_conv1d_std_getsize(kernel_width, input_width, input_channels, 8); + scratch_size = scratch_size < 0 ? 0 : scratch_size; + WORD32* ptr_scratch = + (WORD32*)kernels::allocate_temp_memory(ctx, scratch_size); + pVOID p_scratch = (pVOID)ALIGN_PTR(ptr_scratch, 8); + + WORD8* ptr1 = (WORD8*)kernels::allocate_temp_memory( + ctx, ((batches * input_channels * input_width) + 8) * sizeof(WORD8)); + WORD8* ptr2 = (WORD8*)kernels::allocate_temp_memory( + ctx, + ((out_channels * kernel_channels * kernel_width) + 8) * sizeof(WORD8)); + WORD8* pin = (WORD8*)ALIGN_PTR(ptr1, 8); + WORD8* pkernel = (WORD8*)ALIGN_PTR(ptr2, 8); + + WORD32 p_inp_shape[kNnlibMaxDim]; + p_inp_shape[0] = 1; + p_inp_shape[1] = 1; + p_inp_shape[2] = batches; + p_inp_shape[3] = input_channels; + p_inp_shape[4] = input_width; + + WORD32 p_out_shape[kNnlibMaxDim]; + p_out_shape[0] = 1; + p_out_shape[1] = 1; + p_out_shape[2] = batches; + p_out_shape[3] = input_width; + p_out_shape[4] = input_channels; + + WORD32 p_permute_vec[kNnlibMaxDim] = {0, 1, 2, 4, 3}; + + xa_nn_transpose_8_8( + (WORD8*)pin, + p_out_shape, + (WORD8*)p_inp, + p_inp_shape, + p_permute_vec, + kNnlibMaxDim, + kNnlibMaxDim); + + WORD32 p_inp_shape1[kNnlibMaxDim]; + p_inp_shape1[0] = 1; + p_inp_shape1[1] = 1; + p_inp_shape1[2] = out_channels; + p_inp_shape1[3] = kernel_channels; + p_inp_shape1[4] = kernel_width; + + WORD32 p_out_shape1[kNnlibMaxDim]; + p_out_shape1[0] = 1; + p_out_shape1[1] = 1; + p_out_shape1[2] = out_channels; + p_out_shape1[3] = kernel_width; + p_out_shape1[4] = kernel_channels; + + xa_nn_transpose_8_8( + (WORD8*)pkernel, + p_out_shape1, + (WORD8*)p_kernel, + p_inp_shape1, + p_permute_vec, + kNnlibMaxDim, + kNnlibMaxDim); + + for (int _n = 0; _n < batches; _n++) { + UWORD8* in_batch = (UWORD8*)(pin + _n * 
input_channels * input_width); + UWORD8* out_batch = (UWORD8*)(p_out + _n * out_channels * out_width); + + xa_nn_conv1d_std_asym8uxasym8u( + out_batch, + in_batch, + (UWORD8*)pkernel, + p_bias, + input_width, + input_height, + input_channels, + kernel_width, + out_channels, + x_stride, + x_padding, + out_width, + input_zero_bias, + kernel_zero_bias, + out_multiplier32, + out_shift32, + out_zero_bias, + out_data_format, + p_scratch); + } +} + +} // namespace + +void quantized_conv1d_ncl_per_tensor_out( KernelRuntimeContext& ctx, const Tensor& input, const Tensor& weight, @@ -216,19 +355,42 @@ void quantized_conv1d_ncl_asym8sxsym8s_asym8s_per_tensor_out( __ET_UNUSED int64_t out_multiplier, __ET_UNUSED int64_t out_shift, Tensor& out) { - xa_opt_quantized_conv1d_ncl_asym8sxsym8s_asym8s( - ctx, - input, - weight, - bias, - stride, - padding, - in_zero_point, - weight_zero_point, - bias_scale, - output_scale, - output_zero_point, - out); + ScalarType dtype = out.scalar_type(); + + if (dtype == ScalarType::Char) { + xa_opt_quantized_conv1d_ncl_asym8sxsym8s_asym8s( + ctx, + input, + weight, + bias, + stride, + padding, + static_cast(in_zero_point), + static_cast(weight_zero_point), + static_cast(bias_scale), + static_cast(output_scale), + static_cast(output_zero_point), + out); + } else if (dtype == ScalarType::Byte) { + xa_opt_quantized_conv1d_ncl_asym8uxsym8u_asym8u( + ctx, + input, + weight, + bias, + stride, + padding, + static_cast(in_zero_point), + static_cast(weight_zero_point), + static_cast(bias_scale), + static_cast(output_scale), + static_cast(output_zero_point), + out); + } else { + ET_DCHECK_MSG( + false, + "Unhandled dtype %s for quantized_conv1d_ncl", + torch::executor::toString(dtype)); + } } } // namespace native diff --git a/backends/cadence/hifi/operators/op_quantized_conv1d_ncl_asym8uxsym8u_asym8u_per_tensor_out.cpp b/backends/cadence/hifi/operators/op_quantized_conv1d_ncl_asym8uxsym8u_asym8u_per_tensor_out.cpp deleted file mode 100644 index 
4ad36a3b5fa..00000000000 --- a/backends/cadence/hifi/operators/op_quantized_conv1d_ncl_asym8uxsym8u_asym8u_per_tensor_out.cpp +++ /dev/null @@ -1,193 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include -#include -#include - -#define ALIGN_PTR(x, bytes) ((((unsigned)(x)) + (bytes - 1)) & (~(bytes - 1))) - -using Tensor = executorch::aten::Tensor; -using KernelRuntimeContext = torch::executor::KernelRuntimeContext; -using ScalarType = executorch::aten::ScalarType; -using ::executorch::aten::IntArrayRef; - -namespace impl { -namespace HiFi { -namespace native { - -// Optimized NCHW 1D convolution for uint8 x uint8 -> uint8 -void xa_opt_quantized_conv1d_ncl_asym8uxsym8u_asym8u( - KernelRuntimeContext& ctx, - const Tensor& input, - const Tensor& weight, - const Tensor& bias, - IntArrayRef stride, - IntArrayRef padding, - int32_t in_zero_point, - int32_t weight_zero_point, - float bias_scale, - float output_scale, - int32_t output_zero_point, - Tensor& out) { - constexpr int kNnlibMaxDim = 5; - - UWORD8* __restrict__ p_out = - (UWORD8* __restrict__)out.mutable_data_ptr(); - UWORD8* __restrict__ p_inp = - (UWORD8* __restrict__)input.const_data_ptr(); - UWORD8* __restrict__ p_kernel = - (UWORD8* __restrict__)weight.const_data_ptr(); - WORD32* __restrict__ p_bias = - (WORD32* __restrict__)bias.const_data_ptr(); - - WORD32 batches = input.size(0); - WORD32 input_channels = input.size(1); - WORD32 input_width = input.size(2); - WORD32 input_height = 1; - WORD32 kernel_height = 1; - WORD32 out_channels = weight.size(0); - WORD32 kernel_channels = weight.size(1); - WORD32 kernel_width = weight.size(2); - WORD32 out_width = out.size(2); - WORD32 out_height = 1; - WORD32 x_stride = stride[1]; - WORD32 x_padding = padding[1]; - WORD32 input_zero_bias = -in_zero_point; - WORD32 
out_multiplier32 = bias_scale * (1. / output_scale) * 2147483648; - WORD32 out_shift32 = 0; - WORD32 kernel_zero_bias = -weight_zero_point; - - WORD32 out_zero_bias = output_zero_point; - WORD32 out_data_format = 1; - - WORD32 scratch_size = - xa_nn_conv1d_std_getsize(kernel_width, input_width, input_channels, 8); - scratch_size = scratch_size < 0 ? 0 : scratch_size; - WORD32* ptr_scratch = - (WORD32*)kernels::allocate_temp_memory(ctx, scratch_size); - pVOID p_scratch = (pVOID)ALIGN_PTR(ptr_scratch, 8); - - WORD8* ptr1 = (WORD8*)kernels::allocate_temp_memory( - ctx, ((batches * input_channels * input_width) + 8) * sizeof(WORD8)); - WORD8* ptr2 = (WORD8*)kernels::allocate_temp_memory( - ctx, - ((out_channels * kernel_channels * kernel_width) + 8) * sizeof(WORD8)); - WORD8* pin = (WORD8*)ALIGN_PTR(ptr1, 8); - WORD8* pkernel = (WORD8*)ALIGN_PTR(ptr2, 8); - - WORD32 p_inp_shape[kNnlibMaxDim]; - p_inp_shape[0] = 1; - p_inp_shape[1] = 1; - p_inp_shape[2] = batches; - p_inp_shape[3] = input_channels; - p_inp_shape[4] = input_width; - - WORD32 p_out_shape[kNnlibMaxDim]; - p_out_shape[0] = 1; - p_out_shape[1] = 1; - p_out_shape[2] = batches; - p_out_shape[3] = input_width; - p_out_shape[4] = input_channels; - - WORD32 p_permute_vec[kNnlibMaxDim] = {0, 1, 2, 4, 3}; - - xa_nn_transpose_8_8( - (WORD8*)pin, - p_out_shape, - (WORD8*)p_inp, - p_inp_shape, - p_permute_vec, - kNnlibMaxDim, - kNnlibMaxDim); - - WORD32 p_inp_shape1[kNnlibMaxDim]; - p_inp_shape1[0] = 1; - p_inp_shape1[1] = 1; - p_inp_shape1[2] = out_channels; - p_inp_shape1[3] = kernel_channels; - p_inp_shape1[4] = kernel_width; - - WORD32 p_out_shape1[kNnlibMaxDim]; - p_out_shape1[0] = 1; - p_out_shape1[1] = 1; - p_out_shape1[2] = out_channels; - p_out_shape1[3] = kernel_width; - p_out_shape1[4] = kernel_channels; - - xa_nn_transpose_8_8( - (WORD8*)pkernel, - p_out_shape1, - (WORD8*)p_kernel, - p_inp_shape1, - p_permute_vec, - kNnlibMaxDim, - kNnlibMaxDim); - - for (int _n = 0; _n < batches; _n++) { - UWORD8* 
in_batch = (UWORD8*)(pin + _n * input_channels * input_width); - UWORD8* out_batch = (UWORD8*)(p_out + _n * out_channels * out_width); - - xa_nn_conv1d_std_asym8uxasym8u( - out_batch, - in_batch, - (UWORD8*)pkernel, - p_bias, - input_width, - input_height, - input_channels, - kernel_width, - out_channels, - x_stride, - x_padding, - out_width, - input_zero_bias, - kernel_zero_bias, - out_multiplier32, - out_shift32, - out_zero_bias, - out_data_format, - p_scratch); - } -} - -void quantized_conv1d_ncl_asym8uxsym8u_asym8u_per_tensor_out( - KernelRuntimeContext& ctx, - const Tensor& input, - const Tensor& weight, - const Tensor& bias, - IntArrayRef stride, - IntArrayRef padding, - __ET_UNUSED IntArrayRef dilation, - __ET_UNUSED int64_t groups, - int64_t in_zero_point, - int64_t weight_zero_point, - double bias_scale, - double output_scale, - int64_t output_zero_point, - __ET_UNUSED int64_t out_multiplier, - __ET_UNUSED int64_t out_shift, - Tensor& out) { - xa_opt_quantized_conv1d_ncl_asym8uxsym8u_asym8u( - ctx, - input, - weight, - bias, - stride, - padding, - in_zero_point, - weight_zero_point, - bias_scale, - output_scale, - output_zero_point, - out); -} - -} // namespace native -} // namespace HiFi -} // namespace impl diff --git a/backends/cadence/hifi/operators/op_quantized_conv1d_nlc_asym8sxsym8s_asym8s_per_tensor_out.cpp b/backends/cadence/hifi/operators/op_quantized_conv1d_nlc.cpp similarity index 57% rename from backends/cadence/hifi/operators/op_quantized_conv1d_nlc_asym8sxsym8s_asym8s_per_tensor_out.cpp rename to backends/cadence/hifi/operators/op_quantized_conv1d_nlc.cpp index 3b1c7b9a900..e40cca4a88a 100644 --- a/backends/cadence/hifi/operators/op_quantized_conv1d_nlc_asym8sxsym8s_asym8s_per_tensor_out.cpp +++ b/backends/cadence/hifi/operators/op_quantized_conv1d_nlc.cpp @@ -21,6 +21,8 @@ namespace impl { namespace HiFi { namespace native { +namespace { + // Optimized NHWC 1D convolution for int8 x int8 -> int8 void 
xa_opt_quantized_conv1d_nlc_asym8sxsym8s_asym8s( KernelRuntimeContext& ctx, @@ -141,7 +143,81 @@ void xa_opt_quantized_conv1d_nlc_asym8sxsym8s_asym8s( } } -void quantized_conv1d_nlc_asym8sxsym8s_asym8s_per_tensor_out( +// Optimized NHWC 1D convolution for uint8 x uint8 -> uint8 +void xa_opt_quantized_conv1d_nlc_asym8uxsym8u_asym8u( + KernelRuntimeContext& ctx, + const Tensor& input, + const Tensor& weight, + const Tensor& bias, + IntArrayRef stride, + IntArrayRef padding, + int32_t in_zero_point, + int32_t weight_zero_point, + float bias_scale, + float output_scale, + int32_t output_zero_point, + Tensor& out) { + UWORD8* __restrict__ p_out = + (UWORD8* __restrict__)out.mutable_data_ptr(); + UWORD8* __restrict__ p_inp = + (UWORD8* __restrict__)input.const_data_ptr(); + UWORD8* __restrict__ p_kernel = + (UWORD8* __restrict__)weight.const_data_ptr(); + WORD32* __restrict__ p_bias = + (WORD32* __restrict__)bias.const_data_ptr(); + + WORD32 batches = input.size(0); + WORD32 input_channels = input.size(2); + WORD32 input_width = input.size(1); + WORD32 out_channels = weight.size(2); + WORD32 kernel_width = weight.size(1); + WORD32 out_width = out.size(1); + WORD32 x_stride = stride[1]; + WORD32 x_padding = padding[1]; + WORD32 input_zero_bias = -in_zero_point; + WORD32 out_multiplier32 = bias_scale * (1. / output_scale) * 2147483648; + WORD32 out_shift32 = 0; + WORD32 kernel_zero_bias = -weight_zero_point; + + WORD32 out_zero_bias = output_zero_point; + WORD32 out_data_format = 0; + WORD32 scratch_size = + xa_nn_conv1d_std_getsize(kernel_width, input_width, input_channels, 8); + scratch_size = scratch_size < 0 ? 
0 : scratch_size; + WORD32* ptr_scratch = + (WORD32*)kernels::allocate_temp_memory(ctx, scratch_size); + pVOID p_scratch = (pVOID)ALIGN_PTR(ptr_scratch, 8); + + for (int _n = 0; _n < batches; _n++) { + UWORD8* in_batch = p_inp + _n * input_channels * input_width; + UWORD8* out_batch = p_out + _n * out_channels * out_width; + + xa_nn_conv1d_std_asym8uxasym8u( + out_batch, + in_batch, + p_kernel, + p_bias, + input_width, + 1, + input_channels, + kernel_width, + out_channels, + x_stride, + x_padding, + out_width, + input_zero_bias, + kernel_zero_bias, + out_multiplier32, + out_shift32, + out_zero_bias, + out_data_format, + p_scratch); + } +} + +} // namespace + +void quantized_conv1d_nlc_per_tensor_out( KernelRuntimeContext& ctx, const Tensor& input, const Tensor& weight, @@ -158,19 +234,42 @@ void quantized_conv1d_nlc_asym8sxsym8s_asym8s_per_tensor_out( __ET_UNUSED int64_t out_multiplier, __ET_UNUSED int64_t out_shift, Tensor& out) { - xa_opt_quantized_conv1d_nlc_asym8sxsym8s_asym8s( - ctx, - input, - weight, - bias, - stride, - padding, - in_zero_point, - weight_zero_point, - bias_scale, - output_scale, - output_zero_point, - out); + ScalarType dtype = out.scalar_type(); + + if (dtype == ScalarType::Char) { + xa_opt_quantized_conv1d_nlc_asym8sxsym8s_asym8s( + ctx, + input, + weight, + bias, + stride, + padding, + static_cast(in_zero_point), + static_cast(weight_zero_point), + static_cast(bias_scale), + static_cast(output_scale), + static_cast(output_zero_point), + out); + } else if (dtype == ScalarType::Byte) { + xa_opt_quantized_conv1d_nlc_asym8uxsym8u_asym8u( + ctx, + input, + weight, + bias, + stride, + padding, + static_cast(in_zero_point), + static_cast(weight_zero_point), + static_cast(bias_scale), + static_cast(output_scale), + static_cast(output_zero_point), + out); + } else { + ET_DCHECK_MSG( + false, + "Unhandled dtype %s for quantized_conv1d_nlc", + torch::executor::toString(dtype)); + } } } // namespace native diff --git 
a/backends/cadence/hifi/operators/op_quantized_conv1d_nlc_asym8uxsym8u_asym8u_per_tensor_out.cpp b/backends/cadence/hifi/operators/op_quantized_conv1d_nlc_asym8uxsym8u_asym8u_per_tensor_out.cpp deleted file mode 100644 index 5539410f46e..00000000000 --- a/backends/cadence/hifi/operators/op_quantized_conv1d_nlc_asym8uxsym8u_asym8u_per_tensor_out.cpp +++ /dev/null @@ -1,130 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include -#include -#include - -#define ALIGN_PTR(x, bytes) ((((unsigned)(x)) + (bytes - 1)) & (~(bytes - 1))) - -using Tensor = executorch::aten::Tensor; -using KernelRuntimeContext = torch::executor::KernelRuntimeContext; -using ScalarType = executorch::aten::ScalarType; -using ::executorch::aten::IntArrayRef; - -namespace impl { -namespace HiFi { -namespace native { - -// Optimized NHWC 1D convolution for uint8 x uint8 -> uint8 -void xa_opt_quantized_conv1d_nlc_asym8uxsym8u_asym8u( - KernelRuntimeContext& ctx, - const Tensor& input, - const Tensor& weight, - const Tensor& bias, - IntArrayRef stride, - IntArrayRef padding, - int32_t in_zero_point, - int32_t weight_zero_point, - float bias_scale, - float output_scale, - int32_t output_zero_point, - Tensor& out) { - UWORD8* __restrict__ p_out = - (UWORD8* __restrict__)out.mutable_data_ptr(); - UWORD8* __restrict__ p_inp = - (UWORD8* __restrict__)input.const_data_ptr(); - UWORD8* __restrict__ p_kernel = - (UWORD8* __restrict__)weight.const_data_ptr(); - WORD32* __restrict__ p_bias = - (WORD32* __restrict__)bias.const_data_ptr(); - - WORD32 batches = input.size(0); - WORD32 input_channels = input.size(2); - WORD32 input_width = input.size(1); - WORD32 out_channels = weight.size(2); - WORD32 kernel_width = weight.size(1); - WORD32 out_width = out.size(1); - WORD32 x_stride = stride[1]; - WORD32 x_padding = 
padding[1]; - WORD32 input_zero_bias = -in_zero_point; - WORD32 out_multiplier32 = bias_scale * (1. / output_scale) * 2147483648; - WORD32 out_shift32 = 0; - WORD32 kernel_zero_bias = -weight_zero_point; - - WORD32 out_zero_bias = output_zero_point; - WORD32 out_data_format = 0; - WORD32 scratch_size = - xa_nn_conv1d_std_getsize(kernel_width, input_width, input_channels, 8); - scratch_size = scratch_size < 0 ? 0 : scratch_size; - WORD32* ptr_scratch = - (WORD32*)kernels::allocate_temp_memory(ctx, scratch_size); - pVOID p_scratch = (pVOID)ALIGN_PTR(ptr_scratch, 8); - - for (int _n = 0; _n < batches; _n++) { - UWORD8* in_batch = p_inp + _n * input_channels * input_width; - UWORD8* out_batch = p_out + _n * out_channels * out_width; - - xa_nn_conv1d_std_asym8uxasym8u( - out_batch, - in_batch, - p_kernel, - p_bias, - input_width, - 1, - input_channels, - kernel_width, - out_channels, - x_stride, - x_padding, - out_width, - input_zero_bias, - kernel_zero_bias, - out_multiplier32, - out_shift32, - out_zero_bias, - out_data_format, - p_scratch); - } -} - -void quantized_conv1d_nlc_asym8uxsym8u_asym8u_per_tensor_out( - KernelRuntimeContext& ctx, - const Tensor& input, - const Tensor& weight, - const Tensor& bias, - IntArrayRef stride, - IntArrayRef padding, - __ET_UNUSED IntArrayRef dilation, - __ET_UNUSED int64_t groups, - int64_t in_zero_point, - int64_t weight_zero_point, - double bias_scale, - double output_scale, - int64_t output_zero_point, - __ET_UNUSED int64_t out_multiplier, - __ET_UNUSED int64_t out_shift, - Tensor& out) { - xa_opt_quantized_conv1d_nlc_asym8uxsym8u_asym8u( - ctx, - input, - weight, - bias, - stride, - padding, - in_zero_point, - weight_zero_point, - bias_scale, - output_scale, - output_zero_point, - out); -} - -} // namespace native -} // namespace HiFi -} // namespace impl diff --git a/backends/cadence/hifi/operators/targets.bzl b/backends/cadence/hifi/operators/targets.bzl index 1e6c7f26d42..9753051bf72 100644 --- 
a/backends/cadence/hifi/operators/targets.bzl +++ b/backends/cadence/hifi/operators/targets.bzl @@ -365,8 +365,8 @@ def define_common_targets(): ) runtime.cxx_library( - name = "op_quantized_conv1d_ncl_asym8sxsym8s_asym8s_per_tensor_out", - srcs = ["op_quantized_conv1d_ncl_asym8sxsym8s_asym8s_per_tensor_out.cpp"], + name = "op_quantized_conv1d_ncl", + srcs = ["op_quantized_conv1d_ncl.cpp"], exported_headers = ["operators.h"], platforms = CXX, deps = COMMON_DEPS, @@ -375,28 +375,8 @@ def define_common_targets(): ) runtime.cxx_library( - name = "op_quantized_conv1d_ncl_asym8uxsym8u_asym8u_per_tensor_out", - srcs = ["op_quantized_conv1d_ncl_asym8uxsym8u_asym8u_per_tensor_out.cpp"], - exported_headers = ["operators.h"], - platforms = CXX, - deps = COMMON_DEPS, - visibility = ["PUBLIC"], - compatible_with = ["ovr_config//cpu:xtensa"], - ) - - runtime.cxx_library( - name = "op_quantized_conv1d_nlc_asym8sxsym8s_asym8s_per_tensor_out", - srcs = ["op_quantized_conv1d_nlc_asym8sxsym8s_asym8s_per_tensor_out.cpp"], - exported_headers = ["operators.h"], - platforms = CXX, - deps = COMMON_DEPS, - visibility = ["PUBLIC"], - compatible_with = ["ovr_config//cpu:xtensa"], - ) - - runtime.cxx_library( - name = "op_quantized_conv1d_nlc_asym8uxsym8u_asym8u_per_tensor_out", - srcs = ["op_quantized_conv1d_nlc_asym8uxsym8u_asym8u_per_tensor_out.cpp"], + name = "op_quantized_conv1d_nlc", + srcs = ["op_quantized_conv1d_nlc.cpp"], exported_headers = ["operators.h"], platforms = CXX, deps = COMMON_DEPS,