diff --git a/backends/cadence/aot/BUCK b/backends/cadence/aot/BUCK index a17ef6cb49a..c85dc23c4bd 100644 --- a/backends/cadence/aot/BUCK +++ b/backends/cadence/aot/BUCK @@ -154,7 +154,8 @@ fbcode_target(_kind = executorch_generated_lib, "//executorch/backends/cadence/generic/operators:op_quantize_per_tensor", "//executorch/backends/cadence/generic/operators:op_quantized_add", "//executorch/backends/cadence/generic/operators:op_quantized_conv2d", - "//executorch/backends/cadence/generic/operators:op_quantized_conv1d", + "//executorch/backends/cadence/generic/operators:op_quantized_conv1d_ncl", + "//executorch/backends/cadence/generic/operators:op_quantized_conv1d_nlc", "//executorch/backends/cadence/generic/operators:op_quantized_fully_connected", "//executorch/backends/cadence/generic/operators:op_quantized_layer_norm", "//executorch/backends/cadence/generic/operators:op_quantized_linear", diff --git a/backends/cadence/aot/functions.yaml b/backends/cadence/aot/functions.yaml index 3ba6f4700b1..528ceadaf19 100644 --- a/backends/cadence/aot/functions.yaml +++ b/backends/cadence/aot/functions.yaml @@ -359,6 +359,16 @@ - arg_meta: null kernel_name: impl::generic::quantized_conv2d_nhwc_per_tensor_out +- func: cadence::quantized_conv1d_ncl.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) + kernels: + - arg_meta: null + kernel_name: impl::generic::quantized_conv1d_ncl_per_tensor_out + +- func: cadence::quantized_conv1d_nlc.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) 
+ kernels: + - arg_meta: null + kernel_name: impl::generic::quantized_conv1d_nlc_per_tensor_out + - func: cadence::quantized_conv2d_nchw_asym8sxsym8s_asym8s.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) kernels: - arg_meta: null @@ -419,26 +429,6 @@ - arg_meta: null kernel_name: impl::generic::quantized_conv2d_nhwc_depthwise_asym8uxsym8u_asym8u_per_tensor_out -- func: cadence::quantized_conv1d_ncl_asym8sxsym8s_asym8s.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) - kernels: - - arg_meta: null - kernel_name: impl::generic::quantized_conv1d_ncl_asym8sxsym8s_asym8s_per_tensor_out - -- func: cadence::quantized_conv1d_ncl_asym8uxsym8u_asym8u.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) - kernels: - - arg_meta: null - kernel_name: impl::generic::quantized_conv1d_ncl_asym8uxsym8u_asym8u_per_tensor_out - -- func: cadence::quantized_conv1d_nlc_asym8sxsym8s_asym8s.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) 
- kernels: - - arg_meta: null - kernel_name: impl::generic::quantized_conv1d_nlc_asym8sxsym8s_asym8s_per_tensor_out - -- func: cadence::quantized_conv1d_nlc_asym8uxsym8u_asym8u.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) - kernels: - - arg_meta: null - kernel_name: impl::generic::quantized_conv1d_nlc_asym8uxsym8u_asym8u_per_tensor_out - - func: cadence::quantized_fully_connected.out(Tensor src, Tensor weight, Tensor bias, int src_zero_point, Tensor weight_zero_point, Tensor out_multiplier, Tensor out_shift, int out_zero_point, Tensor? offset, *, Tensor(a!) out) -> Tensor(a!) kernels: - arg_meta: null diff --git a/backends/cadence/aot/functions_hifi.yaml b/backends/cadence/aot/functions_hifi.yaml index f713d0a3227..e7ae2fcaeeb 100644 --- a/backends/cadence/aot/functions_hifi.yaml +++ b/backends/cadence/aot/functions_hifi.yaml @@ -445,26 +445,6 @@ - arg_meta: null kernel_name: impl::HiFi::quantized_conv2d_nhwc_depthwise_asym8uxsym8u_asym8u_per_tensor_out -- func: cadence::quantized_conv1d_ncl_asym8sxsym8s_asym8s.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) - kernels: - - arg_meta: null - kernel_name: impl::HiFi::quantized_conv1d_ncl_asym8sxsym8s_asym8s_per_tensor_out - -- func: cadence::quantized_conv1d_ncl_asym8uxsym8u_asym8u.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) 
out) -> Tensor(a!) - kernels: - - arg_meta: null - kernel_name: impl::HiFi::quantized_conv1d_ncl_asym8uxsym8u_asym8u_per_tensor_out - -- func: cadence::quantized_conv1d_nlc_asym8sxsym8s_asym8s.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) - kernels: - - arg_meta: null - kernel_name: impl::HiFi::quantized_conv1d_nlc_asym8sxsym8s_asym8s_per_tensor_out - -- func: cadence::quantized_conv1d_nlc_asym8uxsym8u_asym8u.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) - kernels: - - arg_meta: null - kernel_name: impl::HiFi::quantized_conv1d_nlc_asym8uxsym8u_asym8u_per_tensor_out - - func: cadence::quantized_layer_norm.out(Tensor input, Tensor in_scale, Tensor in_zero_point, int[] normalized_shape, Tensor weight, Tensor bias, float eps, float output_scale, int output_zero_point, *, Tensor(a!) out) -> Tensor(a!) kernels: - arg_meta: null @@ -563,3 +543,13 @@ kernels: - arg_meta: null kernel_name: impl::HiFi::quantized_fully_connected_asym8uxasym8u_asym8u_per_tensor_out + +- func: cadence::quantized_conv1d_ncl.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) 
+ kernels: + - arg_meta: null + kernel_name: impl::HiFi::native::quantized_conv1d_ncl_per_tensor_out + +- func: cadence::quantized_conv1d_nlc.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) + kernels: + - arg_meta: null + kernel_name: impl::HiFi::native::quantized_conv1d_nlc_per_tensor_out diff --git a/backends/cadence/aot/ops_registrations.py b/backends/cadence/aot/ops_registrations.py index d805426737b..cbc179e05d2 100644 --- a/backends/cadence/aot/ops_registrations.py +++ b/backends/cadence/aot/ops_registrations.py @@ -225,6 +225,30 @@ def register_fake( lib.define( "quantized_conv2d_nhwc.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!)" ) +lib.define( + "quantized_conv1d_ncl(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, Tensor weight_zero_point, Tensor bias_scale, float out_scale, int out_zero_point, Tensor out_multiplier, Tensor out_shift) -> (Tensor Z)" +) +lib.define( + "quantized_conv1d_ncl.out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, Tensor weight_zero_point, Tensor bias_scale, float out_scale, int out_zero_point, Tensor out_multiplier, Tensor out_shift, *, Tensor(a!) 
out) -> Tensor(a!)" +) +lib.define( + "quantized_conv1d_ncl.per_tensor(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift) -> (Tensor Z)" +) +lib.define( + "quantized_conv1d_ncl.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!)" +) +lib.define( + "quantized_conv1d_nlc(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, Tensor weight_zero_point, Tensor bias_scale, float out_scale, int out_zero_point, Tensor out_multiplier, Tensor out_shift) -> (Tensor Z)" +) +lib.define( + "quantized_conv1d_nlc.out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, Tensor weight_zero_point, Tensor bias_scale, float out_scale, int out_zero_point, Tensor out_multiplier, Tensor out_shift, *, Tensor(a!) out) -> Tensor(a!)" +) +lib.define( + "quantized_conv1d_nlc.per_tensor(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift) -> (Tensor Z)" +) +lib.define( + "quantized_conv1d_nlc.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) 
out) -> Tensor(a!)" +) lib.define( "quantized_conv2d_nchw(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, Tensor weight_zero_point, Tensor bias_scale, float out_scale, int out_zero_point, Tensor out_multiplier, Tensor out_shift) -> (Tensor Z)" ) @@ -297,30 +321,6 @@ def register_fake( lib.define( "quantized_conv2d_nhwc_dilated_asym8uxsym8u_asym8u.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!)" ) -lib.define( - "quantized_conv1d_ncl_asym8sxsym8s_asym8s.per_tensor(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift) -> (Tensor Z)" -) -lib.define( - "quantized_conv1d_ncl_asym8sxsym8s_asym8s.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) 
out) -> Tensor(a!)" -) -lib.define( - "quantized_conv1d_ncl_asym8uxsym8u_asym8u.per_tensor(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift) -> (Tensor Z)" -) -lib.define( - "quantized_conv1d_ncl_asym8uxsym8u_asym8u.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!)" -) -lib.define( - "quantized_conv1d_nlc_asym8sxsym8s_asym8s.per_tensor(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift) -> (Tensor Z)" -) -lib.define( - "quantized_conv1d_nlc_asym8sxsym8s_asym8s.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) 
out) -> Tensor(a!)" -) -lib.define( - "quantized_conv1d_nlc_asym8uxsym8u_asym8u.per_tensor(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift) -> (Tensor Z)" -) -lib.define( - "quantized_conv1d_nlc_asym8uxsym8u_asym8u.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!)" -) lib.define( "quantized_conv2d_nchw_depthwise_asym8sxsym8s_asym8s.per_tensor(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift) -> (Tensor Z)" ) @@ -1071,6 +1071,154 @@ def quantized_conv2d_nhwc_meta( return input.new_empty(output_size, dtype=input.dtype) +@register_fake("cadence::quantized_conv1d_ncl") +def quantized_conv1d_ncl_meta( + input: torch.Tensor, + weight: torch.Tensor, + bias: torch.Tensor, + stride: Tuple[int], + padding: Tuple[int], + dilation: Tuple[int], + groups: int, + in_zero_point: int, + weight_zero_point: torch.Tensor, + bias_scale: torch.Tensor, + output_scale: float, + output_zero_point: int, + out_multiplier: torch.Tensor, + out_shift: torch.Tensor, +) -> torch.Tensor: + # NCL format: input is [N, C, L], weight is [OC, IC/groups, K] + out_channels, _, kernel_size = weight.shape + + in_size = input.shape + assert len(in_size) == 3 + + # Compute the output tensor size for 1D conv (NCL format, channel_last=False) + output_size = get_conv1d_output_size( + in_size, + out_channels, + stride[-1], + padding[-1], + dilation[-1], + kernel_size, + False, + ) + + return 
input.new_empty(output_size, dtype=input.dtype) + + +@register_fake("cadence::quantized_conv1d_ncl.per_tensor") +def quantized_conv1d_ncl_per_tensor_meta( + input: torch.Tensor, + weight: torch.Tensor, + bias: torch.Tensor, + stride: Tuple[int], + padding: Tuple[int], + dilation: Tuple[int], + groups: int, + in_zero_point: int, + weight_zero_point: int, + bias_scale: float, + output_scale: float, + output_zero_point: int, + out_multiplier: int, + out_shift: int, +) -> torch.Tensor: + # NCL format: input is [N, C, L], weight is [OC, IC/groups, K] + out_channels, _, kernel_size = weight.shape + + in_size = input.shape + assert len(in_size) == 3 + + # Compute the output tensor size for 1D conv (NCL format, channel_last=False) + output_size = get_conv1d_output_size( + in_size, + out_channels, + stride[-1], + padding[-1], + dilation[-1], + kernel_size, + False, + ) + + return input.new_empty(output_size, dtype=input.dtype) + + +@register_fake("cadence::quantized_conv1d_nlc") +def quantized_conv1d_nlc_meta( + input: torch.Tensor, + weight: torch.Tensor, + bias: torch.Tensor, + stride: Tuple[int], + padding: Tuple[int], + dilation: Tuple[int], + groups: int, + in_zero_point: int, + weight_zero_point: torch.Tensor, + bias_scale: torch.Tensor, + output_scale: float, + output_zero_point: int, + out_multiplier: torch.Tensor, + out_shift: torch.Tensor, +) -> torch.Tensor: + # NLC format: input is [N, L, C], weight is [OC, K, IC/groups] + out_channels, kernel_size, _ = weight.shape + + in_size = input.shape + assert len(in_size) == 3 + + # Compute the output tensor size for 1D conv (NLC format, channel_last=True) + output_size = get_conv1d_output_size( + in_size, + out_channels, + stride[-1], + padding[-1], + dilation[-1], + kernel_size, + True, + ) + + return input.new_empty(output_size, dtype=input.dtype) + + +@register_fake("cadence::quantized_conv1d_nlc.per_tensor") +def quantized_conv1d_nlc_per_tensor_meta( + input: torch.Tensor, + weight: torch.Tensor, + bias: 
torch.Tensor, + stride: Tuple[int], + padding: Tuple[int], + dilation: Tuple[int], + groups: int, + in_zero_point: int, + weight_zero_point: int, + bias_scale: float, + output_scale: float, + output_zero_point: int, + out_multiplier: int, + out_shift: int, +) -> torch.Tensor: + # NLC format: input is [N, L, C], weight is [OC, K, IC/groups] + out_channels, kernel_size, _ = weight.shape + + in_size = input.shape + assert len(in_size) == 3 + + # Compute the output tensor size for 1D conv (NLC format, channel_last=True) + output_size = get_conv1d_output_size( + in_size, + out_channels, + stride[-1], + padding[-1], + dilation[-1], + kernel_size, + True, + ) + + return input.new_empty(output_size, dtype=input.dtype) + + @register_fake("cadence::quantized_conv2d_nchw") def quantized_conv2d_nchw_meta( input: torch.Tensor, @@ -2674,150 +2822,6 @@ def roi_align_box_processor_meta( return rois.new_empty((rois.shape[0], 80), dtype=torch.uint8) -@register_fake("cadence::quantized_conv1d_ncl_asym8sxsym8s_asym8s.per_tensor") -def quantized_conv1d_ncl_asym8sxsym8s_asym8s_per_tensor_meta( - input: torch.Tensor, - weight: torch.Tensor, - bias: torch.Tensor, - stride: Tuple[int], - padding: Tuple[int], - dilation: Tuple[int], - groups: int, - in_zero_point: int, - weight_zero_point: int, - bias_scale: float, - output_scale: float, - output_zero_point: int, - out_multiplier: int, - out_shift: int, -) -> torch.Tensor: - assert input.dim() == 3 and weight.dim() == 3 - assert ( - input.dtype == torch.int8 - and weight.dtype == torch.int8 - and bias.dtype == torch.int32 - ) - out_channels, _, kernel_size = weight.shape - output_size = get_conv1d_output_size( - input.shape, - out_channels, - stride[1], - padding[1], - dilation[1], - kernel_size, - False, - ) - return input.new_empty(output_size, dtype=input.dtype) - - -@register_fake("cadence::quantized_conv1d_ncl_asym8uxsym8u_asym8u.per_tensor") -def quantized_conv1d_ncl_asym8uxsym8u_asym8u_per_tensor_meta( - input: torch.Tensor, - 
weight: torch.Tensor, - bias: torch.Tensor, - stride: Tuple[int], - padding: Tuple[int], - dilation: Tuple[int], - groups: int, - in_zero_point: int, - weight_zero_point: int, - bias_scale: float, - output_scale: float, - output_zero_point: int, - out_multiplier: int, - out_shift: int, -) -> torch.Tensor: - assert input.dim() == 3 and weight.dim() == 3 - assert ( - input.dtype == torch.uint8 - and weight.dtype == torch.uint8 - and bias.dtype == torch.int32 - ) - out_channels, _, kernel_size = weight.shape - output_size = get_conv1d_output_size( - input.shape, - out_channels, - stride[1], - padding[1], - dilation[1], - kernel_size, - False, - ) - return input.new_empty(output_size, dtype=input.dtype) - - -@register_fake("cadence::quantized_conv1d_nlc_asym8sxsym8s_asym8s.per_tensor") -def quantized_conv1d_nlc_asym8sxsym8s_asym8s_per_tensor_meta( - input: torch.Tensor, - weight: torch.Tensor, - bias: torch.Tensor, - stride: Tuple[int], - padding: Tuple[int], - dilation: Tuple[int], - groups: int, - in_zero_point: int, - weight_zero_point: int, - bias_scale: float, - output_scale: float, - output_zero_point: int, - out_multiplier: int, - out_shift: int, -) -> torch.Tensor: - assert input.dim() == 3 and weight.dim() == 3 - assert ( - input.dtype == torch.int8 - and weight.dtype == torch.int8 - and bias.dtype == torch.int32 - ) - out_channels, kernel_size, _ = weight.shape - output_size = get_conv1d_output_size( - input.shape, - out_channels, - stride[1], - padding[1], - dilation[1], - kernel_size, - True, - ) - return input.new_empty(output_size, dtype=input.dtype) - - -@register_fake("cadence::quantized_conv1d_nlc_asym8uxsym8u_asym8u.per_tensor") -def quantized_conv1d_nlc_asym8uxsym8u_asym8u_per_tensor_meta( - input: torch.Tensor, - weight: torch.Tensor, - bias: torch.Tensor, - stride: Tuple[int], - padding: Tuple[int], - dilation: Tuple[int], - groups: int, - in_zero_point: int, - weight_zero_point: int, - bias_scale: float, - output_scale: float, - output_zero_point: 
int, - out_multiplier: int, - out_shift: int, -) -> torch.Tensor: - assert input.dim() == 3 and weight.dim() == 3 - assert ( - input.dtype == torch.uint8 - and weight.dtype == torch.uint8 - and bias.dtype == torch.int32 - ) - out_channels, kernel_size, _ = weight.shape - output_size = get_conv1d_output_size( - input.shape, - out_channels, - stride[1], - padding[1], - dilation[1], - kernel_size, - True, - ) - return input.new_empty(output_size, dtype=input.dtype) - - @register_fake("cadence::_softmax_f32_f32") def softmax_f32_f32_meta( input_tensor: torch.Tensor, diff --git a/backends/cadence/aot/ref_implementations.py b/backends/cadence/aot/ref_implementations.py index a37233fd492..44cae6e55ea 100644 --- a/backends/cadence/aot/ref_implementations.py +++ b/backends/cadence/aot/ref_implementations.py @@ -766,9 +766,9 @@ def quantized_conv_per_tensor( input_tensor: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor, - stride: tuple[int, int], - padding: tuple[int, int], - dilation: tuple[int, int], + stride: tuple[int, ...], + padding: tuple[int, ...], + dilation: tuple[int, ...], groups: int, in_zero_point: int, weight_zero_point: int, @@ -887,6 +887,194 @@ def quantized_conv2d_nchw_per_tensor( ) +@impl_tracked(m, "quantized_conv1d_ncl.per_tensor") +def quantized_conv1d_ncl_per_tensor( + input_tensor: torch.Tensor, + weight: torch.Tensor, + bias: torch.Tensor, + stride: tuple[int], + padding: tuple[int], + dilation: tuple[int], + groups: int, + in_zero_point: int, + weight_zero_point: int, + bias_scale: float, + output_scale: float, + output_zero_point: int, + out_multiplier: int, + out_shift: int, +) -> torch.Tensor: + """ + Quantized 1D convolution operation in NCL (channels-first) format. 
+ + Args: + - input_tensor (Tensor): The activations tensor in [N, C, L] format + - weight (Tensor): The weight tensor in [OC, IC/groups, K] format + - bias (Tensor): The bias tensor + - stride (Tuple[int]): The stride of the convolution + - padding (Tuple[int]): The padding of the convolution + - dilation (Tuple[int]): The dilation of the convolution + - groups (int): The number of groups + - in_zero_point (int): The quantized mapping of zero for the input + - weight_zero_point (int): The quantized mapping of zero for the weight + - bias_scale (float): The quantized bias scale + - output_scale (float): The scale of the output + - output_zero_point (int): The zero point of the output + - out_multiplier (int): Unused + - out_shift (int): Unused + """ + if not input_tensor.is_contiguous(memory_format=torch.contiguous_format): + raise ValueError("Input tensor must be in NCL format") + return quantized_conv_per_tensor( + input_tensor, + weight, + bias, + stride, + padding, + dilation, + groups, + in_zero_point, + weight_zero_point, + bias_scale, + output_scale, + output_zero_point, + out_multiplier, + out_shift, + ) + + +@impl_tracked(m, "quantized_conv1d_ncl") +def quantized_conv1d_ncl( + input_tensor: torch.Tensor, + weight: torch.Tensor, + bias: torch.Tensor, + stride: tuple[int], + padding: tuple[int], + dilation: tuple[int], + groups: int, + in_zero_point: int, + weight_zero_point: torch.Tensor, + bias_scale: torch.Tensor, + output_scale: float, + output_zero_point: int, + out_multiplier: torch.Tensor, + out_shift: torch.Tensor, +) -> torch.Tensor: + return quantized_conv1d_ncl_per_tensor( + input_tensor, + weight, + bias, + stride, + padding, + dilation, + groups, + in_zero_point, + int(weight_zero_point.item()), + float(bias_scale.item()), + output_scale, + output_zero_point, + int(out_multiplier.item()), + int(out_shift.item()), + ) + + +@impl_tracked(m, "quantized_conv1d_nlc.per_tensor") +def quantized_conv1d_nlc_per_tensor( + input_tensor: torch.Tensor, + 
weight: torch.Tensor, + bias: torch.Tensor, + stride: tuple[int], + padding: tuple[int], + dilation: tuple[int], + groups: int, + in_zero_point: int, + weight_zero_point: int, + bias_scale: float, + output_scale: float, + output_zero_point: int, + out_multiplier: int, + out_shift: int, +) -> torch.Tensor: + """ + Quantized 1D convolution operation in NLC (channels-last) format. + + Args: + - input_tensor (Tensor): The activations tensor in [N, L, C] format + - weight (Tensor): The weight tensor in [OC, K, IC/groups] format + - bias (Tensor): The bias tensor + - stride (Tuple[int]): The stride of the convolution + - padding (Tuple[int]): The padding of the convolution + - dilation (Tuple[int]): The dilation of the convolution + - groups (int): The number of groups + - in_zero_point (int): The quantized mapping of zero for the input + - weight_zero_point (int): The quantized mapping of zero for the weight + - bias_scale (float): The quantized bias scale + - output_scale (float): The scale of the output + - output_zero_point (int): The zero point of the output + - out_multiplier (int): Unused + - out_shift (int): Unused + """ + # Convert NLC to NCL for processing + input_ncl = input_tensor.permute(0, 2, 1).contiguous() + # Convert weight from [OC, K, IC/groups] to [OC, IC/groups, K] + weight_ncl = weight.permute(0, 2, 1).contiguous() + + result_ncl = quantized_conv_per_tensor( + input_ncl, + weight_ncl, + bias, + stride, + padding, + dilation, + groups, + in_zero_point, + weight_zero_point, + bias_scale, + output_scale, + output_zero_point, + out_multiplier, + out_shift, + ) + + # Convert result back to NLC format + return result_ncl.permute(0, 2, 1).contiguous() + + +@impl_tracked(m, "quantized_conv1d_nlc") +def quantized_conv1d_nlc( + input_tensor: torch.Tensor, + weight: torch.Tensor, + bias: torch.Tensor, + stride: tuple[int], + padding: tuple[int], + dilation: tuple[int], + groups: int, + in_zero_point: int, + weight_zero_point: torch.Tensor, + bias_scale: 
torch.Tensor, + output_scale: float, + output_zero_point: int, + out_multiplier: torch.Tensor, + out_shift: torch.Tensor, +) -> torch.Tensor: + return quantized_conv1d_nlc_per_tensor( + input_tensor, + weight, + bias, + stride, + padding, + dilation, + groups, + in_zero_point, + int(weight_zero_point.item()), + float(bias_scale.item()), + output_scale, + output_zero_point, + int(out_multiplier.item()), + int(out_shift.item()), + ) + + @impl_tracked(m, "quantized_conv2d_nchw") def quantized_conv2d_nchw( input_tensor: torch.Tensor, @@ -1343,26 +1531,6 @@ def quantized_conv2d_nhwc_depthwise_asym8uxsym8u_asym8u_per_tensor() -> ( ): ... -@impl_tracked(m, "quantized_conv1d_ncl_asym8sxsym8s_asym8s.per_tensor") -@quantized_conv_variant("nchw", torch.int8, torch.int8, is_1d=True) -def quantized_conv1d_ncl_asym8sxsym8s_asym8s_per_tensor() -> torch.Tensor: ... - - -@impl_tracked(m, "quantized_conv1d_ncl_asym8uxsym8u_asym8u.per_tensor") -@quantized_conv_variant("nchw", torch.uint8, torch.uint8, is_1d=True) -def quantized_conv1d_ncl_asym8uxsym8u_asym8u_per_tensor() -> torch.Tensor: ... - - -@impl_tracked(m, "quantized_conv1d_nlc_asym8sxsym8s_asym8s.per_tensor") -@quantized_conv_variant("nhwc", torch.int8, torch.int8, is_1d=True) -def quantized_conv1d_nlc_asym8sxsym8s_asym8s_per_tensor() -> torch.Tensor: ... - - -@impl_tracked(m, "quantized_conv1d_nlc_asym8uxsym8u_asym8u.per_tensor") -@quantized_conv_variant("nhwc", torch.uint8, torch.uint8, is_1d=True) -def quantized_conv1d_nlc_asym8uxsym8u_asym8u_per_tensor() -> torch.Tensor: ... 
- - @impl_tracked(m, "conv1d") def conv1d( input_tensor: torch.Tensor, diff --git a/backends/cadence/aot/tests/test_ref_implementations.py b/backends/cadence/aot/tests/test_ref_implementations.py index bf9e4d39250..936aad4e585 100644 --- a/backends/cadence/aot/tests/test_ref_implementations.py +++ b/backends/cadence/aot/tests/test_ref_implementations.py @@ -3237,3 +3237,275 @@ def test_slice_scatter_with_none_start_end(self) -> None: torch.equal(self_tensor, expected), f"Values don't match: got {self_tensor}, expected {expected}", ) + + def test_quantized_conv1d_ncl_per_tensor_basic(self) -> None: + """Test quantized_conv1d_ncl.per_tensor with basic NCL format input.""" + # NCL format: input is [N, C, L] + # Create simple 1D convolution input + batch_size = 1 + length = 4 + out_channels = 1 + kernel_size = 2 + + input_tensor = torch.tensor([[[1, 2, 3, 4], [5, 6, 7, 8]]], dtype=torch.int8) + # Weight shape: [OC, IC/groups, K] + weight = torch.tensor([[[1, 1], [1, 1]]], dtype=torch.int8) + bias = torch.tensor([0], dtype=torch.int32) + + stride = (1,) + padding = (0,) + dilation = (1,) + groups = 1 + in_zero_point = 0 + weight_zero_point = 0 + bias_scale = 1.0 + output_scale = 1.0 + output_zero_point = 0 + out_multiplier = 0 + out_shift = 0 + + output = torch.ops.cadence.quantized_conv1d_ncl.per_tensor( + input_tensor, + weight, + bias, + stride, + padding, + dilation, + groups, + in_zero_point, + weight_zero_point, + bias_scale, + output_scale, + output_zero_point, + out_multiplier, + out_shift, + ) + + # Verify output shape: [N, OC, OL] where OL = (L + 2*padding - kernel) / stride + 1 + expected_length = (length + 2 * padding[0] - kernel_size) // stride[0] + 1 + self.assertEqual(output.shape, (batch_size, out_channels, expected_length)) + self.assertEqual(output.dtype, torch.int8) + + def test_quantized_conv1d_ncl_default_variant(self) -> None: + """Test quantized_conv1d_ncl (default variant with tensor params).""" + input_tensor = torch.tensor([[[1, 2, 3, 4]]], 
dtype=torch.int8) + weight = torch.tensor([[[1, 1]]], dtype=torch.int8) + bias = torch.tensor([0], dtype=torch.int32) + + stride = (1,) + padding = (0,) + dilation = (1,) + groups = 1 + in_zero_point = 0 + weight_zero_point = torch.tensor([0], dtype=torch.int32) + bias_scale = torch.tensor([1.0], dtype=torch.float32) + output_scale = 1.0 + output_zero_point = 0 + out_multiplier = torch.tensor([1073741824], dtype=torch.int32) + out_shift = torch.tensor([0], dtype=torch.int32) + + output = torch.ops.cadence.quantized_conv1d_ncl( + input_tensor, + weight, + bias, + stride, + padding, + dilation, + groups, + in_zero_point, + weight_zero_point, + bias_scale, + output_scale, + output_zero_point, + out_multiplier, + out_shift, + ) + + self.assertEqual(output.shape, (1, 1, 3)) + self.assertEqual(output.dtype, torch.int8) + + def test_quantized_conv1d_nlc_per_tensor_basic(self) -> None: + """Test quantized_conv1d_nlc.per_tensor with basic NLC format input.""" + # NLC format: input is [N, L, C] + batch_size = 1 + length = 4 + out_channels = 1 + kernel_size = 2 + + # Input in NLC format + input_tensor = torch.tensor( + [[[1, 5], [2, 6], [3, 7], [4, 8]]], dtype=torch.int8 + ) + # Weight shape: [OC, K, IC/groups] + weight = torch.tensor([[[1, 1], [1, 1]]], dtype=torch.int8) + bias = torch.tensor([0], dtype=torch.int32) + + stride = (1,) + padding = (0,) + dilation = (1,) + groups = 1 + in_zero_point = 0 + weight_zero_point = 0 + bias_scale = 1.0 + output_scale = 1.0 + output_zero_point = 0 + out_multiplier = 0 + out_shift = 0 + + output = torch.ops.cadence.quantized_conv1d_nlc.per_tensor( + input_tensor, + weight, + bias, + stride, + padding, + dilation, + groups, + in_zero_point, + weight_zero_point, + bias_scale, + output_scale, + output_zero_point, + out_multiplier, + out_shift, + ) + + # Verify output shape: [N, OL, OC] + expected_length = (length + 2 * padding[0] - kernel_size) // stride[0] + 1 + self.assertEqual(output.shape, (batch_size, expected_length, out_channels)) + 
self.assertEqual(output.dtype, torch.int8) + + def test_quantized_conv1d_nlc_default_variant(self) -> None: + """Test quantized_conv1d_nlc (default variant with tensor params).""" + # Input in NLC format: [N, L, C] + input_tensor = torch.tensor([[[1], [2], [3], [4]]], dtype=torch.int8) + # Weight shape: [OC, K, IC/groups] + weight = torch.tensor([[[1], [1]]], dtype=torch.int8) + bias = torch.tensor([0], dtype=torch.int32) + + stride = (1,) + padding = (0,) + dilation = (1,) + groups = 1 + in_zero_point = 0 + weight_zero_point = torch.tensor([0], dtype=torch.int32) + bias_scale = torch.tensor([1.0], dtype=torch.float32) + output_scale = 1.0 + output_zero_point = 0 + out_multiplier = torch.tensor([1073741824], dtype=torch.int32) + out_shift = torch.tensor([0], dtype=torch.int32) + + output = torch.ops.cadence.quantized_conv1d_nlc( + input_tensor, + weight, + bias, + stride, + padding, + dilation, + groups, + in_zero_point, + weight_zero_point, + bias_scale, + output_scale, + output_zero_point, + out_multiplier, + out_shift, + ) + + # Output should be [N, OL, OC] = [1, 3, 1] + self.assertEqual(output.shape, (1, 3, 1)) + self.assertEqual(output.dtype, torch.int8) + + def test_quantized_conv1d_ncl_with_groups(self) -> None: + """Test quantized_conv1d_ncl.per_tensor with groups > 1.""" + batch_size = 1 + in_channels = 4 + length = 4 + out_channels = 4 + kernel_size = 2 + groups = 2 + + input_tensor = torch.randint( + -5, 5, (batch_size, in_channels, length), dtype=torch.int8 + ) + # Weight shape: [OC, IC/groups, K] + weight = torch.randint( + -2, 2, (out_channels, in_channels // groups, kernel_size), dtype=torch.int8 + ) + bias = torch.zeros(out_channels, dtype=torch.int32) + + stride = (1,) + padding = (0,) + dilation = (1,) + in_zero_point = 0 + weight_zero_point = 0 + bias_scale = 1.0 + output_scale = 0.1 + output_zero_point = 0 + out_multiplier = 0 + out_shift = 0 + + output = torch.ops.cadence.quantized_conv1d_ncl.per_tensor( + input_tensor, + weight, + bias, + 
stride, + padding, + dilation, + groups, + in_zero_point, + weight_zero_point, + bias_scale, + output_scale, + output_zero_point, + out_multiplier, + out_shift, + ) + + expected_length = (length + 2 * padding[0] - kernel_size) // stride[0] + 1 + self.assertEqual(output.shape, (batch_size, out_channels, expected_length)) + self.assertEqual(output.dtype, torch.int8) + + def test_quantized_conv1d_nlc_with_padding(self) -> None: + """Test quantized_conv1d_nlc.per_tensor with padding.""" + batch_size = 1 + length = 3 + out_channels = 1 + + # Input in NLC format: [N, L, C] + input_tensor = torch.tensor([[[1], [2], [3]]], dtype=torch.int8) + # Weight shape: [OC, K, IC/groups] + weight = torch.tensor([[[1], [1], [1]]], dtype=torch.int8) + bias = torch.tensor([0], dtype=torch.int32) + + stride = (1,) + padding = (1,) # Add padding + dilation = (1,) + groups = 1 + in_zero_point = 0 + weight_zero_point = 0 + bias_scale = 1.0 + output_scale = 1.0 + output_zero_point = 0 + out_multiplier = 0 + out_shift = 0 + + output = torch.ops.cadence.quantized_conv1d_nlc.per_tensor( + input_tensor, + weight, + bias, + stride, + padding, + dilation, + groups, + in_zero_point, + weight_zero_point, + bias_scale, + output_scale, + output_zero_point, + out_multiplier, + out_shift, + ) + + # With padding=1, output length = (3 + 2*1 - 3) / 1 + 1 = 3 + self.assertEqual(output.shape, (batch_size, length, out_channels)) + self.assertEqual(output.dtype, torch.int8) diff --git a/backends/cadence/aot/tests/test_type_dispatch_passes.py b/backends/cadence/aot/tests/test_type_dispatch_passes.py index 870735aad1a..f0847e8ca77 100644 --- a/backends/cadence/aot/tests/test_type_dispatch_passes.py +++ b/backends/cadence/aot/tests/test_type_dispatch_passes.py @@ -307,63 +307,6 @@ def test_dispatch_quantized_conv_2d_dilated( # Should be replaced with dtype-specific variant self.assertEqual(count_node(gm, expected_op), 1) - @expand( - [ - ( - "int8_nchw_1d", - torch.int8, - (1, 3, 8), # x_shape - 
exir_ops.edge.cadence.quantized_conv2d_nchw.per_tensor, - exir_ops.edge.cadence.quantized_conv1d_ncl_asym8sxsym8s_asym8s.per_tensor, - ), - ( - "uint8_nchw_1d", - torch.uint8, - (1, 3, 8), # x_shape - exir_ops.edge.cadence.quantized_conv2d_nchw.per_tensor, - exir_ops.edge.cadence.quantized_conv1d_ncl_asym8uxsym8u_asym8u.per_tensor, - ), - ( - "int8_nhwc_1d", - torch.int8, - (1, 8, 3), # x_shape - exir_ops.edge.cadence.quantized_conv2d_nhwc.per_tensor, - exir_ops.edge.cadence.quantized_conv1d_nlc_asym8sxsym8s_asym8s.per_tensor, - ), - ( - "uint8_nhwc_1d", - torch.uint8, - (1, 8, 3), # x_shape - exir_ops.edge.cadence.quantized_conv2d_nhwc.per_tensor, - exir_ops.edge.cadence.quantized_conv1d_nlc_asym8uxsym8u_asym8u.per_tensor, - ), - ] - ) - def test_dispatch_quantized_conv_1d( - self, - _: str, - dtype: torch.dtype, - x_shape: tuple[int, ...], - original_op: torch._ops.OpOverload, - expected_op: torch._ops.OpOverload, - ) -> None: - """Test quantized_conv_1d (nchw/nhwc) dispatches to correct dtype-specific variant""" - min_val, max_val = torch.iinfo(dtype).min, torch.iinfo(dtype).max - x = torch.randint(min_val, max_val, x_shape, dtype=dtype) - w = torch.randint(min_val, max_val, (16, 3, 3), dtype=dtype) - b = torch.randint(-2147483648, 2147483647, (16,), dtype=torch.int32) - gm = single_op_builder( - placeholders=(x, w, b), - op=original_op, - args=(x, w, b, [1, 1], [0, 0], [1, 1], 1, 0, 0, 1.0, 1.0, 0, 1, 1), - ) - p = CompileTimeTypeDispatchPass() - gm = cast(PassResult, p(gm)).graph_module - # Original op should be replaced - self.assertEqual(count_node(gm, original_op), 0) - # Should be replaced with dtype-specific variant - self.assertEqual(count_node(gm, expected_op), 1) - @expand( [ ( diff --git a/backends/cadence/aot/type_dispatch.py b/backends/cadence/aot/type_dispatch.py index 69fd721e4e3..b809cc90978 100644 --- a/backends/cadence/aot/type_dispatch.py +++ b/backends/cadence/aot/type_dispatch.py @@ -171,18 +171,11 @@ def call_operator( is_depthwise = 
is_depthwise_conv(groups, input_channels) # pyre-ignore[16]: None has no attribute '__iter__'. is_dilated = any(d > 1 for d in args[5]) - is_1d = len(args[0].to_tensor().shape) == 3 if is_depthwise: typed_op_name = f"{base_name}_depthwise_{type_suffix}" elif is_dilated: typed_op_name = f"{base_name}_dilated_{type_suffix}" - elif is_1d and groups == 1: - if "nchw" in base_name: - layout_suffix = "ncl" - else: - layout_suffix = "nlc" - typed_op_name = f"quantized_conv1d_{layout_suffix}_{type_suffix}" typed_op = getattr( getattr(exir_ops.edge.cadence, typed_op_name), config.variant diff --git a/backends/cadence/generic/operators/op_quantized_conv1d.cpp b/backends/cadence/generic/operators/op_quantized_conv1d.cpp index 6ae3a6613fb..5beafd9e1fe 100644 --- a/backends/cadence/generic/operators/op_quantized_conv1d.cpp +++ b/backends/cadence/generic/operators/op_quantized_conv1d.cpp @@ -6,8 +6,6 @@ * LICENSE file in the root directory of this source tree. */ -#include - #include #include #include @@ -373,142 +371,6 @@ void quantized_conv1d_nlc( } // namespace -Tensor& quantized_conv1d_ncl_asym8sxsym8s_asym8s_per_tensor_out( - ET_UNUSED KernelRuntimeContext& ctx, - const Tensor& input, - const Tensor& weight, - const Tensor& bias, - IntArrayRef stride, - IntArrayRef padding, - IntArrayRef dilation, - int64_t groups, - int64_t in_zero_point, - int64_t weight_zero_point, - double bias_scale, - double output_scale, - int64_t output_zero_point, - ET_UNUSED int64_t out_multiplier, - ET_UNUSED int64_t out_shift, - Tensor& out) { - quantized_conv1d_ncl( - input, - weight, - bias, - stride, - padding, - dilation, - static_cast(groups), - static_cast(in_zero_point), - static_cast(weight_zero_point), - static_cast(bias_scale), - static_cast(output_scale), - static_cast(output_zero_point), - out); - return out; -} - -Tensor& quantized_conv1d_ncl_asym8uxsym8u_asym8u_per_tensor_out( - ET_UNUSED KernelRuntimeContext& ctx, - const Tensor& input, - const Tensor& weight, - const Tensor& 
bias, - IntArrayRef stride, - IntArrayRef padding, - IntArrayRef dilation, - int64_t groups, - int64_t in_zero_point, - int64_t weight_zero_point, - double bias_scale, - double output_scale, - int64_t output_zero_point, - ET_UNUSED int64_t out_multiplier, - ET_UNUSED int64_t out_shift, - Tensor& out) { - quantized_conv1d_ncl( - input, - weight, - bias, - stride, - padding, - dilation, - static_cast(groups), - static_cast(in_zero_point), - static_cast(weight_zero_point), - static_cast(bias_scale), - static_cast(output_scale), - static_cast(output_zero_point), - out); - return out; -} - -Tensor& quantized_conv1d_nlc_asym8sxsym8s_asym8s_per_tensor_out( - ET_UNUSED KernelRuntimeContext& ctx, - const Tensor& input, - const Tensor& weight, - const Tensor& bias, - IntArrayRef stride, - IntArrayRef padding, - IntArrayRef dilation, - int64_t groups, - int64_t in_zero_point, - int64_t weight_zero_point, - double bias_scale, - double output_scale, - int64_t output_zero_point, - ET_UNUSED int64_t out_multiplier, - ET_UNUSED int64_t out_shift, - Tensor& out) { - quantized_conv1d_nlc( - input, - weight, - bias, - stride, - padding, - dilation, - static_cast(groups), - static_cast(in_zero_point), - static_cast(weight_zero_point), - static_cast(bias_scale), - static_cast(output_scale), - static_cast(output_zero_point), - out); - return out; -} - -Tensor& quantized_conv1d_nlc_asym8uxsym8u_asym8u_per_tensor_out( - ET_UNUSED KernelRuntimeContext& ctx, - const Tensor& input, - const Tensor& weight, - const Tensor& bias, - IntArrayRef stride, - IntArrayRef padding, - IntArrayRef dilation, - int64_t groups, - int64_t in_zero_point, - int64_t weight_zero_point, - double bias_scale, - double output_scale, - int64_t output_zero_point, - ET_UNUSED int64_t out_multiplier, - ET_UNUSED int64_t out_shift, - Tensor& out) { - quantized_conv1d_nlc( - input, - weight, - bias, - stride, - padding, - dilation, - static_cast(groups), - static_cast(in_zero_point), - static_cast(weight_zero_point), - 
static_cast(bias_scale), - static_cast(output_scale), - static_cast(output_zero_point), - out); - return out; -} - } // namespace native } // namespace generic } // namespace impl diff --git a/backends/cadence/generic/operators/op_quantized_conv1d.h b/backends/cadence/generic/operators/op_quantized_conv1d.h deleted file mode 100644 index 5cb79ab09fa..00000000000 --- a/backends/cadence/generic/operators/op_quantized_conv1d.h +++ /dev/null @@ -1,96 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -#include -#include - -namespace impl { -namespace generic { -namespace native { - -executorch::aten::Tensor& -quantized_conv1d_ncl_asym8sxsym8s_asym8s_per_tensor_out( - executorch::runtime::KernelRuntimeContext& ctx, - const executorch::aten::Tensor& input, - const executorch::aten::Tensor& weight, - const executorch::aten::Tensor& bias, - executorch::aten::IntArrayRef stride, - executorch::aten::IntArrayRef padding, - executorch::aten::IntArrayRef dilation, - int64_t groups, - int64_t in_zero_point, - int64_t weight_zero_point, - double bias_scale, - double output_scale, - int64_t output_zero_point, - int64_t out_multiplier, - int64_t out_shift, - executorch::aten::Tensor& out); - -executorch::aten::Tensor& -quantized_conv1d_ncl_asym8uxsym8u_asym8u_per_tensor_out( - executorch::runtime::KernelRuntimeContext& ctx, - const executorch::aten::Tensor& input, - const executorch::aten::Tensor& weight, - const executorch::aten::Tensor& bias, - executorch::aten::IntArrayRef stride, - executorch::aten::IntArrayRef padding, - executorch::aten::IntArrayRef dilation, - int64_t groups, - int64_t in_zero_point, - int64_t weight_zero_point, - double bias_scale, - double output_scale, - int64_t output_zero_point, - int64_t out_multiplier, - int64_t out_shift, - executorch::aten::Tensor& 
out); - -executorch::aten::Tensor& -quantized_conv1d_nlc_asym8sxsym8s_asym8s_per_tensor_out( - executorch::runtime::KernelRuntimeContext& ctx, - const executorch::aten::Tensor& input, - const executorch::aten::Tensor& weight, - const executorch::aten::Tensor& bias, - executorch::aten::IntArrayRef stride, - executorch::aten::IntArrayRef padding, - executorch::aten::IntArrayRef dilation, - int64_t groups, - int64_t in_zero_point, - int64_t weight_zero_point, - double bias_scale, - double output_scale, - int64_t output_zero_point, - int64_t out_multiplier, - int64_t out_shift, - executorch::aten::Tensor& out); - -executorch::aten::Tensor& -quantized_conv1d_nlc_asym8uxsym8u_asym8u_per_tensor_out( - executorch::runtime::KernelRuntimeContext& ctx, - const executorch::aten::Tensor& input, - const executorch::aten::Tensor& weight, - const executorch::aten::Tensor& bias, - executorch::aten::IntArrayRef stride, - executorch::aten::IntArrayRef padding, - executorch::aten::IntArrayRef dilation, - int64_t groups, - int64_t in_zero_point, - int64_t weight_zero_point, - double bias_scale, - double output_scale, - int64_t output_zero_point, - int64_t out_multiplier, - int64_t out_shift, - executorch::aten::Tensor& out); - -} // namespace native -} // namespace generic -} // namespace impl diff --git a/backends/cadence/generic/operators/op_quantized_conv1d_ncl.cpp b/backends/cadence/generic/operators/op_quantized_conv1d_ncl.cpp new file mode 100644 index 00000000000..c013b2f7da0 --- /dev/null +++ b/backends/cadence/generic/operators/op_quantized_conv1d_ncl.cpp @@ -0,0 +1,285 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include + +#include +#include +#include +#include +#include + +namespace impl { +namespace generic { +namespace native { + +namespace { +using ::executorch::aten::IntArrayRef; +using ::executorch::aten::ScalarType; +using ::executorch::aten::Tensor; +using ::executorch::runtime::KernelRuntimeContext; +using ::impl::generic::kernels::quantize; + +// This implements a generic 1d conv kernel that operates on raw pointers. +// The quantized version handles quantized convolutions for 1D inputs. +// The input is of shape [n x c x w] +// The weight is of shape [oc x wc x ww], where wc == c +// The output is of shape [n x oc x ow] +// The bias is of shape [oc] + +template < + typename IT = float, + typename WT = IT, + typename BT = IT, + typename OT = IT, + bool quantized = false> +__attribute__((noinline)) void conv1d_ncl_core_generic( + // All the arrays + const IT* __restrict__ p_in, + const WT* __restrict__ p_weight, + const BT* __restrict__ p_bias, + OT* __restrict__ p_out, + // The array sizes + int32_t n, + int32_t c, + int32_t w, + int32_t oc, + int32_t wc, + int32_t ww, + int32_t ow, + // Stride + int16_t s, + // Padding + int16_t p, + // Dilation + int16_t d, + // Group for depthwise conv + int16_t groups, + // Optional args that are only relevant for quantized convolution + // input zero point + IT in_zero_point = 0, + // weight zero point + int32_t weight_zero_point = 0, + float bias_scale = 1, + float out_scale = 1, + OT out_zero_point = 0) { + float inv_out_scale = 1. 
/ out_scale; + bool zero_pad_unit_dilation = d == 1 && p == 0; + + // Compute the number of in and out channels per group + const int ocpg = oc / groups; + const int icpg = c / groups; + + // Iterate over all the output batches (i.e., n) + for (int _n = 0; _n < n; ++_n) { + const IT* in_batch = p_in + _n * c * w; + OT* out_batch = p_out + _n * oc * ow; + // Compute separable convolution for each group + for (int _g = 0; _g < groups; ++_g) { + // Identify the input and output channels involved in the computation + // of this group + int sic = _g * icpg; + int soc = _g * ocpg; + // Populate all the output channels in the group + for (int _oc = soc; _oc < soc + ocpg; ++_oc) { + OT* out_plane = out_batch + _oc * ow; + const WT* weight_batch = p_weight + _oc * wc * ww; + // We compute one output channel at a time. The computation can be + // thought of as a stencil computation: we iterate over an input of size + // icpg x w, with a stencil of size icpg x ww, to compute an + // output channel of size 1 x ow. + for (int _w = 0, _ow = 0; _ow < ow; _w += s, ++_ow) { + float acc = p_bias[_oc]; + // Below is the stencil computation that performs the hadamard + // product+accumulation of each input channel (contributing to the + // output channel being computed) with the corresponding weight + // channel. + // If the padding is 0, and dilation is 1, then we can remove the + // unnecessary checks, and simplify the code so that it can be + // vectorized by Tensilica compiler. + if (zero_pad_unit_dilation) { + for (int _ic = sic; _ic < sic + icpg; ++_ic) { + const IT* in_plane = in_batch + _ic * w; + const WT* weight_plane = weight_batch + (_ic - sic) * ww; + for (int _ww = 0; _ww < ww; ++_ww) { + int ioff = _w + _ww; + int woff = _ww; + float lhs = in_plane[ioff] - in_zero_point; + float rhs = + weight_plane[woff] - (quantized ? 
weight_zero_point : 0); + acc += lhs * rhs; + } + } + } else { + for (int _ic = sic; _ic < sic + icpg; ++_ic) { + const IT* in_plane = in_batch + _ic * w; + const WT* weight_plane = weight_batch + (_ic - sic) * ww; + for (int _ww = 0; _ww < ww; ++_ww) { + if (((_w + d * _ww - p) >= 0) && ((_w + d * _ww - p) < w)) { + int ioff = _w + d * _ww - p; + int woff = _ww; + float lhs = in_plane[ioff] - in_zero_point; + float rhs = + weight_plane[woff] - (quantized ? weight_zero_point : 0); + acc += lhs * rhs; + } + } + } + } + if (quantized) { + float val = bias_scale * acc; + out_plane[_ow] = quantize(val, inv_out_scale, out_zero_point); + } else { + out_plane[_ow] = acc; + } + } + } + } + } +} + +void quantized_conv1d_ncl( + const Tensor& input, + const Tensor& weight, + const Tensor& bias, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dilation, + int16_t groups, + int32_t in_zero_point, + int32_t weight_zero_point, + float bias_scale, + float output_scale, + int32_t output_zero_point, + Tensor& out) { + // input = [n, c, w] + const int n = input.size(0); + const int c = input.size(1); + const int w = input.size(2); + // weight = [oc, wc, ww] + const int oc = weight.size(0); + const int wc = weight.size(1); + const int ww = weight.size(2); + // output = [n, oc, ow] + const int ow = out.size(2); + +#define typed_quantized_conv1d_ncl(ctype, dtype) \ + case ScalarType::dtype: { \ + conv1d_ncl_core_generic( \ + input.const_data_ptr(), \ + weight.const_data_ptr(), \ + bias.const_data_ptr(), \ + out.mutable_data_ptr(), \ + n, \ + c, \ + w, \ + oc, \ + wc, \ + ww, \ + ow, \ + stride[0], \ + padding[0], \ + dilation[0], \ + groups, \ + in_zero_point, \ + weight_zero_point, \ + bias_scale, \ + output_scale, \ + (ctype)output_zero_point); \ + break; \ + } + ScalarType dtype = out.scalar_type(); + switch (dtype) { + ET_FORALL_CADENCE_QUANTIZED_TYPES(typed_quantized_conv1d_ncl); + default: + ET_DCHECK_MSG( + false, "Unhandled dtype %s", torch::executor::toString(dtype)); 
+ } + +#undef typed_quantized_conv1d_ncl +} + +} // namespace + +// Public exported kernel functions + +::executorch::aten::Tensor& quantized_conv1d_ncl_out( + KernelRuntimeContext& ctx, + const Tensor& input, + const Tensor& weight, + const Tensor& bias, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dilation, + int64_t groups, + int64_t input_zero_point, + const Tensor& weight_zero_point, + const Tensor& bias_scale, + double output_scale, + int64_t output_zero_point, + const Tensor& out_multiplier, + const Tensor& out_shift, + Tensor& out) { + (void)ctx; + (void)out_multiplier; + (void)out_shift; + quantized_conv1d_ncl( + input, + weight, + bias, + stride, + padding, + dilation, + static_cast(groups), + static_cast(input_zero_point), + weight_zero_point.const_data_ptr()[0], + bias_scale.const_data_ptr()[0], + static_cast(output_scale), + static_cast(output_zero_point), + out); + return out; +} + +::executorch::aten::Tensor& quantized_conv1d_ncl_per_tensor_out( + KernelRuntimeContext& ctx, + const Tensor& input, + const Tensor& weight, + const Tensor& bias, + IntArrayRef stride, + IntArrayRef padding, + __ET_UNUSED IntArrayRef dilation, + int64_t groups, + int64_t input_zero_point, + int64_t weight_zero_point, + double bias_scale, + double output_scale, + int64_t output_zero_point, + __ET_UNUSED int64_t out_multiplier, + __ET_UNUSED int64_t out_shift, + Tensor& out) { + (void)ctx; + quantized_conv1d_ncl( + input, + weight, + bias, + stride, + padding, + {1}, + static_cast(groups), + static_cast(input_zero_point), + static_cast(weight_zero_point), + static_cast(bias_scale), + static_cast(output_scale), + static_cast(output_zero_point), + out); + return out; +} + +} // namespace native +} // namespace generic +} // namespace impl diff --git a/backends/cadence/generic/operators/op_quantized_conv1d_ncl.h b/backends/cadence/generic/operators/op_quantized_conv1d_ncl.h new file mode 100644 index 00000000000..f6854beff12 --- /dev/null +++ 
b/backends/cadence/generic/operators/op_quantized_conv1d_ncl.h @@ -0,0 +1,61 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include + +namespace impl { +namespace generic { +namespace native { + +using ::executorch::aten::IntArrayRef; +using ::executorch::aten::Tensor; +using ::executorch::runtime::KernelRuntimeContext; + +// NCL format (N=batch, C=channels, L=length) +::executorch::aten::Tensor& quantized_conv1d_ncl_out( + KernelRuntimeContext& ctx, + const Tensor& input, + const Tensor& weight, + const Tensor& bias, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dilation, + int64_t groups, + int64_t input_zero_point, + const Tensor& weight_zero_point, + const Tensor& bias_scale, + double output_scale, + int64_t output_zero_point, + const Tensor& out_multiplier, + const Tensor& out_shift, + Tensor& out); + +::executorch::aten::Tensor& quantized_conv1d_ncl_per_tensor_out( + KernelRuntimeContext& ctx, + const Tensor& input, + const Tensor& weight, + const Tensor& bias, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dilation, + int64_t groups, + int64_t input_zero_point, + int64_t weight_zero_point, + double bias_scale, + double output_scale, + int64_t output_zero_point, + int64_t out_multiplier, + int64_t out_shift, + Tensor& out); + +} // namespace native +} // namespace generic +} // namespace impl diff --git a/backends/cadence/generic/operators/op_quantized_conv1d_nlc.cpp b/backends/cadence/generic/operators/op_quantized_conv1d_nlc.cpp new file mode 100644 index 00000000000..b19ac059563 --- /dev/null +++ b/backends/cadence/generic/operators/op_quantized_conv1d_nlc.cpp @@ -0,0 +1,280 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +#include +#include +#include +#include +#include + +namespace impl { +namespace generic { +namespace native { + +namespace { +using ::executorch::aten::IntArrayRef; +using ::executorch::aten::ScalarType; +using ::executorch::aten::Tensor; +using ::executorch::runtime::KernelRuntimeContext; +using ::impl::generic::kernels::quantize; + +// This implements a generic 1d conv kernel that operates on raw pointers. +// The quantized version handles quantized convolutions for 1D inputs. +// The input is of shape [n x w x c] (NLC format) +// The weight is of shape [oc x ww x wc], where wc == c +// The output is of shape [n x ow x oc] +// The bias is of shape [oc] + +template < + typename IT = float, + typename WT = IT, + typename BT = IT, + typename OT = IT, + bool quantized = false> +__attribute__((noinline)) void conv1d_nlc_core_generic( + // All the arrays + const IT* __restrict__ p_in, + const WT* __restrict__ p_weight, + const BT* __restrict__ p_bias, + OT* __restrict__ p_out, + // The array sizes + int32_t n, + int32_t w, + int32_t c, + int32_t oc, + int32_t ww, + int32_t wc, + int32_t ow, + // Stride + int16_t s, + // Padding + int16_t p, + // Dilation + int16_t d, + // Group for depthwise conv + int16_t groups, + // Optional args that are only relevant for quantized convolution + // input zero point + IT in_zero_point = 0, + // weight zero point + int32_t weight_zero_point = 0, + float bias_scale = 1, + float out_scale = 1, + OT out_zero_point = 0) { + float inv_out_scale = 1. 
/ out_scale; + bool zero_pad_unit_dilation = d == 1 && p == 0; + + // Compute the number of in and out channels per group + const int ocpg = oc / groups; + const int icpg = c / groups; + + // Iterate over all the output batches (i.e., n) + for (int _n = 0; _n < n; ++_n) { + const IT* in_batch = p_in + _n * w * c; + OT* out_batch = p_out + _n * ow * oc; + for (int _w = 0, _ow = 0; _ow < ow; _w += s, ++_ow) { + OT* out_line = out_batch + _ow * oc; + // Compute separable convolution for each group + for (int _g = 0; _g < groups; ++_g) { + // Identify the input and output channels involved in the computation + // of this group + int sic = _g * icpg; + int soc = _g * ocpg; + // Populate all the output channels in the group + for (int _oc = soc; _oc < soc + ocpg; ++_oc) { + const WT* weight_batch = p_weight + _oc * ww * wc; + // We compute one output channel at a time. The computation can be + // thought of as a stencil computation: we iterate over an input of + // size w x icpg, with a stencil of size ww x icpg, to + // compute an output channel of size ow x 1. + float acc = p_bias[_oc]; + // Below is the stencil computation that performs the hadamard + // product+accumulation of each input channel (contributing to + // the output channel being computed) with the corresponding + // weight channel. If the padding is 0, and dilation is 1, then + // we can remove the unnecessary checks, and simplify the code + // so that it can be vectorized by Tensilica compiler. + if (zero_pad_unit_dilation) { + for (int _ww = 0; _ww < ww; ++_ww) { + const IT* in_line = in_batch + (_w + _ww) * c; + const WT* weight_line = weight_batch + _ww * wc; + for (int _ic = sic; _ic < sic + icpg; ++_ic) { + float lhs = in_line[_ic] - in_zero_point; + float rhs = weight_line[_ic - sic] - + (quantized ? 
weight_zero_point : 0); + acc += lhs * rhs; + } + } + } else { + for (int _ww = 0; _ww < ww; ++_ww) { + if (((_w + d * _ww - p) >= 0) && ((_w + d * _ww - p) < w)) { + const IT* in_line = in_batch + (_w + d * _ww - p) * c; + const WT* weight_line = weight_batch + _ww * wc; + for (int _ic = sic; _ic < sic + icpg; ++_ic) { + float lhs = in_line[_ic] - in_zero_point; + float rhs = weight_line[_ic - sic] - + (quantized ? weight_zero_point : 0); + acc += lhs * rhs; + } + } + } + } + if (quantized) { + float val = bias_scale * acc; + out_line[_oc] = quantize(val, inv_out_scale, out_zero_point); + } else { + out_line[_oc] = acc; + } + } + } + } + } +} + +void quantized_conv1d_nlc( + const Tensor& input, + const Tensor& weight, + const Tensor& bias, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dilation, + int16_t groups, + int32_t in_zero_point, + int32_t weight_zero_point, + float bias_scale, + float output_scale, + int32_t output_zero_point, + Tensor& out) { + // input = [n, w, c] + const int n = input.size(0); + const int w = input.size(1); + const int c = input.size(2); + // weight = [oc, ww, wc] + const int oc = weight.size(0); + const int ww = weight.size(1); + const int wc = weight.size(2); + // output = [n, ow, oc] + const int ow = out.size(1); + +#define typed_quantized_conv1d_nlc(ctype, dtype) \ + case ScalarType::dtype: { \ + conv1d_nlc_core_generic( \ + input.const_data_ptr(), \ + weight.const_data_ptr(), \ + bias.const_data_ptr(), \ + out.mutable_data_ptr(), \ + n, \ + w, \ + c, \ + oc, \ + ww, \ + wc, \ + ow, \ + stride[0], \ + padding[0], \ + dilation[0], \ + groups, \ + in_zero_point, \ + weight_zero_point, \ + bias_scale, \ + output_scale, \ + (ctype)output_zero_point); \ + break; \ + } + ScalarType dtype = out.scalar_type(); + switch (dtype) { + ET_FORALL_CADENCE_QUANTIZED_TYPES(typed_quantized_conv1d_nlc); + default: + ET_DCHECK_MSG( + false, "Unhandled dtype %s", torch::executor::toString(dtype)); + } + +#undef typed_quantized_conv1d_nlc +} 
+ +} // namespace + +// Public exported kernel functions + +::executorch::aten::Tensor& quantized_conv1d_nlc_out( + KernelRuntimeContext& ctx, + const Tensor& input, + const Tensor& weight, + const Tensor& bias, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dilation, + int64_t groups, + int64_t input_zero_point, + const Tensor& weight_zero_point, + const Tensor& bias_scale, + double output_scale, + int64_t output_zero_point, + const Tensor& out_multiplier, + const Tensor& out_shift, + Tensor& out) { + (void)ctx; + (void)out_multiplier; + (void)out_shift; + quantized_conv1d_nlc( + input, + weight, + bias, + stride, + padding, + dilation, + static_cast(groups), + static_cast(input_zero_point), + weight_zero_point.const_data_ptr()[0], + bias_scale.const_data_ptr()[0], + static_cast(output_scale), + static_cast(output_zero_point), + out); + return out; +} + +::executorch::aten::Tensor& quantized_conv1d_nlc_per_tensor_out( + KernelRuntimeContext& ctx, + const Tensor& input, + const Tensor& weight, + const Tensor& bias, + IntArrayRef stride, + IntArrayRef padding, + __ET_UNUSED IntArrayRef dilation, + int64_t groups, + int64_t input_zero_point, + int64_t weight_zero_point, + double bias_scale, + double output_scale, + int64_t output_zero_point, + __ET_UNUSED int64_t out_multiplier, + __ET_UNUSED int64_t out_shift, + Tensor& out) { + (void)ctx; + quantized_conv1d_nlc( + input, + weight, + bias, + stride, + padding, + {1}, + static_cast(groups), + static_cast(input_zero_point), + static_cast(weight_zero_point), + static_cast(bias_scale), + static_cast(output_scale), + static_cast(output_zero_point), + out); + return out; +} + +} // namespace native +} // namespace generic +} // namespace impl diff --git a/backends/cadence/generic/operators/op_quantized_conv1d_nlc.h b/backends/cadence/generic/operators/op_quantized_conv1d_nlc.h new file mode 100644 index 00000000000..7713121cf97 --- /dev/null +++ b/backends/cadence/generic/operators/op_quantized_conv1d_nlc.h @@ 
-0,0 +1,61 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include + +namespace impl { +namespace generic { +namespace native { + +using ::executorch::aten::IntArrayRef; +using ::executorch::aten::Tensor; +using ::executorch::runtime::KernelRuntimeContext; + +// NLC format (N=batch, L=length, C=channels) +::executorch::aten::Tensor& quantized_conv1d_nlc_out( + KernelRuntimeContext& ctx, + const Tensor& input, + const Tensor& weight, + const Tensor& bias, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dilation, + int64_t groups, + int64_t input_zero_point, + const Tensor& weight_zero_point, + const Tensor& bias_scale, + double output_scale, + int64_t output_zero_point, + const Tensor& out_multiplier, + const Tensor& out_shift, + Tensor& out); + +::executorch::aten::Tensor& quantized_conv1d_nlc_per_tensor_out( + KernelRuntimeContext& ctx, + const Tensor& input, + const Tensor& weight, + const Tensor& bias, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dilation, + int64_t groups, + int64_t input_zero_point, + int64_t weight_zero_point, + double bias_scale, + double output_scale, + int64_t output_zero_point, + int64_t out_multiplier, + int64_t out_shift, + Tensor& out); + +} // namespace native +} // namespace generic +} // namespace impl diff --git a/backends/cadence/generic/operators/targets.bzl b/backends/cadence/generic/operators/targets.bzl index 77535466d46..faa63e4f46f 100644 --- a/backends/cadence/generic/operators/targets.bzl +++ b/backends/cadence/generic/operators/targets.bzl @@ -121,9 +121,22 @@ def define_common_targets(): ) runtime.cxx_library( - name = "op_quantized_conv1d", - srcs = ["op_quantized_conv1d.cpp"], - exported_headers = ["op_quantized_conv1d.h"], + name = "op_quantized_conv1d_ncl", + srcs = 
["op_quantized_conv1d_ncl.cpp"], + exported_headers = ["op_quantized_conv1d_ncl.h"], + platforms = CXX, + deps = [ + ":cadence_type_util", + "//executorch/backends/cadence/generic/kernels:cadence_kernels", + "//executorch/runtime/kernel:kernel_includes", + ], + visibility = ["PUBLIC"], + ) + + runtime.cxx_library( + name = "op_quantized_conv1d_nlc", + srcs = ["op_quantized_conv1d_nlc.cpp"], + exported_headers = ["op_quantized_conv1d_nlc.h"], platforms = CXX, deps = [ ":cadence_type_util", diff --git a/backends/cadence/hifi/operators/CMakeLists.txt b/backends/cadence/hifi/operators/CMakeLists.txt index 185ea390c0e..626033a5a43 100644 --- a/backends/cadence/hifi/operators/CMakeLists.txt +++ b/backends/cadence/hifi/operators/CMakeLists.txt @@ -124,11 +124,9 @@ add_library( "op_quantized_relu_asym8u_asym8u_per_tensor_out.cpp" "op_dequantize_per_tensor.cpp" "op_dequantize_per_tensor_asym8s.cpp" - "op_quantized_conv1d_ncl_asym8sxsym8s_asym8s_per_tensor_out.cpp" - "op_quantized_conv1d_ncl_asym8uxsym8u_asym8u_per_tensor_out.cpp" - "op_quantized_conv1d_nlc_asym8sxsym8s_asym8s_per_tensor_out.cpp" - "op_quantized_conv1d_nlc_asym8uxsym8u_asym8u_per_tensor_out.cpp" "op_quantized_conv2d_nchw_out.cpp" + "op_quantized_conv1d_ncl.cpp" + "op_quantized_conv1d_nlc.cpp" "op_quantized_conv2d_nchw_asym8sxsym8s_asym8s_per_tensor_out.cpp" "op_quantized_conv2d_nchw_asym8uxsym8u_asym8u_per_tensor_out.cpp" "op_quantized_conv2d_nchw_depthwise_asym8sxsym8s_asym8s_per_tensor_out.cpp" diff --git a/backends/cadence/hifi/operators/op_quantized_conv1d_ncl_asym8sxsym8s_asym8s_per_tensor_out.cpp b/backends/cadence/hifi/operators/op_quantized_conv1d_ncl.cpp similarity index 54% rename from backends/cadence/hifi/operators/op_quantized_conv1d_ncl_asym8sxsym8s_asym8s_per_tensor_out.cpp rename to backends/cadence/hifi/operators/op_quantized_conv1d_ncl.cpp index f543f4633cf..9b0ccf4ea25 100644 --- a/backends/cadence/hifi/operators/op_quantized_conv1d_ncl_asym8sxsym8s_asym8s_per_tensor_out.cpp +++ 
b/backends/cadence/hifi/operators/op_quantized_conv1d_ncl.cpp @@ -21,6 +21,8 @@ namespace impl { namespace HiFi { namespace native { +namespace { + // Optimized NCHW 1D convolution for int8 x int8 -> int8 void xa_opt_quantized_conv1d_ncl_asym8sxsym8s_asym8s( KernelRuntimeContext& ctx, @@ -199,7 +201,144 @@ void xa_opt_quantized_conv1d_ncl_asym8sxsym8s_asym8s( } } -void quantized_conv1d_ncl_asym8sxsym8s_asym8s_per_tensor_out( +// Optimized NCHW 1D convolution for uint8 x uint8 -> uint8 +void xa_opt_quantized_conv1d_ncl_asym8uxsym8u_asym8u( + KernelRuntimeContext& ctx, + const Tensor& input, + const Tensor& weight, + const Tensor& bias, + IntArrayRef stride, + IntArrayRef padding, + int32_t in_zero_point, + int32_t weight_zero_point, + float bias_scale, + float output_scale, + int32_t output_zero_point, + Tensor& out) { + constexpr int kNnlibMaxDim = 5; + + UWORD8* __restrict__ p_out = + (UWORD8* __restrict__)out.mutable_data_ptr(); + UWORD8* __restrict__ p_inp = + (UWORD8* __restrict__)input.const_data_ptr(); + UWORD8* __restrict__ p_kernel = + (UWORD8* __restrict__)weight.const_data_ptr(); + WORD32* __restrict__ p_bias = + (WORD32* __restrict__)bias.const_data_ptr(); + + WORD32 batches = input.size(0); + WORD32 input_channels = input.size(1); + WORD32 input_width = input.size(2); + WORD32 input_height = 1; + WORD32 kernel_height = 1; + WORD32 out_channels = weight.size(0); + WORD32 kernel_channels = weight.size(1); + WORD32 kernel_width = weight.size(2); + WORD32 out_width = out.size(2); + WORD32 out_height = 1; + WORD32 x_stride = stride[1]; + WORD32 x_padding = padding[1]; + WORD32 input_zero_bias = -in_zero_point; + WORD32 out_multiplier32 = bias_scale * (1. 
/ output_scale) * 2147483648; + WORD32 out_shift32 = 0; + WORD32 kernel_zero_bias = -weight_zero_point; + + WORD32 out_zero_bias = output_zero_point; + WORD32 out_data_format = 1; + + WORD32 scratch_size = + xa_nn_conv1d_std_getsize(kernel_width, input_width, input_channels, 8); + scratch_size = scratch_size < 0 ? 0 : scratch_size; + WORD32* ptr_scratch = + (WORD32*)kernels::allocate_temp_memory(ctx, scratch_size); + pVOID p_scratch = (pVOID)ALIGN_PTR(ptr_scratch, 8); + + WORD8* ptr1 = (WORD8*)kernels::allocate_temp_memory( + ctx, ((batches * input_channels * input_width) + 8) * sizeof(WORD8)); + WORD8* ptr2 = (WORD8*)kernels::allocate_temp_memory( + ctx, + ((out_channels * kernel_channels * kernel_width) + 8) * sizeof(WORD8)); + WORD8* pin = (WORD8*)ALIGN_PTR(ptr1, 8); + WORD8* pkernel = (WORD8*)ALIGN_PTR(ptr2, 8); + + WORD32 p_inp_shape[kNnlibMaxDim]; + p_inp_shape[0] = 1; + p_inp_shape[1] = 1; + p_inp_shape[2] = batches; + p_inp_shape[3] = input_channels; + p_inp_shape[4] = input_width; + + WORD32 p_out_shape[kNnlibMaxDim]; + p_out_shape[0] = 1; + p_out_shape[1] = 1; + p_out_shape[2] = batches; + p_out_shape[3] = input_width; + p_out_shape[4] = input_channels; + + WORD32 p_permute_vec[kNnlibMaxDim] = {0, 1, 2, 4, 3}; + + xa_nn_transpose_8_8( + (WORD8*)pin, + p_out_shape, + (WORD8*)p_inp, + p_inp_shape, + p_permute_vec, + kNnlibMaxDim, + kNnlibMaxDim); + + WORD32 p_inp_shape1[kNnlibMaxDim]; + p_inp_shape1[0] = 1; + p_inp_shape1[1] = 1; + p_inp_shape1[2] = out_channels; + p_inp_shape1[3] = kernel_channels; + p_inp_shape1[4] = kernel_width; + + WORD32 p_out_shape1[kNnlibMaxDim]; + p_out_shape1[0] = 1; + p_out_shape1[1] = 1; + p_out_shape1[2] = out_channels; + p_out_shape1[3] = kernel_width; + p_out_shape1[4] = kernel_channels; + + xa_nn_transpose_8_8( + (WORD8*)pkernel, + p_out_shape1, + (WORD8*)p_kernel, + p_inp_shape1, + p_permute_vec, + kNnlibMaxDim, + kNnlibMaxDim); + + for (int _n = 0; _n < batches; _n++) { + UWORD8* in_batch = (UWORD8*)(pin + _n * 
input_channels * input_width); + UWORD8* out_batch = (UWORD8*)(p_out + _n * out_channels * out_width); + + xa_nn_conv1d_std_asym8uxasym8u( + out_batch, + in_batch, + (UWORD8*)pkernel, + p_bias, + input_width, + input_height, + input_channels, + kernel_width, + out_channels, + x_stride, + x_padding, + out_width, + input_zero_bias, + kernel_zero_bias, + out_multiplier32, + out_shift32, + out_zero_bias, + out_data_format, + p_scratch); + } +} + +} // namespace + +void quantized_conv1d_ncl_per_tensor_out( KernelRuntimeContext& ctx, const Tensor& input, const Tensor& weight, @@ -216,19 +355,42 @@ void quantized_conv1d_ncl_asym8sxsym8s_asym8s_per_tensor_out( __ET_UNUSED int64_t out_multiplier, __ET_UNUSED int64_t out_shift, Tensor& out) { - xa_opt_quantized_conv1d_ncl_asym8sxsym8s_asym8s( - ctx, - input, - weight, - bias, - stride, - padding, - in_zero_point, - weight_zero_point, - bias_scale, - output_scale, - output_zero_point, - out); + ScalarType dtype = out.scalar_type(); + + if (dtype == ScalarType::Char) { + xa_opt_quantized_conv1d_ncl_asym8sxsym8s_asym8s( + ctx, + input, + weight, + bias, + stride, + padding, + static_cast(in_zero_point), + static_cast(weight_zero_point), + static_cast(bias_scale), + static_cast(output_scale), + static_cast(output_zero_point), + out); + } else if (dtype == ScalarType::Byte) { + xa_opt_quantized_conv1d_ncl_asym8uxsym8u_asym8u( + ctx, + input, + weight, + bias, + stride, + padding, + static_cast(in_zero_point), + static_cast(weight_zero_point), + static_cast(bias_scale), + static_cast(output_scale), + static_cast(output_zero_point), + out); + } else { + ET_DCHECK_MSG( + false, + "Unhandled dtype %s for quantized_conv1d_ncl", + torch::executor::toString(dtype)); + } } } // namespace native diff --git a/backends/cadence/hifi/operators/op_quantized_conv1d_ncl_asym8uxsym8u_asym8u_per_tensor_out.cpp b/backends/cadence/hifi/operators/op_quantized_conv1d_ncl_asym8uxsym8u_asym8u_per_tensor_out.cpp deleted file mode 100644 index 
4ad36a3b5fa..00000000000 --- a/backends/cadence/hifi/operators/op_quantized_conv1d_ncl_asym8uxsym8u_asym8u_per_tensor_out.cpp +++ /dev/null @@ -1,193 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include -#include -#include - -#define ALIGN_PTR(x, bytes) ((((unsigned)(x)) + (bytes - 1)) & (~(bytes - 1))) - -using Tensor = executorch::aten::Tensor; -using KernelRuntimeContext = torch::executor::KernelRuntimeContext; -using ScalarType = executorch::aten::ScalarType; -using ::executorch::aten::IntArrayRef; - -namespace impl { -namespace HiFi { -namespace native { - -// Optimized NCHW 1D convolution for uint8 x uint8 -> uint8 -void xa_opt_quantized_conv1d_ncl_asym8uxsym8u_asym8u( - KernelRuntimeContext& ctx, - const Tensor& input, - const Tensor& weight, - const Tensor& bias, - IntArrayRef stride, - IntArrayRef padding, - int32_t in_zero_point, - int32_t weight_zero_point, - float bias_scale, - float output_scale, - int32_t output_zero_point, - Tensor& out) { - constexpr int kNnlibMaxDim = 5; - - UWORD8* __restrict__ p_out = - (UWORD8* __restrict__)out.mutable_data_ptr(); - UWORD8* __restrict__ p_inp = - (UWORD8* __restrict__)input.const_data_ptr(); - UWORD8* __restrict__ p_kernel = - (UWORD8* __restrict__)weight.const_data_ptr(); - WORD32* __restrict__ p_bias = - (WORD32* __restrict__)bias.const_data_ptr(); - - WORD32 batches = input.size(0); - WORD32 input_channels = input.size(1); - WORD32 input_width = input.size(2); - WORD32 input_height = 1; - WORD32 kernel_height = 1; - WORD32 out_channels = weight.size(0); - WORD32 kernel_channels = weight.size(1); - WORD32 kernel_width = weight.size(2); - WORD32 out_width = out.size(2); - WORD32 out_height = 1; - WORD32 x_stride = stride[1]; - WORD32 x_padding = padding[1]; - WORD32 input_zero_bias = -in_zero_point; - WORD32 
out_multiplier32 = bias_scale * (1. / output_scale) * 2147483648; - WORD32 out_shift32 = 0; - WORD32 kernel_zero_bias = -weight_zero_point; - - WORD32 out_zero_bias = output_zero_point; - WORD32 out_data_format = 1; - - WORD32 scratch_size = - xa_nn_conv1d_std_getsize(kernel_width, input_width, input_channels, 8); - scratch_size = scratch_size < 0 ? 0 : scratch_size; - WORD32* ptr_scratch = - (WORD32*)kernels::allocate_temp_memory(ctx, scratch_size); - pVOID p_scratch = (pVOID)ALIGN_PTR(ptr_scratch, 8); - - WORD8* ptr1 = (WORD8*)kernels::allocate_temp_memory( - ctx, ((batches * input_channels * input_width) + 8) * sizeof(WORD8)); - WORD8* ptr2 = (WORD8*)kernels::allocate_temp_memory( - ctx, - ((out_channels * kernel_channels * kernel_width) + 8) * sizeof(WORD8)); - WORD8* pin = (WORD8*)ALIGN_PTR(ptr1, 8); - WORD8* pkernel = (WORD8*)ALIGN_PTR(ptr2, 8); - - WORD32 p_inp_shape[kNnlibMaxDim]; - p_inp_shape[0] = 1; - p_inp_shape[1] = 1; - p_inp_shape[2] = batches; - p_inp_shape[3] = input_channels; - p_inp_shape[4] = input_width; - - WORD32 p_out_shape[kNnlibMaxDim]; - p_out_shape[0] = 1; - p_out_shape[1] = 1; - p_out_shape[2] = batches; - p_out_shape[3] = input_width; - p_out_shape[4] = input_channels; - - WORD32 p_permute_vec[kNnlibMaxDim] = {0, 1, 2, 4, 3}; - - xa_nn_transpose_8_8( - (WORD8*)pin, - p_out_shape, - (WORD8*)p_inp, - p_inp_shape, - p_permute_vec, - kNnlibMaxDim, - kNnlibMaxDim); - - WORD32 p_inp_shape1[kNnlibMaxDim]; - p_inp_shape1[0] = 1; - p_inp_shape1[1] = 1; - p_inp_shape1[2] = out_channels; - p_inp_shape1[3] = kernel_channels; - p_inp_shape1[4] = kernel_width; - - WORD32 p_out_shape1[kNnlibMaxDim]; - p_out_shape1[0] = 1; - p_out_shape1[1] = 1; - p_out_shape1[2] = out_channels; - p_out_shape1[3] = kernel_width; - p_out_shape1[4] = kernel_channels; - - xa_nn_transpose_8_8( - (WORD8*)pkernel, - p_out_shape1, - (WORD8*)p_kernel, - p_inp_shape1, - p_permute_vec, - kNnlibMaxDim, - kNnlibMaxDim); - - for (int _n = 0; _n < batches; _n++) { - UWORD8* 
in_batch = (UWORD8*)(pin + _n * input_channels * input_width); - UWORD8* out_batch = (UWORD8*)(p_out + _n * out_channels * out_width); - - xa_nn_conv1d_std_asym8uxasym8u( - out_batch, - in_batch, - (UWORD8*)pkernel, - p_bias, - input_width, - input_height, - input_channels, - kernel_width, - out_channels, - x_stride, - x_padding, - out_width, - input_zero_bias, - kernel_zero_bias, - out_multiplier32, - out_shift32, - out_zero_bias, - out_data_format, - p_scratch); - } -} - -void quantized_conv1d_ncl_asym8uxsym8u_asym8u_per_tensor_out( - KernelRuntimeContext& ctx, - const Tensor& input, - const Tensor& weight, - const Tensor& bias, - IntArrayRef stride, - IntArrayRef padding, - __ET_UNUSED IntArrayRef dilation, - __ET_UNUSED int64_t groups, - int64_t in_zero_point, - int64_t weight_zero_point, - double bias_scale, - double output_scale, - int64_t output_zero_point, - __ET_UNUSED int64_t out_multiplier, - __ET_UNUSED int64_t out_shift, - Tensor& out) { - xa_opt_quantized_conv1d_ncl_asym8uxsym8u_asym8u( - ctx, - input, - weight, - bias, - stride, - padding, - in_zero_point, - weight_zero_point, - bias_scale, - output_scale, - output_zero_point, - out); -} - -} // namespace native -} // namespace HiFi -} // namespace impl diff --git a/backends/cadence/hifi/operators/op_quantized_conv1d_nlc_asym8sxsym8s_asym8s_per_tensor_out.cpp b/backends/cadence/hifi/operators/op_quantized_conv1d_nlc.cpp similarity index 57% rename from backends/cadence/hifi/operators/op_quantized_conv1d_nlc_asym8sxsym8s_asym8s_per_tensor_out.cpp rename to backends/cadence/hifi/operators/op_quantized_conv1d_nlc.cpp index 3b1c7b9a900..e40cca4a88a 100644 --- a/backends/cadence/hifi/operators/op_quantized_conv1d_nlc_asym8sxsym8s_asym8s_per_tensor_out.cpp +++ b/backends/cadence/hifi/operators/op_quantized_conv1d_nlc.cpp @@ -21,6 +21,8 @@ namespace impl { namespace HiFi { namespace native { +namespace { + // Optimized NHWC 1D convolution for int8 x int8 -> int8 void 
xa_opt_quantized_conv1d_nlc_asym8sxsym8s_asym8s( KernelRuntimeContext& ctx, @@ -141,7 +143,81 @@ void xa_opt_quantized_conv1d_nlc_asym8sxsym8s_asym8s( } } -void quantized_conv1d_nlc_asym8sxsym8s_asym8s_per_tensor_out( +// Optimized NHWC 1D convolution for uint8 x uint8 -> uint8 +void xa_opt_quantized_conv1d_nlc_asym8uxsym8u_asym8u( + KernelRuntimeContext& ctx, + const Tensor& input, + const Tensor& weight, + const Tensor& bias, + IntArrayRef stride, + IntArrayRef padding, + int32_t in_zero_point, + int32_t weight_zero_point, + float bias_scale, + float output_scale, + int32_t output_zero_point, + Tensor& out) { + UWORD8* __restrict__ p_out = + (UWORD8* __restrict__)out.mutable_data_ptr(); + UWORD8* __restrict__ p_inp = + (UWORD8* __restrict__)input.const_data_ptr(); + UWORD8* __restrict__ p_kernel = + (UWORD8* __restrict__)weight.const_data_ptr(); + WORD32* __restrict__ p_bias = + (WORD32* __restrict__)bias.const_data_ptr(); + + WORD32 batches = input.size(0); + WORD32 input_channels = input.size(2); + WORD32 input_width = input.size(1); + WORD32 out_channels = weight.size(2); + WORD32 kernel_width = weight.size(1); + WORD32 out_width = out.size(1); + WORD32 x_stride = stride[1]; + WORD32 x_padding = padding[1]; + WORD32 input_zero_bias = -in_zero_point; + WORD32 out_multiplier32 = bias_scale * (1. / output_scale) * 2147483648; + WORD32 out_shift32 = 0; + WORD32 kernel_zero_bias = -weight_zero_point; + + WORD32 out_zero_bias = output_zero_point; + WORD32 out_data_format = 0; + WORD32 scratch_size = + xa_nn_conv1d_std_getsize(kernel_width, input_width, input_channels, 8); + scratch_size = scratch_size < 0 ? 
0 : scratch_size; + WORD32* ptr_scratch = + (WORD32*)kernels::allocate_temp_memory(ctx, scratch_size); + pVOID p_scratch = (pVOID)ALIGN_PTR(ptr_scratch, 8); + + for (int _n = 0; _n < batches; _n++) { + UWORD8* in_batch = p_inp + _n * input_channels * input_width; + UWORD8* out_batch = p_out + _n * out_channels * out_width; + + xa_nn_conv1d_std_asym8uxasym8u( + out_batch, + in_batch, + p_kernel, + p_bias, + input_width, + 1, + input_channels, + kernel_width, + out_channels, + x_stride, + x_padding, + out_width, + input_zero_bias, + kernel_zero_bias, + out_multiplier32, + out_shift32, + out_zero_bias, + out_data_format, + p_scratch); + } +} + +} // namespace + +void quantized_conv1d_nlc_per_tensor_out( KernelRuntimeContext& ctx, const Tensor& input, const Tensor& weight, @@ -158,19 +234,42 @@ void quantized_conv1d_nlc_asym8sxsym8s_asym8s_per_tensor_out( __ET_UNUSED int64_t out_multiplier, __ET_UNUSED int64_t out_shift, Tensor& out) { - xa_opt_quantized_conv1d_nlc_asym8sxsym8s_asym8s( - ctx, - input, - weight, - bias, - stride, - padding, - in_zero_point, - weight_zero_point, - bias_scale, - output_scale, - output_zero_point, - out); + ScalarType dtype = out.scalar_type(); + + if (dtype == ScalarType::Char) { + xa_opt_quantized_conv1d_nlc_asym8sxsym8s_asym8s( + ctx, + input, + weight, + bias, + stride, + padding, + static_cast(in_zero_point), + static_cast(weight_zero_point), + static_cast(bias_scale), + static_cast(output_scale), + static_cast(output_zero_point), + out); + } else if (dtype == ScalarType::Byte) { + xa_opt_quantized_conv1d_nlc_asym8uxsym8u_asym8u( + ctx, + input, + weight, + bias, + stride, + padding, + static_cast(in_zero_point), + static_cast(weight_zero_point), + static_cast(bias_scale), + static_cast(output_scale), + static_cast(output_zero_point), + out); + } else { + ET_DCHECK_MSG( + false, + "Unhandled dtype %s for quantized_conv1d_nlc", + torch::executor::toString(dtype)); + } } } // namespace native diff --git 
a/backends/cadence/hifi/operators/op_quantized_conv1d_nlc_asym8uxsym8u_asym8u_per_tensor_out.cpp b/backends/cadence/hifi/operators/op_quantized_conv1d_nlc_asym8uxsym8u_asym8u_per_tensor_out.cpp deleted file mode 100644 index 5539410f46e..00000000000 --- a/backends/cadence/hifi/operators/op_quantized_conv1d_nlc_asym8uxsym8u_asym8u_per_tensor_out.cpp +++ /dev/null @@ -1,130 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include -#include -#include - -#define ALIGN_PTR(x, bytes) ((((unsigned)(x)) + (bytes - 1)) & (~(bytes - 1))) - -using Tensor = executorch::aten::Tensor; -using KernelRuntimeContext = torch::executor::KernelRuntimeContext; -using ScalarType = executorch::aten::ScalarType; -using ::executorch::aten::IntArrayRef; - -namespace impl { -namespace HiFi { -namespace native { - -// Optimized NHWC 1D convolution for uint8 x uint8 -> uint8 -void xa_opt_quantized_conv1d_nlc_asym8uxsym8u_asym8u( - KernelRuntimeContext& ctx, - const Tensor& input, - const Tensor& weight, - const Tensor& bias, - IntArrayRef stride, - IntArrayRef padding, - int32_t in_zero_point, - int32_t weight_zero_point, - float bias_scale, - float output_scale, - int32_t output_zero_point, - Tensor& out) { - UWORD8* __restrict__ p_out = - (UWORD8* __restrict__)out.mutable_data_ptr(); - UWORD8* __restrict__ p_inp = - (UWORD8* __restrict__)input.const_data_ptr(); - UWORD8* __restrict__ p_kernel = - (UWORD8* __restrict__)weight.const_data_ptr(); - WORD32* __restrict__ p_bias = - (WORD32* __restrict__)bias.const_data_ptr(); - - WORD32 batches = input.size(0); - WORD32 input_channels = input.size(2); - WORD32 input_width = input.size(1); - WORD32 out_channels = weight.size(2); - WORD32 kernel_width = weight.size(1); - WORD32 out_width = out.size(1); - WORD32 x_stride = stride[1]; - WORD32 x_padding = 
padding[1]; - WORD32 input_zero_bias = -in_zero_point; - WORD32 out_multiplier32 = bias_scale * (1. / output_scale) * 2147483648; - WORD32 out_shift32 = 0; - WORD32 kernel_zero_bias = -weight_zero_point; - - WORD32 out_zero_bias = output_zero_point; - WORD32 out_data_format = 0; - WORD32 scratch_size = - xa_nn_conv1d_std_getsize(kernel_width, input_width, input_channels, 8); - scratch_size = scratch_size < 0 ? 0 : scratch_size; - WORD32* ptr_scratch = - (WORD32*)kernels::allocate_temp_memory(ctx, scratch_size); - pVOID p_scratch = (pVOID)ALIGN_PTR(ptr_scratch, 8); - - for (int _n = 0; _n < batches; _n++) { - UWORD8* in_batch = p_inp + _n * input_channels * input_width; - UWORD8* out_batch = p_out + _n * out_channels * out_width; - - xa_nn_conv1d_std_asym8uxasym8u( - out_batch, - in_batch, - p_kernel, - p_bias, - input_width, - 1, - input_channels, - kernel_width, - out_channels, - x_stride, - x_padding, - out_width, - input_zero_bias, - kernel_zero_bias, - out_multiplier32, - out_shift32, - out_zero_bias, - out_data_format, - p_scratch); - } -} - -void quantized_conv1d_nlc_asym8uxsym8u_asym8u_per_tensor_out( - KernelRuntimeContext& ctx, - const Tensor& input, - const Tensor& weight, - const Tensor& bias, - IntArrayRef stride, - IntArrayRef padding, - __ET_UNUSED IntArrayRef dilation, - __ET_UNUSED int64_t groups, - int64_t in_zero_point, - int64_t weight_zero_point, - double bias_scale, - double output_scale, - int64_t output_zero_point, - __ET_UNUSED int64_t out_multiplier, - __ET_UNUSED int64_t out_shift, - Tensor& out) { - xa_opt_quantized_conv1d_nlc_asym8uxsym8u_asym8u( - ctx, - input, - weight, - bias, - stride, - padding, - in_zero_point, - weight_zero_point, - bias_scale, - output_scale, - output_zero_point, - out); -} - -} // namespace native -} // namespace HiFi -} // namespace impl diff --git a/backends/cadence/hifi/operators/targets.bzl b/backends/cadence/hifi/operators/targets.bzl index 1e6c7f26d42..9753051bf72 100644 --- 
a/backends/cadence/hifi/operators/targets.bzl +++ b/backends/cadence/hifi/operators/targets.bzl @@ -365,8 +365,8 @@ def define_common_targets(): ) runtime.cxx_library( - name = "op_quantized_conv1d_ncl_asym8sxsym8s_asym8s_per_tensor_out", - srcs = ["op_quantized_conv1d_ncl_asym8sxsym8s_asym8s_per_tensor_out.cpp"], + name = "op_quantized_conv1d_ncl", + srcs = ["op_quantized_conv1d_ncl.cpp"], exported_headers = ["operators.h"], platforms = CXX, deps = COMMON_DEPS, @@ -375,28 +375,8 @@ def define_common_targets(): ) runtime.cxx_library( - name = "op_quantized_conv1d_ncl_asym8uxsym8u_asym8u_per_tensor_out", - srcs = ["op_quantized_conv1d_ncl_asym8uxsym8u_asym8u_per_tensor_out.cpp"], - exported_headers = ["operators.h"], - platforms = CXX, - deps = COMMON_DEPS, - visibility = ["PUBLIC"], - compatible_with = ["ovr_config//cpu:xtensa"], - ) - - runtime.cxx_library( - name = "op_quantized_conv1d_nlc_asym8sxsym8s_asym8s_per_tensor_out", - srcs = ["op_quantized_conv1d_nlc_asym8sxsym8s_asym8s_per_tensor_out.cpp"], - exported_headers = ["operators.h"], - platforms = CXX, - deps = COMMON_DEPS, - visibility = ["PUBLIC"], - compatible_with = ["ovr_config//cpu:xtensa"], - ) - - runtime.cxx_library( - name = "op_quantized_conv1d_nlc_asym8uxsym8u_asym8u_per_tensor_out", - srcs = ["op_quantized_conv1d_nlc_asym8uxsym8u_asym8u_per_tensor_out.cpp"], + name = "op_quantized_conv1d_nlc", + srcs = ["op_quantized_conv1d_nlc.cpp"], exported_headers = ["operators.h"], platforms = CXX, deps = COMMON_DEPS,