From 9a0c0afaa5dfd24b7d066e4ee278f1e9d1359cbf Mon Sep 17 00:00:00 2001
From: Yufeng Shi <yufeng.shi@arm.com>
Date: Thu, 5 Feb 2026 09:54:14 +0000
Subject: [PATCH] Arm backend: Add support for aten.erfinv.default

Change-Id: I63613ecd98d1187ef5c4c0509fc1afc88f8ed47f
Signed-off-by: Yufeng Shi <yufeng.shi@arm.com>
---
 backends/arm/_passes/__init__.py              |   1 +
 backends/arm/_passes/arm_pass_manager.py      |   2 +
 backends/arm/_passes/decompose_erfinv_pass.py | 377 ++++++++++++++++++
 backends/arm/_passes/insert_table_ops.py      |   1 +
 .../tosa_profile_supported_op_lists.py        |   2 +
 .../arm/quantizer/quantization_annotator.py   |   1 +
 .../arm/scripts/collect_testname_resources.py |   1 +
 backends/arm/test/ops/test_erfinv.py          | 121 ++++++
 8 files changed, 506 insertions(+)
 create mode 100644 backends/arm/_passes/decompose_erfinv_pass.py
 create mode 100644 backends/arm/test/ops/test_erfinv.py

diff --git a/backends/arm/_passes/__init__.py b/backends/arm/_passes/__init__.py
index f11a91a600c..12efa0822bd 100644
--- a/backends/arm/_passes/__init__.py
+++ b/backends/arm/_passes/__init__.py
@@ -45,6 +45,7 @@
 from .decompose_div_tensor_mode import DecomposeDivTensorModePass  # noqa
 from .decompose_elu_pass import DecomposeEluPass  # noqa
 from .decompose_embedding_pass import DecomposeEmbeddingPass  # noqa  # noqa
+from .decompose_erfinv_pass import DecomposeErfinvPass  # noqa
 from .decompose_expm1_pass import DecomposeExpm1Pass  # noqa
 from .decompose_floor_divide_pass import DecomposeFloorDividePass  # noqa
 from .decompose_gelu_pass import DecomposeGeluPass  # noqa
diff --git a/backends/arm/_passes/arm_pass_manager.py b/backends/arm/_passes/arm_pass_manager.py
index 9cedc6851c8..725bc77a734 100644
--- a/backends/arm/_passes/arm_pass_manager.py
+++ b/backends/arm/_passes/arm_pass_manager.py
@@ -52,6 +52,7 @@
     DecomposeDivTensorModePass,
     DecomposeEluPass,
     DecomposeEmbeddingPass,
+    DecomposeErfinvPass,
     DecomposeExpm1Pass,
     DecomposeFloorDividePass,
     DecomposeGeluPass,
@@ -283,6 +284,7 @@ def _tosa_pipeline(
                 DecomposeAsinhPass(),
                 DecomposeCoshPass(),
                 DecomposeAsinAndAcosPass(),
+                DecomposeErfinvPass(),
                 DecomposeSqrtPass(),
                 DecomposeAtanPass(),
                 DecomposeAtanhPass(),
diff --git a/backends/arm/_passes/decompose_erfinv_pass.py b/backends/arm/_passes/decompose_erfinv_pass.py
new file mode 100644
index 00000000000..7ebbf181a97
--- /dev/null
+++ b/backends/arm/_passes/decompose_erfinv_pass.py
@@ -0,0 +1,377 @@
+# Copyright 2026 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import Set, Type
+
+from executorch.backends.arm._passes import ArmPass
+from executorch.backends.arm._passes.convert_full_like_to_full_pass import (
+    ConvertFullLikeToFullPass,
+)
+from executorch.backends.arm._passes.decompose_sqrt_pass import DecomposeSqrtPass
+from executorch.backends.arm._passes.match_arg_dtype_pass import MatchArgDtypePass
+from executorch.backends.arm._passes.match_arg_ranks_pass import MatchArgRanksPass
+from executorch.backends.arm._passes.replace_scalar_with_tensor_pass import (
+    ReplaceScalarWithTensorByProfilePass,
+)
+from executorch.exir.dialects._ops import ops as exir_ops
+from executorch.exir.pass_base import ExportPass
+
+
+edge_erfinv_ops = (exir_ops.edge.aten.erfinv.default,)
+
+
+def get_erfinv_decomposition(op) -> tuple:
+    if op in edge_erfinv_ops:
+        # Ordered by first use in call_operator below.
+        return (
+            exir_ops.edge.aten.full_like.default,
+            exir_ops.edge.aten.lt.Tensor,
+            exir_ops.edge.aten.where.self,
+            exir_ops.edge.aten.abs.default,
+            exir_ops.edge.aten.gt.Tensor,
+            exir_ops.edge.aten.eq.Tensor,
+            exir_ops.edge.aten.mul.Tensor,
+            exir_ops.edge.aten.mul.Scalar,
+            exir_ops.edge.aten.add.Scalar,
+            exir_ops.edge.aten.add.Tensor,
+            exir_ops.edge.aten.div.Tensor,
+            exir_ops.edge.aten.sub.Tensor,
+            exir_ops.edge.aten.log.default,
+            exir_ops.edge.aten.sqrt.default,
+            exir_ops.edge.aten.lt.Scalar,
+            exir_ops.edge.aten.erf.default,
+            exir_ops.edge.aten.exp.default,
+            exir_ops.edge.aten.gt.Scalar,
+        )
+
+    raise RuntimeError(f"Can't get erfinv decomposition for op {op}")
+
+
+class DecomposeErfinvPass(ArmPass):
+    """Decomposes `aten.erfinv` using the same *initial-guess* approximation as
+    the PyTorch CPU scalar `calc_erfinv`, with a guarded Newton refinement step
+    to improve numerical accuracy (especially for fp16).
+
+    Source:
+      - PyTorch `calc_erfinv` (CPU scalar fallback), Math.h (pinned commit):
+        https://github.com/pytorch/pytorch/blob/f61180da7043f27a1d8c5aec88fd4b910aca987a/aten/src/ATen/native/Math.h#L152
+
+    The approximation is piecewise:
+
+    Domain / special cases
+      - For |x| > 1:  erfinv(x) = NaN
+      - For  x =  1:  erfinv(x) = +inf
+      - For  x = -1:  erfinv(x) = -inf
+
+    Definitions
+      - s = sign(x)  (s = -1 if x < 0 else +1)
+      - a = |x|
+
+    Branch selection
+      - Central branch if a < 0.7
+      - Tail branch otherwise (a >= 0.7)
+
+    ------------------------------------------------------------------------
+    Central branch (a < 0.7)
+    ------------------------------------------------------------------------
+      Let z = x^2. Define:
+
+        P(z) = a0 + a1*z + a2*z^2 + a3*z^3
+        Q(z) = 1  + b0*z + b1*z^2 + b2*z^3 + b3*z^4
+
+      Initial guess:
+
+        y0 = x * P(z) / Q(z)
+
+      Coefficients:
+        a0= 0.886226899   a1=-1.645349621   a2= 0.914624893   a3=-0.140543331
+        b0=-2.118377725   b1= 1.442710462   b2=-0.329097515   b3= 0.012229801
+
+    ------------------------------------------------------------------------
+    Tail branch (a >= 0.7)
+    ------------------------------------------------------------------------
+      Compute:
+
+        u  = 0.5 * (1 - a)
+        t  = sqrt( -log(u) )
+
+      Note: Computing `log(u)` directly often gives better numerical accuracy than
+      forming an equivalent expression using `log1p(-a)` near a -> 1 (where 1-a is
+      tiny), because the transformation through `log1p` can introduce larger relative
+      error in that regime.
+
+      Define:
+
+        R(t) = c0 + c1*t + c2*t^2 + c3*t^3
+        S(t) = 1  + d0*t + d1*t^2
+
+      Initial guess:
+
+        y0 = s * | R(t) / S(t) |
+
+      Coefficients:
+        c0=-1.970840454   c1=-1.624906493   c2= 3.429567803   c3= 1.641345311
+        d0= 3.543889200   d1= 1.637067800
+
+    Output
+      Returns the refined estimate (starting from y0) with the special-case handling
+      for |x|>1 and x=±1 as described above.
+
+    """
+
+    _passes_required_after: Set[Type[ExportPass]] = {
+        DecomposeSqrtPass,
+        ConvertFullLikeToFullPass,
+        MatchArgRanksPass,
+        MatchArgDtypePass,
+        ReplaceScalarWithTensorByProfilePass,
+    }
+
+    def call_operator(self, op, args, kwargs, meta):
+        if op not in edge_erfinv_ops:
+            return super().call_operator(op, args, kwargs, meta, updated=False)
+
+        if self._is_quantized_meta(meta):
+            # If quantized, node should be replaced by table op.
+            return super().call_operator(op, args, kwargs, meta)
+
+        x = args[0]
+
+        (
+            op_full_like,
+            op_lt_t,
+            op_where,
+            op_abs,
+            op_gt_t,
+            op_eq_t,
+            op_mul_t,
+            op_mul_s,
+            op_add_s,
+            op_add_t,
+            op_div_t,
+            op_sub_t,
+            op_log,
+            op_sqrt,
+            op_lt_s,
+            op_erf,
+            op_exp,
+            op_gt_s,
+        ) = get_erfinv_decomposition(op)
+
+        # ---- constants (PyTorch calc_erfinv coefficients) ----
+        CENTRAL_RANGE = 0.7
+
+        # central: x * P(z)/Q(z), z = x^2
+        a0, a1, a2, a3 = 0.886226899, -1.645349621, 0.914624893, -0.140543331
+        b0, b1, b2, b3 = -2.118377725, 1.442710462, -0.329097515, 0.012229801
+
+        # tail: s * |R(t)/S(t)|, t = sqrt(-log((1-|x|)/2))
+        c0, c1, c2, c3 = -1.970840454, -1.624906493, 3.429567803, 1.641345311
+        d0, d1 = 3.543889200, 1.637067800
+
+        # Newton refinement controls (safe defaults for fp16)
+        #   y1 = y0 - (erf(y0) - x) / ((2/sqrt(pi)) * exp(-y0^2))
+        NEWTON_ITERS = 1
+        REFINE_MAX_ABS = 0.95
+        DERIV_EPS = 1e-3
+        CORR_MAX = 0.5
+        TWO_OVER_SQRT_PI = 1.1283791670955126
+
+        # ---- zeros / ones (tensor-shaped) ----
+        zeros = super().call_operator(op_full_like, (x, 0.0), {}, meta, updated=True)
+        ones = super().call_operator(op_full_like, (x, 1.0), {}, meta, updated=True)
+        neg_ones = super().call_operator(
+            op_full_like, (x, -1.0), {}, meta, updated=True
+        )
+
+        # ---- s = sign(x): -1 for x<0 else +1 ----
+        x_lt0 = super().call_operator(op_lt_t, (x, zeros), {}, meta, updated=True)
+        s = super().call_operator(
+            op_where, (x_lt0, neg_ones, ones), {}, meta, updated=True
+        )
+
+        # ---- a = |x| and domain masks ----
+        a = super().call_operator(op_abs, (x,), {}, meta, updated=True)
+        abs_gt1 = super().call_operator(
+            op_gt_t, (a, ones), {}, meta, updated=True
+        )  # |x| > 1
+        abs_eq1 = super().call_operator(
+            op_eq_t, (a, ones), {}, meta, updated=True
+        )  # |x| == 1
+
+        # For internal math, avoid evaluating log(0), sqrt(inf), etc. on |x|==1 or |x|>1 lanes.
+        a_math = super().call_operator(
+            op_where, (abs_gt1, zeros, a), {}, meta, updated=True
+        )
+        a_math = super().call_operator(
+            op_where, (abs_eq1, zeros, a_math), {}, meta, updated=True
+        )
+
+        # ------------------------------------------------------------------
+        # Central initial guess: y0 = x * P(z) / Q(z), z = x^2
+        # ------------------------------------------------------------------
+        z = super().call_operator(op_mul_t, (x, x), {}, meta, updated=True)  # z = x^2
+
+        # P(z) via Horner: (((a3*z + a2)*z + a1)*z + a0)
+        P = super().call_operator(op_mul_s, (z, a3), {}, meta, updated=True)
+        P = super().call_operator(op_add_s, (P, a2), {}, meta, updated=True)
+        P = super().call_operator(op_mul_t, (P, z), {}, meta, updated=True)
+        P = super().call_operator(op_add_s, (P, a1), {}, meta, updated=True)
+        P = super().call_operator(op_mul_t, (P, z), {}, meta, updated=True)
+        P = super().call_operator(op_add_s, (P, a0), {}, meta, updated=True)
+
+        # Q(z) via Horner: ((((b3*z + b2)*z + b1)*z + b0)*z + 1)
+        Q = super().call_operator(op_mul_s, (z, b3), {}, meta, updated=True)
+        Q = super().call_operator(op_add_s, (Q, b2), {}, meta, updated=True)
+        Q = super().call_operator(op_mul_t, (Q, z), {}, meta, updated=True)
+        Q = super().call_operator(op_add_s, (Q, b1), {}, meta, updated=True)
+        Q = super().call_operator(op_mul_t, (Q, z), {}, meta, updated=True)
+        Q = super().call_operator(op_add_s, (Q, b0), {}, meta, updated=True)
+        Q = super().call_operator(op_mul_t, (Q, z), {}, meta, updated=True)
+        Q = super().call_operator(op_add_t, (Q, ones), {}, meta, updated=True)
+
+        xP = super().call_operator(op_mul_t, (x, P), {}, meta, updated=True)
+        y0_central = super().call_operator(op_div_t, (xP, Q), {}, meta, updated=True)
+
+        # ------------------------------------------------------------------
+        # Tail initial guess: y0 = s * |R(t)/S(t)|
+        #   u = 0.5*(1-a), t = sqrt(-log(u))
+        # ------------------------------------------------------------------
+        one_minus_a = super().call_operator(
+            op_sub_t, (ones, a_math), {}, meta, updated=True
+        )  # 1-a
+        u = super().call_operator(
+            op_mul_s, (one_minus_a, 0.5), {}, meta, updated=True
+        )  # u = 0.5*(1-a)
+
+        # Avoid log(0) poisoning intermediates: for a==1 lanes, feed log(1)=0 and fix later.
+        u_safe = super().call_operator(
+            op_where, (abs_eq1, ones, u), {}, meta, updated=True
+        )
+        log_u = super().call_operator(op_log, (u_safe,), {}, meta, updated=True)
+        neg_log_u = super().call_operator(
+            op_mul_t, (log_u, neg_ones), {}, meta, updated=True
+        )
+        t = super().call_operator(
+            op_sqrt, (neg_log_u,), {}, meta, updated=True
+        )  # t = sqrt(-log(u))
+
+        # R(t) via Horner: ((c3*t + c2)*t + c1)*t + c0
+        R = super().call_operator(op_mul_s, (t, c3), {}, meta, updated=True)
+        R = super().call_operator(op_add_s, (R, c2), {}, meta, updated=True)
+        R = super().call_operator(op_mul_t, (R, t), {}, meta, updated=True)
+        R = super().call_operator(op_add_s, (R, c1), {}, meta, updated=True)
+        R = super().call_operator(op_mul_t, (R, t), {}, meta, updated=True)
+        R = super().call_operator(op_add_s, (R, c0), {}, meta, updated=True)
+
+        # S(t) via Horner: (d1*t + d0)*t + 1
+        S = super().call_operator(op_mul_s, (t, d1), {}, meta, updated=True)
+        S = super().call_operator(op_add_s, (S, d0), {}, meta, updated=True)
+        S = super().call_operator(op_mul_t, (S, t), {}, meta, updated=True)
+        S = super().call_operator(op_add_t, (S, ones), {}, meta, updated=True)
+
+        frac = super().call_operator(op_div_t, (R, S), {}, meta, updated=True)
+        frac_abs = super().call_operator(op_abs, (frac,), {}, meta, updated=True)
+        y0_tail = super().call_operator(op_mul_t, (s, frac_abs), {}, meta, updated=True)
+
+        # ---- select central vs tail (use lt to avoid le-lowering quirks) ----
+        in_central = super().call_operator(
+            op_lt_s, (a, CENTRAL_RANGE), {}, meta, updated=True
+        )
+        y0 = super().call_operator(
+            op_where, (in_central, y0_central, y0_tail), {}, meta, updated=True
+        )
+
+        # Ensure y0 doesn’t carry inf/nan before refinement / final where.
+        y0 = super().call_operator(
+            op_where, (abs_gt1, zeros, y0), {}, meta, updated=True
+        )
+        y0 = super().call_operator(
+            op_where, (abs_eq1, zeros, y0), {}, meta, updated=True
+        )
+
+        # ------------------------------------------------------------------
+        # Guarded Newton refinement
+        #
+        # We want to solve erf(y) = x for y.
+        #
+        # Newton's method update:
+        #   y_{k+1} = y_k - f(y_k) / f'(y_k)
+        # where:
+        #   f(y)  = erf(y) - x
+        #   f'(y) = d/dy erf(y) = (2/sqrt(pi)) * exp(-y^2)
+        #
+        # So the refinement step is:
+        #   y_{k+1} = y_k - (erf(y_k) - x) / ((2/sqrt(pi)) * exp(-y_k^2))
+        #
+        # Guards:
+        #   - only apply refinement for |x| < REFINE_MAX_ABS (avoid tail instability)
+        #   - skip the update if f'(y_k) is tiny (DERIV_EPS) to avoid huge steps
+        #   - skip the update if |correction| is too large (CORR_MAX) to avoid overshoot
+        # ------------------------------------------------------------------
+        refine_mask = super().call_operator(
+            op_lt_s, (a, REFINE_MAX_ABS), {}, meta, updated=True
+        )
+
+        y = y0
+        for _ in range(NEWTON_ITERS):
+            erf_y = super().call_operator(op_erf, (y,), {}, meta, updated=True)
+            err = super().call_operator(op_sub_t, (erf_y, x), {}, meta, updated=True)
+
+            y_sq = super().call_operator(op_mul_t, (y, y), {}, meta, updated=True)
+            neg_y_sq = super().call_operator(
+                op_mul_t, (y_sq, neg_ones), {}, meta, updated=True
+            )
+            exp_term = super().call_operator(
+                op_exp, (neg_y_sq,), {}, meta, updated=True
+            )
+
+            deriv = super().call_operator(
+                op_mul_s, (exp_term, TWO_OVER_SQRT_PI), {}, meta, updated=True
+            )
+
+            deriv_tiny = super().call_operator(
+                op_lt_s, (deriv, DERIV_EPS), {}, meta, updated=True
+            )
+            corr = super().call_operator(op_div_t, (err, deriv), {}, meta, updated=True)
+            corr_abs = super().call_operator(op_abs, (corr,), {}, meta, updated=True)
+            corr_huge = super().call_operator(
+                op_gt_s, (corr_abs, CORR_MAX), {}, meta, updated=True
+            )
+
+            y1 = super().call_operator(op_sub_t, (y, corr), {}, meta, updated=True)
+
+            # Apply guards: if tiny deriv or huge correction -> keep y
+            y_safe = super().call_operator(
+                op_where, (deriv_tiny, y, y1), {}, meta, updated=True
+            )
+            y_safe = super().call_operator(
+                op_where, (corr_huge, y, y_safe), {}, meta, updated=True
+            )
+
+            # Only refine where refine_mask is true
+            y = super().call_operator(
+                op_where, (refine_mask, y_safe, y), {}, meta, updated=True
+            )
+
+        y0 = y
+
+        # ---- special cases: NaN for |x|>1, +/-inf for |x|==1 ----
+        nan = super().call_operator(
+            op_div_t, (zeros, zeros), {}, meta, updated=True
+        )  # 0/0
+        pos_inf = super().call_operator(op_div_t, (ones, zeros), {}, meta, updated=True)
+        inf_signed = super().call_operator(
+            op_mul_t, (s, pos_inf), {}, meta, updated=True
+        )
+
+        out = y0
+        out = super().call_operator(
+            op_where, (abs_gt1, nan, out), {}, meta, updated=True
+        )
+        out = super().call_operator(
+            op_where, (abs_eq1, inf_signed, out), {}, meta, updated=True
+        )
+        return out
diff --git a/backends/arm/_passes/insert_table_ops.py b/backends/arm/_passes/insert_table_ops.py
index 732746b9781..035e9c6accc 100644
--- a/backends/arm/_passes/insert_table_ops.py
+++ b/backends/arm/_passes/insert_table_ops.py
@@ -35,6 +35,7 @@ class TableOps:
         exir_ops.edge.aten.erf.default: torch.erf,
         exir_ops.edge.aten.exp.default: torch.exp,
         exir_ops.edge.aten.expm1.default: torch.expm1,
+        exir_ops.edge.aten.erfinv.default: torch.erfinv,
         exir_ops.edge.aten.floor.default: torch.floor,
         exir_ops.edge.aten.log.default: torch.log,
         exir_ops.edge.aten.log1p.default: torch.log1p,
diff --git a/backends/arm/operator_support/tosa_profile_supported_op_lists.py b/backends/arm/operator_support/tosa_profile_supported_op_lists.py
index 2fef371a68f..c7a3bb9a976 100644
--- a/backends/arm/operator_support/tosa_profile_supported_op_lists.py
+++ b/backends/arm/operator_support/tosa_profile_supported_op_lists.py
@@ -51,6 +51,7 @@
     exir_ops.edge.aten.eq.Tensor,
     exir_ops.edge.aten.eq.Scalar,
     exir_ops.edge.aten.erf.default,
+    exir_ops.edge.aten.erfinv.default,
     exir_ops.edge.aten.exp.default,
     exir_ops.edge.aten.expm1.default,
     exir_ops.edge.aten.log.default,
@@ -157,6 +158,7 @@
     exir_ops.edge.aten.eq.Tensor,
     exir_ops.edge.aten.eq.Scalar,
     exir_ops.edge.aten.erf.default,
+    exir_ops.edge.aten.erfinv.default,
     exir_ops.edge.aten.exp.default,
     exir_ops.edge.aten.expm1.default,
     exir_ops.edge.aten.log1p.default,
diff --git a/backends/arm/quantizer/quantization_annotator.py b/backends/arm/quantizer/quantization_annotator.py
index 080b9c503de..37c2b132244 100644
--- a/backends/arm/quantizer/quantization_annotator.py
+++ b/backends/arm/quantizer/quantization_annotator.py
@@ -400,6 +400,7 @@ def _match_pattern(
     torch.ops.aten.abs.default,
     torch.ops.aten.ceil.default,
     torch.ops.aten.erf.default,
+    torch.ops.aten.erfinv.default,
     torch.ops.aten.exp.default,
     torch.ops.aten.expm1.default,
     torch.ops.aten.elu.default,
diff --git a/backends/arm/scripts/collect_testname_resources.py b/backends/arm/scripts/collect_testname_resources.py
index b3dc5765b0a..65b5e4cd4c8 100644
--- a/backends/arm/scripts/collect_testname_resources.py
+++ b/backends/arm/scripts/collect_testname_resources.py
@@ -28,6 +28,7 @@
     "cond.default",
     "eye.default",
     "expm1.default",
+    "erfinv.default",
     "gather.default",
     "vector_norm.default",
     "hardsigmoid.default",
diff --git a/backends/arm/test/ops/test_erfinv.py b/backends/arm/test/ops/test_erfinv.py
new file mode 100644
index 00000000000..204a4c50455
--- /dev/null
+++ b/backends/arm/test/ops/test_erfinv.py
@@ -0,0 +1,121 @@
+# Copyright 2026 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import Tuple
+
+import torch
+
+from executorch.backends.arm.test import common
+from executorch.backends.arm.test.tester.test_pipeline import (
+    EthosU55PipelineINT,
+    EthosU85PipelineINT,
+    TosaPipelineFP,
+    TosaPipelineINT,
+    VgfPipeline,
+)
+
+aten_op = "torch.ops.aten.erfinv.default"
+exir_op = "executorch_exir_dialects_edge__ops_aten_erfinv_default"
+
+input_t1 = Tuple[torch.Tensor]
+
+test_data_suite = {
+    "zeros": torch.zeros(1, 10, 10, 10),
+    "small": torch.randn(100) * 0.01,
+    "mid": torch.rand(10, 10) * 1.8 - 0.9,
+    "near_pos_bound": torch.full((32,), 0.99),
+    "near_neg_bound": torch.full((32,), -0.99),
+    "pos_one": torch.full((32,), 1.0),
+    "neg_one": torch.full((32,), -1.0),
+    "ramp": torch.arange(-0.99, 0.99, 0.02),
+}
+
+
+test_data_nan_outputs = {
+    "pos_two": torch.full((32,), 2.0),
+    "neg_two": torch.full((32,), -2.0),
+}
+
+
+test_data_fp16 = {
+    "rand_fp16": (torch.rand(8, 8, dtype=torch.float16) * 1.8 - 0.9),
+    "ramp_fp16": torch.arange(-0.9, 0.9, 0.1, dtype=torch.float16),
+}
+
+
+class Erfinv(torch.nn.Module):
+    def forward(self, x: torch.Tensor):
+        return torch.erfinv(x)
+
+
+@common.parametrize(
+    "test_data", test_data_suite | test_data_nan_outputs | test_data_fp16
+)
+def test_erfinv_tosa_FP(test_data: torch.Tensor):
+    pipeline = TosaPipelineFP[input_t1](
+        Erfinv(),
+        (test_data,),
+        aten_op,
+        exir_op,
+    )
+    pipeline.run()
+
+
+@common.parametrize("test_data", test_data_suite)
+def test_erfinv_tosa_INT(test_data: torch.Tensor):
+    pipeline = TosaPipelineINT[input_t1](Erfinv(), (test_data,), aten_op, exir_op)
+    pipeline.run()
+
+
+@common.parametrize("test_data", test_data_suite)
+@common.XfailIfNoCorstone300
+def test_erfinv_u55_INT(test_data: torch.Tensor):
+    pipeline = EthosU55PipelineINT[input_t1](
+        Erfinv(),
+        (test_data,),
+        aten_ops=aten_op,
+        exir_ops=exir_op,
+    )
+    pipeline.run()
+
+
+@common.parametrize("test_data", test_data_suite)
+@common.XfailIfNoCorstone320
+def test_erfinv_u85_INT(test_data: torch.Tensor):
+    pipeline = EthosU85PipelineINT[input_t1](
+        Erfinv(),
+        (test_data,),
+        aten_ops=aten_op,
+        exir_ops=exir_op,
+    )
+    pipeline.run()
+
+
+@common.parametrize(
+    "test_data", test_data_suite | test_data_nan_outputs | test_data_fp16
+)
+@common.SkipIfNoModelConverter
+def test_erfinv_vgf_no_quant(test_data: torch.Tensor):
+    pipeline = VgfPipeline[input_t1](
+        Erfinv(),
+        (test_data,),
+        aten_op,
+        exir_op,
+        quantize=False,
+    )
+    pipeline.run()
+
+
+@common.parametrize("test_data", test_data_suite)
+@common.SkipIfNoModelConverter
+def test_erfinv_vgf_quant(test_data: torch.Tensor):
+    pipeline = VgfPipeline[input_t1](
+        Erfinv(),
+        (test_data,),
+        aten_op,
+        exir_op,
+        quantize=True,
+    )
+    pipeline.run()