From 0c5e168aaee1fcc5a196473c4433c6253c6164c1 Mon Sep 17 00:00:00 2001
From: r-near <163825889+r-near@users.noreply.github.com>
Date: Wed, 1 Apr 2026 21:24:32 -0700
Subject: [PATCH 1/7] winch: respect the enable_nan_canonicalization setting

---
 .../canonicalize-nan-scalar.wast              | 153 ++++++++++++++++++
 winch/codegen/src/isa/aarch64/masm.rs         |  34 +++-
 winch/codegen/src/isa/x64/masm.rs             |  23 +++
 winch/codegen/src/masm.rs                     |   3 +
 winch/codegen/src/visitor.rs                  |  47 +++++-
 5 files changed, 251 insertions(+), 9 deletions(-)
 create mode 100644 tests/misc_testsuite/canonicalize-nan-scalar.wast

diff --git a/tests/misc_testsuite/canonicalize-nan-scalar.wast b/tests/misc_testsuite/canonicalize-nan-scalar.wast
new file mode 100644
index 000000000000..8b019cdec803
--- /dev/null
+++ b/tests/misc_testsuite/canonicalize-nan-scalar.wast
@@ -0,0 +1,153 @@
+;;! nan_canonicalization = true
+
+;; Scalar counterpart to simd/canonicalize-nan.wast.
+
+(module
+  (func (export "f32.add") (param f32 f32) (result f32)
+    local.get 0
+    local.get 1
+    f32.add)
+  (func (export "f32.sub") (param f32 f32) (result f32)
+    local.get 0
+    local.get 1
+    f32.sub)
+  (func (export "f32.mul") (param f32 f32) (result f32)
+    local.get 0
+    local.get 1
+    f32.mul)
+  (func (export "f32.div") (param f32 f32) (result f32)
+    local.get 0
+    local.get 1
+    f32.div)
+  (func (export "f32.min") (param f32 f32) (result f32)
+    local.get 0
+    local.get 1
+    f32.min)
+  (func (export "f32.max") (param f32 f32) (result f32)
+    local.get 0
+    local.get 1
+    f32.max)
+  (func (export "f32.sqrt") (param f32) (result f32)
+    local.get 0
+    f32.sqrt)
+  (func (export "f32.ceil") (param f32) (result f32)
+    local.get 0
+    f32.ceil)
+  (func (export "f32.floor") (param f32) (result f32)
+    local.get 0
+    f32.floor)
+  (func (export "f32.trunc") (param f32) (result f32)
+    local.get 0
+    f32.trunc)
+  (func (export "f32.nearest") (param f32) (result f32)
+    local.get 0
+    f32.nearest)
+
+  (func (export "f64.add") (param f64 f64) (result f64)
+    local.get 0
+    local.get 1
+    f64.add)
+  (func (export "f64.sub") (param f64 f64) (result f64)
+    local.get 0
+    local.get 1
+    f64.sub)
+  (func (export "f64.mul") (param f64 f64) (result f64)
+    local.get 0
+    local.get 1
+    f64.mul)
+  (func (export "f64.div") (param f64 f64) (result f64)
+    local.get 0
+    local.get 1
+    f64.div)
+  (func (export "f64.min") (param f64 f64) (result f64)
+    local.get 0
+    local.get 1
+    f64.min)
+  (func (export "f64.max") (param f64 f64) (result f64)
+    local.get 0
+    local.get 1
+    f64.max)
+  (func (export "f64.sqrt") (param f64) (result f64)
+    local.get 0
+    f64.sqrt)
+  (func (export "f64.ceil") (param f64) (result f64)
+    local.get 0
+    f64.ceil)
+  (func (export "f64.floor") (param f64) (result f64)
+    local.get 0
+    f64.floor)
+  (func (export "f64.trunc") (param f64) (result f64)
+    local.get 0
+    f64.trunc)
+  (func (export "f64.nearest") (param f64) (result f64)
+    local.get 0
+    f64.nearest)
+
+  (func (export "reinterpret-and-demote") (param i64) (result i32)
+    local.get 0
+    f64.reinterpret_i64
+    f32.demote_f64
+    i32.reinterpret_f32)
+  (func (export "reinterpret-and-promote") (param i32) (result i64)
+    local.get 0
+    f32.reinterpret_i32
+    f64.promote_f32
+    i64.reinterpret_f64)
+
+  ;; Expose raw bits of 0/0 to verify exact canonical NaN bit patterns.
+  (func (export "f32.div-nan-bits") (result i32)
+    f32.const 0
+    f32.const 0
+    f32.div
+    i32.reinterpret_f32)
+  (func (export "f64.div-nan-bits") (result i64)
+    f64.const 0
+    f64.const 0
+    f64.div
+    i64.reinterpret_f64)
+)
+
+;; Exact bit patterns: canonical f32 NaN = 0x7fc00000, f64 = 0x7ff8000000000000
+(assert_return (invoke "f32.div-nan-bits") (i32.const 0x7fc00000))
+(assert_return (invoke "f64.div-nan-bits") (i64.const 0x7ff8000000000000))
+
+;; NaN-producing operations
+(assert_return (invoke "f32.div" (f32.const 0) (f32.const 0)) (f32.const nan:0x400000))
+(assert_return (invoke "f64.div" (f64.const 0) (f64.const 0)) (f64.const nan:0x8000000000000))
+(assert_return (invoke "f32.sqrt" (f32.const -1)) (f32.const nan:0x400000))
+(assert_return (invoke "f64.sqrt" (f64.const -1)) (f64.const nan:0x8000000000000))
+
+;; NaN propagation through f32 arithmetic (the `nan` input is already the canonical NaN)
+(assert_return (invoke "f32.add" (f32.const nan) (f32.const 1)) (f32.const nan:0x400000))
+(assert_return (invoke "f32.sub" (f32.const nan) (f32.const 1)) (f32.const nan:0x400000))
+(assert_return (invoke "f32.mul" (f32.const nan) (f32.const 1)) (f32.const nan:0x400000))
+(assert_return (invoke "f32.min" (f32.const nan) (f32.const 1)) (f32.const nan:0x400000))
+(assert_return (invoke "f32.max" (f32.const nan) (f32.const 1)) (f32.const nan:0x400000))
+
+;; NaN propagation through f64 arithmetic (the `nan` input is already the canonical NaN)
+(assert_return (invoke "f64.add" (f64.const nan) (f64.const 1)) (f64.const nan:0x8000000000000))
+(assert_return (invoke "f64.sub" (f64.const nan) (f64.const 1)) (f64.const nan:0x8000000000000))
+(assert_return (invoke "f64.mul" (f64.const nan) (f64.const 1)) (f64.const nan:0x8000000000000))
+(assert_return (invoke "f64.min" (f64.const nan) (f64.const 1)) (f64.const nan:0x8000000000000))
+(assert_return (invoke "f64.max" (f64.const nan) (f64.const 1)) (f64.const nan:0x8000000000000))
+
+;; Rounding NaN (f32)
+(assert_return (invoke "f32.ceil" (f32.const nan)) (f32.const nan:0x400000))
+(assert_return (invoke "f32.floor" (f32.const nan)) (f32.const nan:0x400000))
+(assert_return (invoke "f32.trunc" (f32.const nan)) (f32.const nan:0x400000))
+(assert_return (invoke "f32.nearest" (f32.const nan)) (f32.const nan:0x400000))
+
+;; Rounding NaN (f64)
+(assert_return (invoke "f64.ceil" (f64.const nan)) (f64.const nan:0x8000000000000))
+(assert_return (invoke "f64.floor" (f64.const nan)) (f64.const nan:0x8000000000000))
+(assert_return (invoke "f64.trunc" (f64.const nan)) (f64.const nan:0x8000000000000))
+(assert_return (invoke "f64.nearest" (f64.const nan)) (f64.const nan:0x8000000000000))
+
+;; Demote/promote with non-canonical NaN bit patterns
+(assert_return (invoke "reinterpret-and-demote" (i64.const 0xfffefdfccccdcecf)) (i32.const 0x7fc00000))
+(assert_return (invoke "reinterpret-and-promote" (i32.const 0xfffefdfc)) (i64.const 0x7ff8000000000000))
+
+;; Normal values pass through unchanged
+(assert_return (invoke "f32.add" (f32.const 1) (f32.const 2)) (f32.const 3))
+(assert_return (invoke "f64.div" (f64.const 10) (f64.const 2)) (f64.const 5))
+(assert_return (invoke "f32.sqrt" (f32.const 4)) (f32.const 2))
diff --git a/winch/codegen/src/isa/aarch64/masm.rs b/winch/codegen/src/isa/aarch64/masm.rs
index 26948ed15fe7..d73f9ba7f982 100644
--- a/winch/codegen/src/isa/aarch64/masm.rs
+++ b/winch/codegen/src/isa/aarch64/masm.rs
@@ -59,6 +59,8 @@ pub(crate) struct MacroAssembler {
     ptr_size: OperandSize,
     /// Scratch register scope.
     scratch_scope: RegAlloc,
+    /// Shared flags; read to check the `enable_nan_canonicalization` setting.
+    shared_flags: settings::Flags,
 }
 
 impl MacroAssembler {
@@ -71,10 +73,11 @@ impl MacroAssembler {
         Ok(Self {
             sp_max: 0,
             stack_max_use_add: None,
-            asm: Assembler::new(shared_flags, isa_flags),
+            asm: Assembler::new(shared_flags.clone(), isa_flags),
             sp_offset: 0u32,
             ptr_size: ptr_type_from_ptr_size(ptr_size.size()).try_into()?,
             scratch_scope: RegAlloc::from(scratch_gpr_bitset(), scratch_fpr_bitset()),
+            shared_flags,
         })
     }
 
@@ -713,6 +716,35 @@ impl Masm for MacroAssembler {
         Ok(())
     }
 
+    fn canonicalize_nan(&mut self, reg: WritableReg, size: OperandSize) -> Result<()> {
+        if !self.shared_flags.enable_nan_canonicalization() {
+            return Ok(());
+        }
+
+        let done_label = self.asm.buffer_mut().get_label();
+
+        self.asm.fcmp(reg.to_reg(), reg.to_reg(), size);
+        self.asm.jmp_if(Cond::Vc, done_label);
+
+        let canonical_nan: &[u8] = match size {
+            OperandSize::S32 => &0x7FC00000u32.to_le_bytes(),
+            OperandSize::S64 => &0x7FF8000000000000u64.to_le_bytes(),
+            _ => bail!(CodeGenError::unexpected_operand_size()),
+        };
+        let constant = self.asm.add_constant(canonical_nan);
+        self.asm.uload(
+            inst::AMode::Const { addr: constant },
+            reg,
+            size,
+            TRUSTED_FLAGS,
+        );
+
+        self.asm
+            .buffer_mut()
+            .bind_label(done_label, &mut Default::default());
+        Ok(())
+    }
+
     fn and(&mut self, dst: WritableReg, lhs: Reg, rhs: RegImm, size: OperandSize) -> Result<()> {
         match (rhs, lhs, dst) {
             (RegImm::Imm(v), rn, rd) => {
diff --git a/winch/codegen/src/isa/x64/masm.rs b/winch/codegen/src/isa/x64/masm.rs
index 7902d3339b31..37b20173b6e0 100644
--- a/winch/codegen/src/isa/x64/masm.rs
+++ b/winch/codegen/src/isa/x64/masm.rs
@@ -686,6 +686,29 @@ impl Masm for MacroAssembler {
         Ok(())
     }
 
+    fn canonicalize_nan(&mut self, reg: WritableReg, size: OperandSize) -> Result<()> {
+        if !self.shared_flags.enable_nan_canonicalization() {
+            return Ok(());
+        }
+
+        let done_label = self.asm.buffer_mut().get_label();
+
+        self.asm.ucomis(reg.to_reg(), reg.to_reg(), size);
+        self.asm.jmp_if(CC::NP, done_label);
+
+        let canonical_nan: &[u8] = match size {
+            OperandSize::S32 => &0x7FC00000u32.to_le_bytes(),
+            OperandSize::S64 => &0x7FF8000000000000u64.to_le_bytes(),
+            _ => bail!(CodeGenError::unexpected_operand_size()),
+        };
+        self.asm.load_fp_const(reg, canonical_nan, size);
+
+        self.asm
+            .buffer_mut()
+            .bind_label(done_label, &mut Default::default());
+        Ok(())
+    }
+
     fn and(&mut self, dst: WritableReg, lhs: Reg, rhs: RegImm, size: OperandSize) -> Result<()> {
         Self::ensure_two_argument_form(&dst.to_reg(), &lhs)?;
         match (rhs, dst) {
diff --git a/winch/codegen/src/masm.rs b/winch/codegen/src/masm.rs
index 0955bf46197e..64de3927cf09 100644
--- a/winch/codegen/src/masm.rs
+++ b/winch/codegen/src/masm.rs
@@ -1692,6 +1692,9 @@ pub(crate) trait MacroAssembler {
     /// Perform a floating point square root operation.
     fn float_sqrt(&mut self, dst: WritableReg, src: Reg, size: OperandSize) -> Result<()>;
 
+    /// Canonicalize NaN values in `reg` if the setting is enabled.
+    fn canonicalize_nan(&mut self, reg: WritableReg, size: OperandSize) -> Result<()>;
+
     /// Perform logical and operation.
     fn and(&mut self, dst: WritableReg, lhs: Reg, rhs: RegImm, size: OperandSize) -> Result<()>;
 
diff --git a/winch/codegen/src/visitor.rs b/winch/codegen/src/visitor.rs
index 4d643266ed48..9cdb04cf9fcb 100644
--- a/winch/codegen/src/visitor.rs
+++ b/winch/codegen/src/visitor.rs
@@ -594,6 +594,7 @@ where
             OperandSize::S32,
             &mut |masm: &mut M, dst, src, size| {
                 masm.float_add(writable!(dst), dst, src, size)?;
+                masm.canonicalize_nan(writable!(dst), size)?;
                 Ok(TypedReg::f32(dst))
             },
         )
@@ -605,6 +606,7 @@ where
             OperandSize::S64,
             &mut |masm: &mut M, dst, src, size| {
                 masm.float_add(writable!(dst), dst, src, size)?;
+                masm.canonicalize_nan(writable!(dst), size)?;
                 Ok(TypedReg::f64(dst))
             },
         )
@@ -616,6 +618,7 @@ where
             OperandSize::S32,
             &mut |masm: &mut M, dst, src, size| {
                 masm.float_sub(writable!(dst), dst, src, size)?;
+                masm.canonicalize_nan(writable!(dst), size)?;
                 Ok(TypedReg::f32(dst))
             },
         )
@@ -627,6 +630,7 @@ where
             OperandSize::S64,
             &mut |masm: &mut M, dst, src, size| {
                 masm.float_sub(writable!(dst), dst, src, size)?;
+                masm.canonicalize_nan(writable!(dst), size)?;
                 Ok(TypedReg::f64(dst))
             },
         )
@@ -638,6 +642,7 @@ where
             OperandSize::S32,
             &mut |masm: &mut M, dst, src, size| {
                 masm.float_mul(writable!(dst), dst, src, size)?;
+                masm.canonicalize_nan(writable!(dst), size)?;
                 Ok(TypedReg::f32(dst))
             },
         )
@@ -649,6 +654,7 @@ where
             OperandSize::S64,
             &mut |masm: &mut M, dst, src, size| {
                 masm.float_mul(writable!(dst), dst, src, size)?;
+                masm.canonicalize_nan(writable!(dst), size)?;
                 Ok(TypedReg::f64(dst))
             },
         )
@@ -660,6 +666,7 @@ where
             OperandSize::S32,
             &mut |masm: &mut M, dst, src, size| {
                 masm.float_div(writable!(dst), dst, src, size)?;
+                masm.canonicalize_nan(writable!(dst), size)?;
                 Ok(TypedReg::f32(dst))
             },
         )
@@ -671,6 +678,7 @@ where
             OperandSize::S64,
             &mut |masm: &mut M, dst, src, size| {
                 masm.float_div(writable!(dst), dst, src, size)?;
+                masm.canonicalize_nan(writable!(dst), size)?;
                 Ok(TypedReg::f64(dst))
             },
         )
@@ -682,6 +690,7 @@ where
             OperandSize::S32,
             &mut |masm: &mut M, dst, src, size| {
                 masm.float_min(writable!(dst), dst, src, size)?;
+                masm.canonicalize_nan(writable!(dst), size)?;
                 Ok(TypedReg::f32(dst))
             },
         )
@@ -693,6 +702,7 @@ where
             OperandSize::S64,
             &mut |masm: &mut M, dst, src, size| {
                 masm.float_min(writable!(dst), dst, src, size)?;
+                masm.canonicalize_nan(writable!(dst), size)?;
                 Ok(TypedReg::f64(dst))
             },
         )
@@ -704,6 +714,7 @@ where
             OperandSize::S32,
             &mut |masm: &mut M, dst, src, size| {
                 masm.float_max(writable!(dst), dst, src, size)?;
+                masm.canonicalize_nan(writable!(dst), size)?;
                 Ok(TypedReg::f32(dst))
             },
         )
@@ -715,6 +726,7 @@ where
             OperandSize::S64,
             &mut |masm: &mut M, dst, src, size| {
                 masm.float_max(writable!(dst), dst, src, size)?;
+                masm.canonicalize_nan(writable!(dst), size)?;
                 Ok(TypedReg::f64(dst))
             },
         )
@@ -780,7 +792,8 @@ where
                 let builtin = env.builtins.floor_f32::<M::ABI>()?;
                 FnCall::emit::<M>(env, masm, cx, Callee::Builtin(builtin))
             },
-        )
+        )?;
+        self.canonicalize_nan_for_round(OperandSize::S32)
     }
 
     fn visit_f64_floor(&mut self) -> Self::Output {
@@ -793,7 +806,8 @@ where
                 let builtin = env.builtins.floor_f64::<M::ABI>()?;
                 FnCall::emit::<M>(env, masm, cx, Callee::Builtin(builtin))
             },
-        )
+        )?;
+        self.canonicalize_nan_for_round(OperandSize::S64)
     }
 
     fn visit_f32_ceil(&mut self) -> Self::Output {
@@ -806,7 +820,8 @@ where
                 let builtin = env.builtins.ceil_f32::<M::ABI>()?;
                 FnCall::emit::<M>(env, masm, cx, Callee::Builtin(builtin))
             },
-        )
+        )?;
+        self.canonicalize_nan_for_round(OperandSize::S32)
     }
 
     fn visit_f64_ceil(&mut self) -> Self::Output {
@@ -819,7 +834,8 @@ where
                 let builtin = env.builtins.ceil_f64::<M::ABI>()?;
                 FnCall::emit::<M>(env, masm, cx, Callee::Builtin(builtin))
             },
-        )
+        )?;
+        self.canonicalize_nan_for_round(OperandSize::S64)
     }
 
     fn visit_f32_nearest(&mut self) -> Self::Output {
@@ -832,7 +848,8 @@ where
                 let builtin = env.builtins.nearest_f32::<M::ABI>()?;
                 FnCall::emit::<M>(env, masm, cx, Callee::Builtin(builtin))
             },
-        )
+        )?;
+        self.canonicalize_nan_for_round(OperandSize::S32)
     }
 
     fn visit_f64_nearest(&mut self) -> Self::Output {
@@ -845,7 +862,8 @@ where
                 let builtin = env.builtins.nearest_f64::<M::ABI>()?;
                 FnCall::emit::<M>(env, masm, cx, Callee::Builtin(builtin))
             },
-        )
+        )?;
+        self.canonicalize_nan_for_round(OperandSize::S64)
     }
 
     fn visit_f32_trunc(&mut self) -> Self::Output {
@@ -858,7 +876,8 @@ where
                 let builtin = env.builtins.trunc_f32::<M::ABI>()?;
                 FnCall::emit::<M>(env, masm, cx, Callee::Builtin(builtin))
             },
-        )
+        )?;
+        self.canonicalize_nan_for_round(OperandSize::S32)
     }
 
     fn visit_f64_trunc(&mut self) -> Self::Output {
@@ -871,12 +890,14 @@ where
                 let builtin = env.builtins.trunc_f64::<M::ABI>()?;
                 FnCall::emit::<M>(env, masm, cx, Callee::Builtin(builtin))
             },
-        )
+        )?;
+        self.canonicalize_nan_for_round(OperandSize::S64)
     }
 
     fn visit_f32_sqrt(&mut self) -> Self::Output {
         self.context.unop(self.masm, |masm, reg| {
             masm.float_sqrt(writable!(reg), reg, OperandSize::S32)?;
+            masm.canonicalize_nan(writable!(reg), OperandSize::S32)?;
             Ok(TypedReg::f32(reg))
         })
     }
@@ -884,6 +905,7 @@ where
     fn visit_f64_sqrt(&mut self) -> Self::Output {
         self.context.unop(self.masm, |masm, reg| {
             masm.float_sqrt(writable!(reg), reg, OperandSize::S64)?;
+            masm.canonicalize_nan(writable!(reg), OperandSize::S64)?;
             Ok(TypedReg::f64(reg))
         })
     }
@@ -1097,6 +1119,7 @@ where
     fn visit_f32_demote_f64(&mut self) -> Self::Output {
         self.context.unop(self.masm, |masm, reg| {
             masm.demote(writable!(reg), reg)?;
+            masm.canonicalize_nan(writable!(reg), OperandSize::S32)?;
             Ok(TypedReg::f32(reg))
         })
     }
@@ -1104,6 +1127,7 @@ where
     fn visit_f64_promote_f32(&mut self) -> Self::Output {
         self.context.unop(self.masm, |masm, reg| {
             masm.promote(writable!(reg), reg)?;
+            masm.canonicalize_nan(writable!(reg), OperandSize::S64)?;
             Ok(TypedReg::f64(reg))
         })
     }
@@ -4600,6 +4624,13 @@ impl<'a, 'translation, 'data, M> CodeGen<'a, 'translation, 'data, M, Emission>
 where
     M: MacroAssembler,
 {
+    fn canonicalize_nan_for_round(&mut self, size: OperandSize) -> Result<()> {
+        let result = self.context.pop_to_reg(self.masm, None)?;
+        self.masm.canonicalize_nan(writable!(result.into()), size)?;
+        self.context.stack.push(result.into());
+        Ok(())
+    }
+
     fn cmp_i32s(&mut self, kind: IntCmpKind) -> Result<()> {
         self.context.i32_binop(self.masm, |masm, dst, src, size| {
             masm.cmp_with_set(writable!(dst), src, kind, size)?;

From c476a6af0adbea68e1f822e75faec039ae7b4ca7 Mon Sep 17 00:00:00 2001
From: r-near <163825889+r-near@users.noreply.github.com>
Date: Thu, 2 Apr 2026 09:56:16 -0700
Subject: [PATCH 2/7] add disas tests for NaN canonicalization

---
 tests/disas/winch/x64/f32_add/nan_canon.wat | 40 +++++++++++++++++++
 tests/disas/winch/x64/f64_div/nan_canon.wat | 43 +++++++++++++++++++++
 2 files changed, 83 insertions(+)
 create mode 100644 tests/disas/winch/x64/f32_add/nan_canon.wat
 create mode 100644 tests/disas/winch/x64/f64_div/nan_canon.wat

diff --git a/tests/disas/winch/x64/f32_add/nan_canon.wat b/tests/disas/winch/x64/f32_add/nan_canon.wat
new file mode 100644
index 000000000000..9aa0ed26395f
--- /dev/null
+++ b/tests/disas/winch/x64/f32_add/nan_canon.wat
@@ -0,0 +1,40 @@
+;;! target = "x86_64"
+;;! test = "winch"
+;;! flags = "-Wnan-canonicalization"
+
+(module
+    (func (param f32 f32) (result f32)
+        local.get 0
+        local.get 1
+        f32.add
+    )
+)
+;; wasm[0]::function[0]:
+;;       pushq   %rbp
+;;       movq    %rsp, %rbp
+;;       movq    8(%rdi), %r11
+;;       movq    0x18(%r11), %r11
+;;       addq    $0x20, %r11
+;;       cmpq    %rsp, %r11
+;;       ja      0x69
+;;   1c: movq    %rdi, %r14
+;;       subq    $0x20, %rsp
+;;       movq    %rdi, 0x18(%rsp)
+;;       movq    %rsi, 0x10(%rsp)
+;;       movss   %xmm0, 0xc(%rsp)
+;;       movss   %xmm1, 8(%rsp)
+;;       movss   8(%rsp), %xmm0
+;;       movss   0xc(%rsp), %xmm1
+;;       addss   %xmm0, %xmm1
+;;       ucomiss %xmm1, %xmm1
+;;       jnp     0x5d
+;;   55: movss   0x13(%rip), %xmm1
+;;       movaps  %xmm1, %xmm0
+;;       addq    $0x20, %rsp
+;;       popq    %rbp
+;;       retq
+;;   69: ud2
+;;   6b: addb    %al, (%rax)
+;;   6d: addb    %al, (%rax)
+;;   6f: addb    %al, (%rax)
+;;   71: addb    %al, %al
diff --git a/tests/disas/winch/x64/f64_div/nan_canon.wat b/tests/disas/winch/x64/f64_div/nan_canon.wat
new file mode 100644
index 000000000000..cf0285da39fd
--- /dev/null
+++ b/tests/disas/winch/x64/f64_div/nan_canon.wat
@@ -0,0 +1,43 @@
+;;! target = "x86_64"
+;;! test = "winch"
+;;! flags = "-Wnan-canonicalization"
+
+(module
+    (func (param f64 f64) (result f64)
+        local.get 0
+        local.get 1
+        f64.div
+    )
+)
+;; wasm[0]::function[0]:
+;;       pushq   %rbp
+;;       movq    %rsp, %rbp
+;;       movq    8(%rdi), %r11
+;;       movq    0x18(%r11), %r11
+;;       addq    $0x20, %r11
+;;       cmpq    %rsp, %r11
+;;       ja      0x68
+;;   1c: movq    %rdi, %r14
+;;       subq    $0x20, %rsp
+;;       movq    %rdi, 0x18(%rsp)
+;;       movq    %rsi, 0x10(%rsp)
+;;       movsd   %xmm0, 8(%rsp)
+;;       movsd   %xmm1, (%rsp)
+;;       movsd   (%rsp), %xmm0
+;;       movsd   8(%rsp), %xmm1
+;;       divsd   %xmm0, %xmm1
+;;       ucomisd %xmm1, %xmm1
+;;       jnp     0x5c
+;;   54: movsd   0x14(%rip), %xmm1
+;;       movaps  %xmm1, %xmm0
+;;       addq    $0x20, %rsp
+;;       popq    %rbp
+;;       retq
+;;   68: ud2
+;;   6a: addb    %al, (%rax)
+;;   6c: addb    %al, (%rax)
+;;   6e: addb    %al, (%rax)
+;;   70: addb    %al, (%rax)
+;;   72: addb    %al, (%rax)
+;;   74: addb    %al, (%rax)
+;;   76: clc

From 7cdd58eadfac53fea387494408869caf34e6b042 Mon Sep 17 00:00:00 2001
From: r-near <163825889+r-near@users.noreply.github.com>
Date: Mon, 6 Apr 2026 11:05:09 -0700
Subject: [PATCH 3/7] rename canonicalize_nan to maybe_canonicalize_nan

---
 winch/codegen/src/isa/aarch64/masm.rs |  2 +-
 winch/codegen/src/isa/x64/masm.rs     |  2 +-
 winch/codegen/src/masm.rs             |  2 +-
 winch/codegen/src/visitor.rs          | 34 +++++++++++++--------------
 4 files changed, 20 insertions(+), 20 deletions(-)

diff --git a/winch/codegen/src/isa/aarch64/masm.rs b/winch/codegen/src/isa/aarch64/masm.rs
index d73f9ba7f982..b62498a81de6 100644
--- a/winch/codegen/src/isa/aarch64/masm.rs
+++ b/winch/codegen/src/isa/aarch64/masm.rs
@@ -716,7 +716,7 @@ impl Masm for MacroAssembler {
         Ok(())
     }
 
-    fn canonicalize_nan(&mut self, reg: WritableReg, size: OperandSize) -> Result<()> {
+    fn maybe_canonicalize_nan(&mut self, reg: WritableReg, size: OperandSize) -> Result<()> {
         if !self.shared_flags.enable_nan_canonicalization() {
             return Ok(());
         }
diff --git a/winch/codegen/src/isa/x64/masm.rs b/winch/codegen/src/isa/x64/masm.rs
index 37b20173b6e0..1566e65f264d 100644
--- a/winch/codegen/src/isa/x64/masm.rs
+++ b/winch/codegen/src/isa/x64/masm.rs
@@ -686,7 +686,7 @@ impl Masm for MacroAssembler {
         Ok(())
     }
 
-    fn canonicalize_nan(&mut self, reg: WritableReg, size: OperandSize) -> Result<()> {
+    fn maybe_canonicalize_nan(&mut self, reg: WritableReg, size: OperandSize) -> Result<()> {
         if !self.shared_flags.enable_nan_canonicalization() {
             return Ok(());
         }
diff --git a/winch/codegen/src/masm.rs b/winch/codegen/src/masm.rs
index 64de3927cf09..1992525bf926 100644
--- a/winch/codegen/src/masm.rs
+++ b/winch/codegen/src/masm.rs
@@ -1693,7 +1693,7 @@ pub(crate) trait MacroAssembler {
     fn float_sqrt(&mut self, dst: WritableReg, src: Reg, size: OperandSize) -> Result<()>;
 
     /// Canonicalize NaN values in `reg` if the setting is enabled.
-    fn canonicalize_nan(&mut self, reg: WritableReg, size: OperandSize) -> Result<()>;
+    fn maybe_canonicalize_nan(&mut self, reg: WritableReg, size: OperandSize) -> Result<()>;
 
     /// Perform logical and operation.
     fn and(&mut self, dst: WritableReg, lhs: Reg, rhs: RegImm, size: OperandSize) -> Result<()>;
diff --git a/winch/codegen/src/visitor.rs b/winch/codegen/src/visitor.rs
index 9cdb04cf9fcb..0d085aea4d1a 100644
--- a/winch/codegen/src/visitor.rs
+++ b/winch/codegen/src/visitor.rs
@@ -594,7 +594,7 @@ where
             OperandSize::S32,
             &mut |masm: &mut M, dst, src, size| {
                 masm.float_add(writable!(dst), dst, src, size)?;
-                masm.canonicalize_nan(writable!(dst), size)?;
+                masm.maybe_canonicalize_nan(writable!(dst), size)?;
                 Ok(TypedReg::f32(dst))
             },
         )
@@ -606,7 +606,7 @@ where
             OperandSize::S64,
             &mut |masm: &mut M, dst, src, size| {
                 masm.float_add(writable!(dst), dst, src, size)?;
-                masm.canonicalize_nan(writable!(dst), size)?;
+                masm.maybe_canonicalize_nan(writable!(dst), size)?;
                 Ok(TypedReg::f64(dst))
             },
         )
@@ -618,7 +618,7 @@ where
             OperandSize::S32,
             &mut |masm: &mut M, dst, src, size| {
                 masm.float_sub(writable!(dst), dst, src, size)?;
-                masm.canonicalize_nan(writable!(dst), size)?;
+                masm.maybe_canonicalize_nan(writable!(dst), size)?;
                 Ok(TypedReg::f32(dst))
             },
         )
@@ -630,7 +630,7 @@ where
             OperandSize::S64,
             &mut |masm: &mut M, dst, src, size| {
                 masm.float_sub(writable!(dst), dst, src, size)?;
-                masm.canonicalize_nan(writable!(dst), size)?;
+                masm.maybe_canonicalize_nan(writable!(dst), size)?;
                 Ok(TypedReg::f64(dst))
             },
         )
@@ -642,7 +642,7 @@ where
             OperandSize::S32,
             &mut |masm: &mut M, dst, src, size| {
                 masm.float_mul(writable!(dst), dst, src, size)?;
-                masm.canonicalize_nan(writable!(dst), size)?;
+                masm.maybe_canonicalize_nan(writable!(dst), size)?;
                 Ok(TypedReg::f32(dst))
             },
         )
@@ -654,7 +654,7 @@ where
             OperandSize::S64,
             &mut |masm: &mut M, dst, src, size| {
                 masm.float_mul(writable!(dst), dst, src, size)?;
-                masm.canonicalize_nan(writable!(dst), size)?;
+                masm.maybe_canonicalize_nan(writable!(dst), size)?;
                 Ok(TypedReg::f64(dst))
             },
         )
@@ -666,7 +666,7 @@ where
             OperandSize::S32,
             &mut |masm: &mut M, dst, src, size| {
                 masm.float_div(writable!(dst), dst, src, size)?;
-                masm.canonicalize_nan(writable!(dst), size)?;
+                masm.maybe_canonicalize_nan(writable!(dst), size)?;
                 Ok(TypedReg::f32(dst))
             },
         )
@@ -678,7 +678,7 @@ where
             OperandSize::S64,
             &mut |masm: &mut M, dst, src, size| {
                 masm.float_div(writable!(dst), dst, src, size)?;
-                masm.canonicalize_nan(writable!(dst), size)?;
+                masm.maybe_canonicalize_nan(writable!(dst), size)?;
                 Ok(TypedReg::f64(dst))
             },
         )
@@ -690,7 +690,7 @@ where
             OperandSize::S32,
             &mut |masm: &mut M, dst, src, size| {
                 masm.float_min(writable!(dst), dst, src, size)?;
-                masm.canonicalize_nan(writable!(dst), size)?;
+                masm.maybe_canonicalize_nan(writable!(dst), size)?;
                 Ok(TypedReg::f32(dst))
             },
         )
@@ -702,7 +702,7 @@ where
             OperandSize::S64,
             &mut |masm: &mut M, dst, src, size| {
                 masm.float_min(writable!(dst), dst, src, size)?;
-                masm.canonicalize_nan(writable!(dst), size)?;
+                masm.maybe_canonicalize_nan(writable!(dst), size)?;
                 Ok(TypedReg::f64(dst))
             },
         )
@@ -714,7 +714,7 @@ where
             OperandSize::S32,
             &mut |masm: &mut M, dst, src, size| {
                 masm.float_max(writable!(dst), dst, src, size)?;
-                masm.canonicalize_nan(writable!(dst), size)?;
+                masm.maybe_canonicalize_nan(writable!(dst), size)?;
                 Ok(TypedReg::f32(dst))
             },
         )
@@ -726,7 +726,7 @@ where
             OperandSize::S64,
             &mut |masm: &mut M, dst, src, size| {
                 masm.float_max(writable!(dst), dst, src, size)?;
-                masm.canonicalize_nan(writable!(dst), size)?;
+                masm.maybe_canonicalize_nan(writable!(dst), size)?;
                 Ok(TypedReg::f64(dst))
             },
         )
@@ -897,7 +897,7 @@ where
     fn visit_f32_sqrt(&mut self) -> Self::Output {
         self.context.unop(self.masm, |masm, reg| {
             masm.float_sqrt(writable!(reg), reg, OperandSize::S32)?;
-            masm.canonicalize_nan(writable!(reg), OperandSize::S32)?;
+            masm.maybe_canonicalize_nan(writable!(reg), OperandSize::S32)?;
             Ok(TypedReg::f32(reg))
         })
     }
@@ -905,7 +905,7 @@ where
     fn visit_f64_sqrt(&mut self) -> Self::Output {
         self.context.unop(self.masm, |masm, reg| {
             masm.float_sqrt(writable!(reg), reg, OperandSize::S64)?;
-            masm.canonicalize_nan(writable!(reg), OperandSize::S64)?;
+            masm.maybe_canonicalize_nan(writable!(reg), OperandSize::S64)?;
             Ok(TypedReg::f64(reg))
         })
     }
@@ -1119,7 +1119,7 @@ where
     fn visit_f32_demote_f64(&mut self) -> Self::Output {
         self.context.unop(self.masm, |masm, reg| {
             masm.demote(writable!(reg), reg)?;
-            masm.canonicalize_nan(writable!(reg), OperandSize::S32)?;
+            masm.maybe_canonicalize_nan(writable!(reg), OperandSize::S32)?;
             Ok(TypedReg::f32(reg))
         })
     }
@@ -1127,7 +1127,7 @@ where
     fn visit_f64_promote_f32(&mut self) -> Self::Output {
         self.context.unop(self.masm, |masm, reg| {
             masm.promote(writable!(reg), reg)?;
-            masm.canonicalize_nan(writable!(reg), OperandSize::S64)?;
+            masm.maybe_canonicalize_nan(writable!(reg), OperandSize::S64)?;
             Ok(TypedReg::f64(reg))
         })
     }
@@ -4626,7 +4626,7 @@ where
 {
     fn canonicalize_nan_for_round(&mut self, size: OperandSize) -> Result<()> {
         let result = self.context.pop_to_reg(self.masm, None)?;
-        self.masm.canonicalize_nan(writable!(result.into()), size)?;
+        self.masm.maybe_canonicalize_nan(writable!(result.into()), size)?;
         self.context.stack.push(result.into());
         Ok(())
     }

From 75cecea996194437e58c8f3b3b14102bd84b308e Mon Sep 17 00:00:00 2001
From: r-near <163825889+r-near@users.noreply.github.com>
Date: Mon, 6 Apr 2026 11:05:46 -0700
Subject: [PATCH 4/7] extract canonical NaN constants to shared masm module

---
 winch/codegen/src/isa/aarch64/masm.rs | 6 +++---
 winch/codegen/src/isa/x64/masm.rs     | 6 +++---
 winch/codegen/src/masm.rs             | 3 +++
 3 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/winch/codegen/src/isa/aarch64/masm.rs b/winch/codegen/src/isa/aarch64/masm.rs
index b62498a81de6..6911a2559490 100644
--- a/winch/codegen/src/isa/aarch64/masm.rs
+++ b/winch/codegen/src/isa/aarch64/masm.rs
@@ -726,9 +726,9 @@ impl Masm for MacroAssembler {
         self.asm.fcmp(reg.to_reg(), reg.to_reg(), size);
         self.asm.jmp_if(Cond::Vc, done_label);
 
-        let canonical_nan: &[u8] = match size {
-            OperandSize::S32 => &0x7FC00000u32.to_le_bytes(),
-            OperandSize::S64 => &0x7FF8000000000000u64.to_le_bytes(),
+        let canonical_nan = match size {
+            OperandSize::S32 => crate::masm::CANONICAL_NAN_F32,
+            OperandSize::S64 => crate::masm::CANONICAL_NAN_F64,
             _ => bail!(CodeGenError::unexpected_operand_size()),
         };
         let constant = self.asm.add_constant(canonical_nan);
diff --git a/winch/codegen/src/isa/x64/masm.rs b/winch/codegen/src/isa/x64/masm.rs
index 1566e65f264d..c8b703544ff8 100644
--- a/winch/codegen/src/isa/x64/masm.rs
+++ b/winch/codegen/src/isa/x64/masm.rs
@@ -696,9 +696,9 @@ impl Masm for MacroAssembler {
         self.asm.ucomis(reg.to_reg(), reg.to_reg(), size);
         self.asm.jmp_if(CC::NP, done_label);
 
-        let canonical_nan: &[u8] = match size {
-            OperandSize::S32 => &0x7FC00000u32.to_le_bytes(),
-            OperandSize::S64 => &0x7FF8000000000000u64.to_le_bytes(),
+        let canonical_nan = match size {
+            OperandSize::S32 => crate::masm::CANONICAL_NAN_F32,
+            OperandSize::S64 => crate::masm::CANONICAL_NAN_F64,
             _ => bail!(CodeGenError::unexpected_operand_size()),
         };
         self.asm.load_fp_const(reg, canonical_nan, size);
diff --git a/winch/codegen/src/masm.rs b/winch/codegen/src/masm.rs
index 1992525bf926..bb7305799a2b 100644
--- a/winch/codegen/src/masm.rs
+++ b/winch/codegen/src/masm.rs
@@ -13,6 +13,9 @@ use cranelift_codegen::{
 use std::{fmt::Debug, ops::Range};
 use wasmtime_environ::{PtrSize, WasmHeapType, WasmRefType, WasmValType};
 
+pub(crate) const CANONICAL_NAN_F32: &[u8] = &0x7FC00000u32.to_le_bytes();
+pub(crate) const CANONICAL_NAN_F64: &[u8] = &0x7FF8000000000000u64.to_le_bytes();
+
 pub(crate) use cranelift_codegen::ir::TrapCode;
 
 #[derive(Eq, PartialEq)]

From 172280745fc784d0f59c1bc7a5c06b973fd2fe12 Mon Sep 17 00:00:00 2001
From: r-near <163825889+r-near@users.noreply.github.com>
Date: Mon, 6 Apr 2026 11:06:26 -0700
Subject: [PATCH 5/7] remove canonicalize_nan_for_round, inline at call sites

---
 winch/codegen/src/visitor.rs | 47 ++++++++++++++++++++++++------------
 1 file changed, 32 insertions(+), 15 deletions(-)

diff --git a/winch/codegen/src/visitor.rs b/winch/codegen/src/visitor.rs
index 0d085aea4d1a..de7b7b1b39c7 100644
--- a/winch/codegen/src/visitor.rs
+++ b/winch/codegen/src/visitor.rs
@@ -793,7 +793,10 @@ where
                 FnCall::emit::<M>(env, masm, cx, Callee::Builtin(builtin))
             },
         )?;
-        self.canonicalize_nan_for_round(OperandSize::S32)
+        let result = self.context.pop_to_reg(self.masm, None)?;
+        self.masm.maybe_canonicalize_nan(writable!(result.into()), OperandSize::S32)?;
+        self.context.stack.push(result.into());
+        Ok(())
     }
 
     fn visit_f64_floor(&mut self) -> Self::Output {
@@ -807,7 +810,10 @@ where
                 FnCall::emit::<M>(env, masm, cx, Callee::Builtin(builtin))
             },
         )?;
-        self.canonicalize_nan_for_round(OperandSize::S64)
+        let result = self.context.pop_to_reg(self.masm, None)?;
+        self.masm.maybe_canonicalize_nan(writable!(result.into()), OperandSize::S64)?;
+        self.context.stack.push(result.into());
+        Ok(())
     }
 
     fn visit_f32_ceil(&mut self) -> Self::Output {
@@ -821,7 +827,10 @@ where
                 FnCall::emit::<M>(env, masm, cx, Callee::Builtin(builtin))
             },
         )?;
-        self.canonicalize_nan_for_round(OperandSize::S32)
+        let result = self.context.pop_to_reg(self.masm, None)?;
+        self.masm.maybe_canonicalize_nan(writable!(result.into()), OperandSize::S32)?;
+        self.context.stack.push(result.into());
+        Ok(())
     }
 
     fn visit_f64_ceil(&mut self) -> Self::Output {
@@ -835,7 +844,10 @@ where
                 FnCall::emit::<M>(env, masm, cx, Callee::Builtin(builtin))
             },
         )?;
-        self.canonicalize_nan_for_round(OperandSize::S64)
+        let result = self.context.pop_to_reg(self.masm, None)?;
+        self.masm.maybe_canonicalize_nan(writable!(result.into()), OperandSize::S64)?;
+        self.context.stack.push(result.into());
+        Ok(())
     }
 
     fn visit_f32_nearest(&mut self) -> Self::Output {
@@ -849,7 +861,10 @@ where
                 FnCall::emit::<M>(env, masm, cx, Callee::Builtin(builtin))
             },
         )?;
-        self.canonicalize_nan_for_round(OperandSize::S32)
+        let result = self.context.pop_to_reg(self.masm, None)?;
+        self.masm.maybe_canonicalize_nan(writable!(result.into()), OperandSize::S32)?;
+        self.context.stack.push(result.into());
+        Ok(())
     }
 
     fn visit_f64_nearest(&mut self) -> Self::Output {
@@ -863,7 +878,10 @@ where
                 FnCall::emit::<M>(env, masm, cx, Callee::Builtin(builtin))
             },
         )?;
-        self.canonicalize_nan_for_round(OperandSize::S64)
+        let result = self.context.pop_to_reg(self.masm, None)?;
+        self.masm.maybe_canonicalize_nan(writable!(result.into()), OperandSize::S64)?;
+        self.context.stack.push(result.into());
+        Ok(())
     }
 
     fn visit_f32_trunc(&mut self) -> Self::Output {
@@ -877,7 +895,10 @@ where
                 FnCall::emit::<M>(env, masm, cx, Callee::Builtin(builtin))
             },
         )?;
-        self.canonicalize_nan_for_round(OperandSize::S32)
+        let result = self.context.pop_to_reg(self.masm, None)?;
+        self.masm.maybe_canonicalize_nan(writable!(result.into()), OperandSize::S32)?;
+        self.context.stack.push(result.into());
+        Ok(())
     }
 
     fn visit_f64_trunc(&mut self) -> Self::Output {
@@ -891,7 +912,10 @@ where
                 FnCall::emit::<M>(env, masm, cx, Callee::Builtin(builtin))
             },
         )?;
-        self.canonicalize_nan_for_round(OperandSize::S64)
+        let result = self.context.pop_to_reg(self.masm, None)?;
+        self.masm.maybe_canonicalize_nan(writable!(result.into()), OperandSize::S64)?;
+        self.context.stack.push(result.into());
+        Ok(())
     }
 
     fn visit_f32_sqrt(&mut self) -> Self::Output {
@@ -4624,13 +4648,6 @@ impl<'a, 'translation, 'data, M> CodeGen<'a, 'translation, 'data, M, Emission>
 where
     M: MacroAssembler,
 {
-    fn canonicalize_nan_for_round(&mut self, size: OperandSize) -> Result<()> {
-        let result = self.context.pop_to_reg(self.masm, None)?;
-        self.masm.maybe_canonicalize_nan(writable!(result.into()), size)?;
-        self.context.stack.push(result.into());
-        Ok(())
-    }
-
     fn cmp_i32s(&mut self, kind: IntCmpKind) -> Result<()> {
         self.context.i32_binop(self.masm, |masm, dst, src, size| {
             masm.cmp_with_set(writable!(dst), src, kind, size)?;

From cdef987aabb1682748559e747f6432b95f900768 Mon Sep 17 00:00:00 2001
From: r-near <163825889+r-near@users.noreply.github.com>
Date: Mon, 6 Apr 2026 11:15:19 -0700
Subject: [PATCH 6/7] implement SIMD NaN canonicalization for x64

---
 crates/test-util/src/wast.rs                  | 10 -----
 tests/disas/winch/x64/f32x4_add/nan_canon.wat | 42 ++++++++++++++++++
 winch/codegen/src/isa/aarch64/masm.rs         |  8 ++++
 winch/codegen/src/isa/x64/masm.rs             | 39 ++++++++++++++++
 winch/codegen/src/masm.rs                     | 27 ++++++++++++
 winch/codegen/src/visitor.rs                  | 44 +++++++++++++++++--
 6 files changed, 156 insertions(+), 14 deletions(-)
 create mode 100644 tests/disas/winch/x64/f32x4_add/nan_canon.wat

diff --git a/crates/test-util/src/wast.rs b/crates/test-util/src/wast.rs
index 7047c00e0a8f..259eda8155a6 100644
--- a/crates/test-util/src/wast.rs
+++ b/crates/test-util/src/wast.rs
@@ -622,16 +622,6 @@ impl WastTest {
 
             #[cfg(target_arch = "x86_64")]
             {
-                let unsupported = [
-                    // externref/reference-types related
-                    // simd-related failures
-                    "misc_testsuite/simd/canonicalize-nan.wast",
-                ];
-
-                if unsupported.iter().any(|part| self.path.ends_with(part)) {
-                    return true;
-                }
-
                 // SIMD on Winch requires AVX instructions.
                 #[cfg(target_arch = "x86_64")]
                 if !(std::is_x86_feature_detected!("avx") && std::is_x86_feature_detected!("avx2"))
diff --git a/tests/disas/winch/x64/f32x4_add/nan_canon.wat b/tests/disas/winch/x64/f32x4_add/nan_canon.wat
new file mode 100644
index 000000000000..89f1878a80a0
--- /dev/null
+++ b/tests/disas/winch/x64/f32x4_add/nan_canon.wat
@@ -0,0 +1,42 @@
+;;! target = "x86_64"
+;;! test = "winch"
+;;! flags = ["-Wnan-canonicalization", "-Ccranelift-has-avx"]
+
+(module
+    (func (param v128 v128) (result v128)
+        local.get 0
+        local.get 1
+        f32x4.add
+    )
+)
+;; wasm[0]::function[0]:
+;;       pushq   %rbp
+;;       movq    %rsp, %rbp
+;;       movq    8(%rdi), %r11
+;;       movq    0x18(%r11), %r11
+;;       addq    $0x30, %r11
+;;       cmpq    %rsp, %r11
+;;       ja      0x6c
+;;   1c: movq    %rdi, %r14
+;;       subq    $0x30, %rsp
+;;       movq    %rdi, 0x28(%rsp)
+;;       movq    %rsi, 0x20(%rsp)
+;;       movdqu  %xmm0, 0x10(%rsp)
+;;       movdqu  %xmm1, (%rsp)
+;;       movdqu  (%rsp), %xmm0
+;;       movdqu  0x10(%rsp), %xmm1
+;;       vaddps  %xmm0, %xmm1, %xmm1
+;;       vcmpunordps %xmm1, %xmm1, %xmm15
+;;       vandnps %xmm1, %xmm15, %xmm1
+;;       vandps  0x15(%rip), %xmm15, %xmm15
+;;       vorps   %xmm1, %xmm15, %xmm1
+;;       movdqa  %xmm1, %xmm0
+;;       addq    $0x30, %rsp
+;;       popq    %rbp
+;;       retq
+;;   6c: ud2
+;;   6e: addb    %al, (%rax)
+;;   70: addb    %al, (%rax)
+;;   72: sarb    $0, (%rdi)
+;;   76: sarb    $0, (%rdi)
+;;   7a: sarb    $0, (%rdi)
diff --git a/winch/codegen/src/isa/aarch64/masm.rs b/winch/codegen/src/isa/aarch64/masm.rs
index 6911a2559490..cf8324489cfc 100644
--- a/winch/codegen/src/isa/aarch64/masm.rs
+++ b/winch/codegen/src/isa/aarch64/masm.rs
@@ -745,6 +745,14 @@ impl Masm for MacroAssembler {
         Ok(())
     }
 
+    fn maybe_canonicalize_v128_nan(
+        &mut self,
+        _reg: WritableReg,
+        _lane_size: OperandSize,
+    ) -> Result<()> {
+        bail!(CodeGenError::unimplemented_masm_instruction())
+    }
+
     fn and(&mut self, dst: WritableReg, lhs: Reg, rhs: RegImm, size: OperandSize) -> Result<()> {
         match (rhs, lhs, dst) {
             (RegImm::Imm(v), rn, rd) => {
diff --git a/winch/codegen/src/isa/x64/masm.rs b/winch/codegen/src/isa/x64/masm.rs
index c8b703544ff8..2294ad8c2109 100644
--- a/winch/codegen/src/isa/x64/masm.rs
+++ b/winch/codegen/src/isa/x64/masm.rs
@@ -709,6 +709,45 @@ impl Masm for MacroAssembler {
         Ok(())
     }
 
+    fn maybe_canonicalize_v128_nan(
+        &mut self,
+        reg: WritableReg,
+        lane_size: OperandSize,
+    ) -> Result<()> {
+        if !self.shared_flags.enable_nan_canonicalization() {
+            return Ok(());
+        }
+
+        self.ensure_has_avx()?;
+
+        self.with_scratch::<FloatScratch, _>(|masm, scratch| {
+            // scratch = NaN mask (all-1s for NaN lanes)
+            masm.asm.xmm_vcmpp_rrr(
+                scratch.writable(),
+                reg.to_reg(),
+                reg.to_reg(),
+                lane_size,
+                VcmpKind::Unord,
+            );
+            // reg = ~mask & original (zero out NaN lanes, keep non-NaN)
+            masm.asm
+                .xmm_vandnp_rrr(scratch.inner(), reg.to_reg(), reg, lane_size);
+            // scratch = mask & splat(canonical NaN): canonical NaN in NaN lanes, zero elsewhere
+            let canon_nan = match lane_size {
+                OperandSize::S32 => &crate::masm::CANONICAL_NAN_F32X4[..],
+                OperandSize::S64 => &crate::masm::CANONICAL_NAN_F64X2[..],
+                _ => bail!(CodeGenError::unexpected_operand_size()),
+            };
+            let addr = masm.asm.add_constant(canon_nan);
+            masm.asm
+                .xmm_vandp_rrm(scratch.inner(), &addr, scratch.writable(), lane_size);
+            // reg = non-NaN values | canonical NaN for NaN lanes
+            masm.asm
+                .xmm_vorp_rrr(scratch.inner(), reg.to_reg(), reg, lane_size);
+            Ok(())
+        })
+    }
+
     fn and(&mut self, dst: WritableReg, lhs: Reg, rhs: RegImm, size: OperandSize) -> Result<()> {
         Self::ensure_two_argument_form(&dst.to_reg(), &lhs)?;
         match (rhs, dst) {
diff --git a/winch/codegen/src/masm.rs b/winch/codegen/src/masm.rs
index bb7305799a2b..df573fab00d0 100644
--- a/winch/codegen/src/masm.rs
+++ b/winch/codegen/src/masm.rs
@@ -16,6 +16,25 @@ use wasmtime_environ::{PtrSize, WasmHeapType, WasmRefType, WasmValType};
 pub(crate) const CANONICAL_NAN_F32: &[u8] = &0x7FC00000u32.to_le_bytes();
 pub(crate) const CANONICAL_NAN_F64: &[u8] = &0x7FF8000000000000u64.to_le_bytes();
 
+const NAN32: [u8; 4] = 0x7FC00000u32.to_le_bytes();
+const NAN64: [u8; 8] = 0x7FF8000000000000u64.to_le_bytes();
+
+pub(crate) const CANONICAL_NAN_F32X4: [u8; 16] = {
+    let n = NAN32;
+    [
+        n[0], n[1], n[2], n[3], n[0], n[1], n[2], n[3],
+        n[0], n[1], n[2], n[3], n[0], n[1], n[2], n[3],
+    ]
+};
+
+pub(crate) const CANONICAL_NAN_F64X2: [u8; 16] = {
+    let n = NAN64;
+    [
+        n[0], n[1], n[2], n[3], n[4], n[5], n[6], n[7],
+        n[0], n[1], n[2], n[3], n[4], n[5], n[6], n[7],
+    ]
+};
+
 pub(crate) use cranelift_codegen::ir::TrapCode;
 
 #[derive(Eq, PartialEq)]
@@ -1698,6 +1717,14 @@ pub(crate) trait MacroAssembler {
     /// Canonicalize NaN values in `reg` if the setting is enabled.
     fn maybe_canonicalize_nan(&mut self, reg: WritableReg, size: OperandSize) -> Result<()>;
 
+    /// Canonicalize NaN lanes in a v128 register if the setting is enabled.
+    /// `lane_size` is S32 for f32x4 or S64 for f64x2.
+    fn maybe_canonicalize_v128_nan(
+        &mut self,
+        reg: WritableReg,
+        lane_size: OperandSize,
+    ) -> Result<()>;
+
     /// Perform logical and operation.
     fn and(&mut self, dst: WritableReg, lhs: Reg, rhs: RegImm, size: OperandSize) -> Result<()>;
 
diff --git a/winch/codegen/src/visitor.rs b/winch/codegen/src/visitor.rs
index de7b7b1b39c7..80b67994849f 100644
--- a/winch/codegen/src/visitor.rs
+++ b/winch/codegen/src/visitor.rs
@@ -3772,6 +3772,7 @@ where
     fn visit_f32x4_demote_f64x2_zero(&mut self) -> Self::Output {
         self.context.unop(self.masm, |masm, reg| {
             masm.v128_demote(reg, writable!(reg))?;
+            masm.maybe_canonicalize_v128_nan(writable!(reg), OperandSize::S32)?;
             Ok(TypedReg::v128(reg))
         })
     }
@@ -3779,6 +3780,7 @@ where
     fn visit_f64x2_promote_low_f32x4(&mut self) -> Self::Output {
         self.context.unop(self.masm, |masm, reg| {
             masm.v128_promote(reg, writable!(reg))?;
+            masm.maybe_canonicalize_v128_nan(writable!(reg), OperandSize::S64)?;
             Ok(TypedReg::v128(reg))
         })
     }
@@ -4429,6 +4431,7 @@ where
         self.context
             .binop(self.masm, OperandSize::S32, |masm, dst, src, _size| {
                 masm.v128_add(dst, src, writable!(dst), V128AddKind::F32x4)?;
+                masm.maybe_canonicalize_v128_nan(writable!(dst), OperandSize::S32)?;
                 Ok(TypedReg::v128(dst))
             })
     }
@@ -4437,6 +4440,7 @@ where
         self.context
             .binop(self.masm, OperandSize::S64, |masm, dst, src, _size| {
                 masm.v128_add(dst, src, writable!(dst), V128AddKind::F64x2)?;
+                masm.maybe_canonicalize_v128_nan(writable!(dst), OperandSize::S64)?;
                 Ok(TypedReg::v128(dst))
             })
     }
@@ -4445,6 +4449,7 @@ where
         self.context
             .binop(self.masm, OperandSize::S32, |masm, dst, src, _size| {
                 masm.v128_sub(dst, src, writable!(dst), V128SubKind::F32x4)?;
+                masm.maybe_canonicalize_v128_nan(writable!(dst), OperandSize::S32)?;
                 Ok(TypedReg::v128(dst))
             })
     }
@@ -4453,22 +4458,32 @@ where
         self.context
             .binop(self.masm, OperandSize::S64, |masm, dst, src, _size| {
                 masm.v128_sub(dst, src, writable!(dst), V128SubKind::F64x2)?;
+                masm.maybe_canonicalize_v128_nan(writable!(dst), OperandSize::S64)?;
                 Ok(TypedReg::v128(dst))
             })
     }
 
     fn visit_f32x4_mul(&mut self) -> Self::Output {
-        self.masm.v128_mul(&mut self.context, V128MulKind::F32x4)
+        self.masm.v128_mul(&mut self.context, V128MulKind::F32x4)?;
+        let result = self.context.pop_to_reg(self.masm, None)?;
+        self.masm.maybe_canonicalize_v128_nan(writable!(result.into()), OperandSize::S32)?;
+        self.context.stack.push(result.into());
+        Ok(())
     }
 
     fn visit_f64x2_mul(&mut self) -> Self::Output {
-        self.masm.v128_mul(&mut self.context, V128MulKind::F64x2)
+        self.masm.v128_mul(&mut self.context, V128MulKind::F64x2)?;
+        let result = self.context.pop_to_reg(self.masm, None)?;
+        self.masm.maybe_canonicalize_v128_nan(writable!(result.into()), OperandSize::S64)?;
+        self.context.stack.push(result.into());
+        Ok(())
     }
 
     fn visit_f32x4_div(&mut self) -> Self::Output {
         self.context
             .binop(self.masm, OperandSize::S32, |masm, dst, src, size| {
                 masm.v128_div(dst, src, writable!(dst), size)?;
+                masm.maybe_canonicalize_v128_nan(writable!(dst), OperandSize::S32)?;
                 Ok(TypedReg::v128(dst))
             })
     }
@@ -4477,6 +4492,7 @@ where
         self.context
             .binop(self.masm, OperandSize::S64, |masm, dst, src, size| {
                 masm.v128_div(dst, src, writable!(dst), size)?;
+                masm.maybe_canonicalize_v128_nan(writable!(dst), OperandSize::S64)?;
                 Ok(TypedReg::v128(dst))
             })
     }
@@ -4491,6 +4507,7 @@ where
     fn visit_f32x4_ceil(&mut self) -> Self::Output {
         self.context.unop(self.masm, |masm, reg| {
             masm.v128_ceil(reg, writable!(reg), OperandSize::S32)?;
+            masm.maybe_canonicalize_v128_nan(writable!(reg), OperandSize::S32)?;
             Ok(TypedReg::v128(reg))
         })
     }
@@ -4505,6 +4522,7 @@ where
     fn visit_f64x2_ceil(&mut self) -> Self::Output {
         self.context.unop(self.masm, |masm, reg| {
             masm.v128_ceil(reg, writable!(reg), OperandSize::S64)?;
+            masm.maybe_canonicalize_v128_nan(writable!(reg), OperandSize::S64)?;
             Ok(TypedReg::v128(reg))
         })
     }
@@ -4512,6 +4530,7 @@ where
     fn visit_f32x4_sqrt(&mut self) -> Self::Output {
         self.context.unop(self.masm, |masm, reg| {
             masm.v128_sqrt(reg, writable!(reg), OperandSize::S32)?;
+            masm.maybe_canonicalize_v128_nan(writable!(reg), OperandSize::S32)?;
             Ok(TypedReg::v128(reg))
         })
     }
@@ -4519,6 +4538,7 @@ where
     fn visit_f32x4_floor(&mut self) -> Self::Output {
         self.context.unop(self.masm, |masm, reg| {
             masm.v128_floor(reg, writable!(reg), OperandSize::S32)?;
+            masm.maybe_canonicalize_v128_nan(writable!(reg), OperandSize::S32)?;
             Ok(TypedReg::v128(reg))
         })
     }
@@ -4526,6 +4546,7 @@ where
     fn visit_f64x2_sqrt(&mut self) -> Self::Output {
         self.context.unop(self.masm, |masm, reg| {
             masm.v128_sqrt(reg, writable!(reg), OperandSize::S64)?;
+            masm.maybe_canonicalize_v128_nan(writable!(reg), OperandSize::S64)?;
             Ok(TypedReg::v128(reg))
         })
     }
@@ -4533,6 +4554,7 @@ where
     fn visit_f64x2_floor(&mut self) -> Self::Output {
         self.context.unop(self.masm, |masm, reg| {
             masm.v128_floor(reg, writable!(reg), OperandSize::S64)?;
+            masm.maybe_canonicalize_v128_nan(writable!(reg), OperandSize::S64)?;
             Ok(TypedReg::v128(reg))
         })
     }
@@ -4540,6 +4562,7 @@ where
     fn visit_f32x4_nearest(&mut self) -> Self::Output {
         self.context.unop(self.masm, |masm, reg| {
             masm.v128_nearest(reg, writable!(reg), OperandSize::S32)?;
+            masm.maybe_canonicalize_v128_nan(writable!(reg), OperandSize::S32)?;
             Ok(TypedReg::v128(reg))
         })
     }
@@ -4547,18 +4570,27 @@ where
     fn visit_f64x2_nearest(&mut self) -> Self::Output {
         self.context.unop(self.masm, |masm, reg| {
             masm.v128_nearest(reg, writable!(reg), OperandSize::S64)?;
+            masm.maybe_canonicalize_v128_nan(writable!(reg), OperandSize::S64)?;
             Ok(TypedReg::v128(reg))
         })
     }
 
     fn visit_f32x4_trunc(&mut self) -> Self::Output {
         self.masm
-            .v128_trunc(&mut self.context, V128TruncKind::F32x4)
+            .v128_trunc(&mut self.context, V128TruncKind::F32x4)?;
+        let result = self.context.pop_to_reg(self.masm, None)?;
+        self.masm.maybe_canonicalize_v128_nan(writable!(result.into()), OperandSize::S32)?;
+        self.context.stack.push(result.into());
+        Ok(())
     }
 
     fn visit_f64x2_trunc(&mut self) -> Self::Output {
         self.masm
-            .v128_trunc(&mut self.context, V128TruncKind::F64x2)
+            .v128_trunc(&mut self.context, V128TruncKind::F64x2)?;
+        let result = self.context.pop_to_reg(self.masm, None)?;
+        self.masm.maybe_canonicalize_v128_nan(writable!(result.into()), OperandSize::S64)?;
+        self.context.stack.push(result.into());
+        Ok(())
     }
 
     fn visit_v128_load32_zero(&mut self, memarg: MemArg) -> Self::Output {
@@ -4613,6 +4645,7 @@ where
         self.context
             .binop(self.masm, OperandSize::S32, |masm, dst, src, _size| {
                 masm.v128_min(dst, src, writable!(dst), V128MinKind::F32x4)?;
+                masm.maybe_canonicalize_v128_nan(writable!(dst), OperandSize::S32)?;
                 Ok(TypedReg::v128(dst))
             })
     }
@@ -4621,6 +4654,7 @@ where
         self.context
             .binop(self.masm, OperandSize::S64, |masm, dst, src, _size| {
                 masm.v128_min(dst, src, writable!(dst), V128MinKind::F64x2)?;
+                masm.maybe_canonicalize_v128_nan(writable!(dst), OperandSize::S64)?;
                 Ok(TypedReg::v128(dst))
             })
     }
@@ -4629,6 +4663,7 @@ where
         self.context
             .binop(self.masm, OperandSize::S32, |masm, dst, src, _size| {
                 masm.v128_max(dst, src, writable!(dst), V128MaxKind::F32x4)?;
+                masm.maybe_canonicalize_v128_nan(writable!(dst), OperandSize::S32)?;
                 Ok(TypedReg::v128(dst))
             })
     }
@@ -4637,6 +4672,7 @@ where
         self.context
             .binop(self.masm, OperandSize::S64, |masm, dst, src, _size| {
                 masm.v128_max(dst, src, writable!(dst), V128MaxKind::F64x2)?;
+                masm.maybe_canonicalize_v128_nan(writable!(dst), OperandSize::S64)?;
                 Ok(TypedReg::v128(dst))
             })
     }

From 3649937f3921eb5fe14c442dcd80696aaa47e636 Mon Sep 17 00:00:00 2001
From: r-near <163825889+r-near@users.noreply.github.com>
Date: Mon, 6 Apr 2026 11:15:36 -0700
Subject: [PATCH 7/7] cargo fmt

---
 winch/codegen/src/masm.rs    |  8 ++++----
 winch/codegen/src/visitor.rs | 36 ++++++++++++++++++++++++------------
 2 files changed, 28 insertions(+), 16 deletions(-)

diff --git a/winch/codegen/src/masm.rs b/winch/codegen/src/masm.rs
index df573fab00d0..1dc68778e706 100644
--- a/winch/codegen/src/masm.rs
+++ b/winch/codegen/src/masm.rs
@@ -22,16 +22,16 @@ const NAN64: [u8; 8] = 0x7FF8000000000000u64.to_le_bytes();
 pub(crate) const CANONICAL_NAN_F32X4: [u8; 16] = {
     let n = NAN32;
     [
-        n[0], n[1], n[2], n[3], n[0], n[1], n[2], n[3],
-        n[0], n[1], n[2], n[3], n[0], n[1], n[2], n[3],
+        n[0], n[1], n[2], n[3], n[0], n[1], n[2], n[3], n[0], n[1], n[2], n[3], n[0], n[1], n[2],
+        n[3],
     ]
 };
 
 pub(crate) const CANONICAL_NAN_F64X2: [u8; 16] = {
     let n = NAN64;
     [
-        n[0], n[1], n[2], n[3], n[4], n[5], n[6], n[7],
-        n[0], n[1], n[2], n[3], n[4], n[5], n[6], n[7],
+        n[0], n[1], n[2], n[3], n[4], n[5], n[6], n[7], n[0], n[1], n[2], n[3], n[4], n[5], n[6],
+        n[7],
     ]
 };
 
diff --git a/winch/codegen/src/visitor.rs b/winch/codegen/src/visitor.rs
index 80b67994849f..934628bb9ed0 100644
--- a/winch/codegen/src/visitor.rs
+++ b/winch/codegen/src/visitor.rs
@@ -794,7 +794,8 @@ where
             },
         )?;
         let result = self.context.pop_to_reg(self.masm, None)?;
-        self.masm.maybe_canonicalize_nan(writable!(result.into()), OperandSize::S32)?;
+        self.masm
+            .maybe_canonicalize_nan(writable!(result.into()), OperandSize::S32)?;
         self.context.stack.push(result.into());
         Ok(())
     }
@@ -811,7 +812,8 @@ where
             },
         )?;
         let result = self.context.pop_to_reg(self.masm, None)?;
-        self.masm.maybe_canonicalize_nan(writable!(result.into()), OperandSize::S64)?;
+        self.masm
+            .maybe_canonicalize_nan(writable!(result.into()), OperandSize::S64)?;
         self.context.stack.push(result.into());
         Ok(())
     }
@@ -828,7 +830,8 @@ where
             },
         )?;
         let result = self.context.pop_to_reg(self.masm, None)?;
-        self.masm.maybe_canonicalize_nan(writable!(result.into()), OperandSize::S32)?;
+        self.masm
+            .maybe_canonicalize_nan(writable!(result.into()), OperandSize::S32)?;
         self.context.stack.push(result.into());
         Ok(())
     }
@@ -845,7 +848,8 @@ where
             },
         )?;
         let result = self.context.pop_to_reg(self.masm, None)?;
-        self.masm.maybe_canonicalize_nan(writable!(result.into()), OperandSize::S64)?;
+        self.masm
+            .maybe_canonicalize_nan(writable!(result.into()), OperandSize::S64)?;
         self.context.stack.push(result.into());
         Ok(())
     }
@@ -862,7 +866,8 @@ where
             },
         )?;
         let result = self.context.pop_to_reg(self.masm, None)?;
-        self.masm.maybe_canonicalize_nan(writable!(result.into()), OperandSize::S32)?;
+        self.masm
+            .maybe_canonicalize_nan(writable!(result.into()), OperandSize::S32)?;
         self.context.stack.push(result.into());
         Ok(())
     }
@@ -879,7 +884,8 @@ where
             },
         )?;
         let result = self.context.pop_to_reg(self.masm, None)?;
-        self.masm.maybe_canonicalize_nan(writable!(result.into()), OperandSize::S64)?;
+        self.masm
+            .maybe_canonicalize_nan(writable!(result.into()), OperandSize::S64)?;
         self.context.stack.push(result.into());
         Ok(())
     }
@@ -896,7 +902,8 @@ where
             },
         )?;
         let result = self.context.pop_to_reg(self.masm, None)?;
-        self.masm.maybe_canonicalize_nan(writable!(result.into()), OperandSize::S32)?;
+        self.masm
+            .maybe_canonicalize_nan(writable!(result.into()), OperandSize::S32)?;
         self.context.stack.push(result.into());
         Ok(())
     }
@@ -913,7 +920,8 @@ where
             },
         )?;
         let result = self.context.pop_to_reg(self.masm, None)?;
-        self.masm.maybe_canonicalize_nan(writable!(result.into()), OperandSize::S64)?;
+        self.masm
+            .maybe_canonicalize_nan(writable!(result.into()), OperandSize::S64)?;
         self.context.stack.push(result.into());
         Ok(())
     }
@@ -4466,7 +4474,8 @@ where
     fn visit_f32x4_mul(&mut self) -> Self::Output {
         self.masm.v128_mul(&mut self.context, V128MulKind::F32x4)?;
         let result = self.context.pop_to_reg(self.masm, None)?;
-        self.masm.maybe_canonicalize_v128_nan(writable!(result.into()), OperandSize::S32)?;
+        self.masm
+            .maybe_canonicalize_v128_nan(writable!(result.into()), OperandSize::S32)?;
         self.context.stack.push(result.into());
         Ok(())
     }
@@ -4474,7 +4483,8 @@ where
     fn visit_f64x2_mul(&mut self) -> Self::Output {
         self.masm.v128_mul(&mut self.context, V128MulKind::F64x2)?;
         let result = self.context.pop_to_reg(self.masm, None)?;
-        self.masm.maybe_canonicalize_v128_nan(writable!(result.into()), OperandSize::S64)?;
+        self.masm
+            .maybe_canonicalize_v128_nan(writable!(result.into()), OperandSize::S64)?;
         self.context.stack.push(result.into());
         Ok(())
     }
@@ -4579,7 +4589,8 @@ where
         self.masm
             .v128_trunc(&mut self.context, V128TruncKind::F32x4)?;
         let result = self.context.pop_to_reg(self.masm, None)?;
-        self.masm.maybe_canonicalize_v128_nan(writable!(result.into()), OperandSize::S32)?;
+        self.masm
+            .maybe_canonicalize_v128_nan(writable!(result.into()), OperandSize::S32)?;
         self.context.stack.push(result.into());
         Ok(())
     }
@@ -4588,7 +4599,8 @@ where
         self.masm
             .v128_trunc(&mut self.context, V128TruncKind::F64x2)?;
         let result = self.context.pop_to_reg(self.masm, None)?;
-        self.masm.maybe_canonicalize_v128_nan(writable!(result.into()), OperandSize::S64)?;
+        self.masm
+            .maybe_canonicalize_v128_nan(writable!(result.into()), OperandSize::S64)?;
         self.context.stack.push(result.into());
         Ok(())
     }