diff --git a/crates/test-util/src/wast.rs b/crates/test-util/src/wast.rs
index 7047c00e0a8f..259eda8155a6 100644
--- a/crates/test-util/src/wast.rs
+++ b/crates/test-util/src/wast.rs
@@ -622,16 +622,6 @@ impl WastTest {
 
             #[cfg(target_arch = "x86_64")]
             {
-                let unsupported = [
-                    // externref/reference-types related
-                    // simd-related failures
-                    "misc_testsuite/simd/canonicalize-nan.wast",
-                ];
-
-                if unsupported.iter().any(|part| self.path.ends_with(part)) {
-                    return true;
-                }
-
                 // SIMD on Winch requires AVX instructions.
                 #[cfg(target_arch = "x86_64")]
                 if !(std::is_x86_feature_detected!("avx") && std::is_x86_feature_detected!("avx2"))
diff --git a/tests/disas/winch/x64/f32_add/nan_canon.wat b/tests/disas/winch/x64/f32_add/nan_canon.wat
new file mode 100644
index 000000000000..9aa0ed26395f
--- /dev/null
+++ b/tests/disas/winch/x64/f32_add/nan_canon.wat
@@ -0,0 +1,40 @@
+;;! target = "x86_64"
+;;! test = "winch"
+;;! flags = "-Wnan-canonicalization"
+
+(module
+    (func (param f32 f32) (result f32)
+        local.get 0
+        local.get 1
+        f32.add
+    )
+)
+;; wasm[0]::function[0]:
+;;       pushq   %rbp
+;;       movq    %rsp, %rbp
+;;       movq    8(%rdi), %r11
+;;       movq    0x18(%r11), %r11
+;;       addq    $0x20, %r11
+;;       cmpq    %rsp, %r11
+;;       ja      0x69
+;;   1c: movq    %rdi, %r14
+;;       subq    $0x20, %rsp
+;;       movq    %rdi, 0x18(%rsp)
+;;       movq    %rsi, 0x10(%rsp)
+;;       movss   %xmm0, 0xc(%rsp)
+;;       movss   %xmm1, 8(%rsp)
+;;       movss   8(%rsp), %xmm0
+;;       movss   0xc(%rsp), %xmm1
+;;       addss   %xmm0, %xmm1
+;;       ucomiss %xmm1, %xmm1
+;;       jnp     0x5d
+;;   55: movss   0x13(%rip), %xmm1
+;;       movaps  %xmm1, %xmm0
+;;       addq    $0x20, %rsp
+;;       popq    %rbp
+;;       retq
+;;   69: ud2
+;;   6b: addb    %al, (%rax)
+;;   6d: addb    %al, (%rax)
+;;   6f: addb    %al, (%rax)
+;;   71: addb    %al, %al
diff --git a/tests/disas/winch/x64/f32x4_add/nan_canon.wat b/tests/disas/winch/x64/f32x4_add/nan_canon.wat
new file mode 100644
index 000000000000..89f1878a80a0
--- /dev/null
+++ b/tests/disas/winch/x64/f32x4_add/nan_canon.wat
@@ -0,0 +1,42 @@
+;;! target = "x86_64"
+;;! test = "winch"
+;;! flags = ["-Wnan-canonicalization", "-Ccranelift-has-avx"]
+
+(module
+    (func (param v128 v128) (result v128)
+        local.get 0
+        local.get 1
+        f32x4.add
+    )
+)
+;; wasm[0]::function[0]:
+;;       pushq   %rbp
+;;       movq    %rsp, %rbp
+;;       movq    8(%rdi), %r11
+;;       movq    0x18(%r11), %r11
+;;       addq    $0x30, %r11
+;;       cmpq    %rsp, %r11
+;;       ja      0x6c
+;;   1c: movq    %rdi, %r14
+;;       subq    $0x30, %rsp
+;;       movq    %rdi, 0x28(%rsp)
+;;       movq    %rsi, 0x20(%rsp)
+;;       movdqu  %xmm0, 0x10(%rsp)
+;;       movdqu  %xmm1, (%rsp)
+;;       movdqu  (%rsp), %xmm0
+;;       movdqu  0x10(%rsp), %xmm1
+;;       vaddps  %xmm0, %xmm1, %xmm1
+;;       vcmpunordps %xmm1, %xmm1, %xmm15
+;;       vandnps %xmm1, %xmm15, %xmm1
+;;       vandps  0x15(%rip), %xmm15, %xmm15
+;;       vorps   %xmm1, %xmm15, %xmm1
+;;       movdqa  %xmm1, %xmm0
+;;       addq    $0x30, %rsp
+;;       popq    %rbp
+;;       retq
+;;   6c: ud2
+;;   6e: addb    %al, (%rax)
+;;   70: addb    %al, (%rax)
+;;   72: sarb    $0, (%rdi)
+;;   76: sarb    $0, (%rdi)
+;;   7a: sarb    $0, (%rdi)
diff --git a/tests/disas/winch/x64/f64_div/nan_canon.wat b/tests/disas/winch/x64/f64_div/nan_canon.wat
new file mode 100644
index 000000000000..cf0285da39fd
--- /dev/null
+++ b/tests/disas/winch/x64/f64_div/nan_canon.wat
@@ -0,0 +1,43 @@
+;;! target = "x86_64"
+;;! test = "winch"
+;;! flags = "-Wnan-canonicalization"
+
+(module
+    (func (param f64 f64) (result f64)
+        local.get 0
+        local.get 1
+        f64.div
+    )
+)
+;; wasm[0]::function[0]:
+;;       pushq   %rbp
+;;       movq    %rsp, %rbp
+;;       movq    8(%rdi), %r11
+;;       movq    0x18(%r11), %r11
+;;       addq    $0x20, %r11
+;;       cmpq    %rsp, %r11
+;;       ja      0x68
+;;   1c: movq    %rdi, %r14
+;;       subq    $0x20, %rsp
+;;       movq    %rdi, 0x18(%rsp)
+;;       movq    %rsi, 0x10(%rsp)
+;;       movsd   %xmm0, 8(%rsp)
+;;       movsd   %xmm1, (%rsp)
+;;       movsd   (%rsp), %xmm0
+;;       movsd   8(%rsp), %xmm1
+;;       divsd   %xmm0, %xmm1
+;;       ucomisd %xmm1, %xmm1
+;;       jnp     0x5c
+;;   54: movsd   0x14(%rip), %xmm1
+;;       movaps  %xmm1, %xmm0
+;;       addq    $0x20, %rsp
+;;       popq    %rbp
+;;       retq
+;;   68: ud2
+;;   6a: addb    %al, (%rax)
+;;   6c: addb    %al, (%rax)
+;;   6e: addb    %al, (%rax)
+;;   70: addb    %al, (%rax)
+;;   72: addb    %al, (%rax)
+;;   74: addb    %al, (%rax)
+;;   76: clc
diff --git a/tests/misc_testsuite/canonicalize-nan-scalar.wast b/tests/misc_testsuite/canonicalize-nan-scalar.wast
new file mode 100644
index 000000000000..8b019cdec803
--- /dev/null
+++ b/tests/misc_testsuite/canonicalize-nan-scalar.wast
@@ -0,0 +1,153 @@
+;;! nan_canonicalization = true
+
+;; Scalar counterpart to simd/canonicalize-nan.wast.
+
+(module
+  (func (export "f32.add") (param f32 f32) (result f32)
+    local.get 0
+    local.get 1
+    f32.add)
+  (func (export "f32.sub") (param f32 f32) (result f32)
+    local.get 0
+    local.get 1
+    f32.sub)
+  (func (export "f32.mul") (param f32 f32) (result f32)
+    local.get 0
+    local.get 1
+    f32.mul)
+  (func (export "f32.div") (param f32 f32) (result f32)
+    local.get 0
+    local.get 1
+    f32.div)
+  (func (export "f32.min") (param f32 f32) (result f32)
+    local.get 0
+    local.get 1
+    f32.min)
+  (func (export "f32.max") (param f32 f32) (result f32)
+    local.get 0
+    local.get 1
+    f32.max)
+  (func (export "f32.sqrt") (param f32) (result f32)
+    local.get 0
+    f32.sqrt)
+  (func (export "f32.ceil") (param f32) (result f32)
+    local.get 0
+    f32.ceil)
+  (func (export "f32.floor") (param f32) (result f32)
+    local.get 0
+    f32.floor)
+  (func (export "f32.trunc") (param f32) (result f32)
+    local.get 0
+    f32.trunc)
+  (func (export "f32.nearest") (param f32) (result f32)
+    local.get 0
+    f32.nearest)
+
+  (func (export "f64.add") (param f64 f64) (result f64)
+    local.get 0
+    local.get 1
+    f64.add)
+  (func (export "f64.sub") (param f64 f64) (result f64)
+    local.get 0
+    local.get 1
+    f64.sub)
+  (func (export "f64.mul") (param f64 f64) (result f64)
+    local.get 0
+    local.get 1
+    f64.mul)
+  (func (export "f64.div") (param f64 f64) (result f64)
+    local.get 0
+    local.get 1
+    f64.div)
+  (func (export "f64.min") (param f64 f64) (result f64)
+    local.get 0
+    local.get 1
+    f64.min)
+  (func (export "f64.max") (param f64 f64) (result f64)
+    local.get 0
+    local.get 1
+    f64.max)
+  (func (export "f64.sqrt") (param f64) (result f64)
+    local.get 0
+    f64.sqrt)
+  (func (export "f64.ceil") (param f64) (result f64)
+    local.get 0
+    f64.ceil)
+  (func (export "f64.floor") (param f64) (result f64)
+    local.get 0
+    f64.floor)
+  (func (export "f64.trunc") (param f64) (result f64)
+    local.get 0
+    f64.trunc)
+  (func (export "f64.nearest") (param f64) (result f64)
+    local.get 0
+    f64.nearest)
+
+  (func (export "reinterpret-and-demote") (param i64) (result i32)
+    local.get 0
+    f64.reinterpret_i64
+    f32.demote_f64
+    i32.reinterpret_f32)
+  (func (export "reinterpret-and-promote") (param i32) (result i64)
+    local.get 0
+    f32.reinterpret_i32
+    f64.promote_f32
+    i64.reinterpret_f64)
+
+  ;; Expose raw bits of 0/0 to verify exact canonical NaN bit patterns.
+  (func (export "f32.div-nan-bits") (result i32)
+    f32.const 0
+    f32.const 0
+    f32.div
+    i32.reinterpret_f32)
+  (func (export "f64.div-nan-bits") (result i64)
+    f64.const 0
+    f64.const 0
+    f64.div
+    i64.reinterpret_f64)
+)
+
+;; Exact bit patterns: canonical f32 NaN = 0x7fc00000, f64 = 0x7ff8000000000000
+(assert_return (invoke "f32.div-nan-bits") (i32.const 0x7fc00000))
+(assert_return (invoke "f64.div-nan-bits") (i64.const 0x7ff8000000000000))
+
+;; NaN-producing operations
+(assert_return (invoke "f32.div" (f32.const 0) (f32.const 0)) (f32.const nan:0x400000))
+(assert_return (invoke "f64.div" (f64.const 0) (f64.const 0)) (f64.const nan:0x8000000000000))
+(assert_return (invoke "f32.sqrt" (f32.const -1)) (f32.const nan:0x400000))
+(assert_return (invoke "f64.sqrt" (f64.const -1)) (f64.const nan:0x8000000000000))
+
+;; Non-canonical (negative, signaling-payload) NaN through f32 arithmetic
+(assert_return (invoke "f32.add" (f32.const -nan:0x200000) (f32.const 1)) (f32.const nan:0x400000))
+(assert_return (invoke "f32.sub" (f32.const -nan:0x200000) (f32.const 1)) (f32.const nan:0x400000))
+(assert_return (invoke "f32.mul" (f32.const -nan:0x200000) (f32.const 1)) (f32.const nan:0x400000))
+(assert_return (invoke "f32.min" (f32.const -nan:0x200000) (f32.const 1)) (f32.const nan:0x400000))
+(assert_return (invoke "f32.max" (f32.const -nan:0x200000) (f32.const 1)) (f32.const nan:0x400000))
+
+;; Non-canonical (negative, signaling-payload) NaN through f64 arithmetic
+(assert_return (invoke "f64.add" (f64.const -nan:0x4000000000000) (f64.const 1)) (f64.const nan:0x8000000000000))
+(assert_return (invoke "f64.sub" (f64.const -nan:0x4000000000000) (f64.const 1)) (f64.const nan:0x8000000000000))
+(assert_return (invoke "f64.mul" (f64.const -nan:0x4000000000000) (f64.const 1)) (f64.const nan:0x8000000000000))
+(assert_return (invoke "f64.min" (f64.const -nan:0x4000000000000) (f64.const 1)) (f64.const nan:0x8000000000000))
+(assert_return (invoke "f64.max" (f64.const -nan:0x4000000000000) (f64.const 1)) (f64.const nan:0x8000000000000))
+
+;; Rounding a non-canonical NaN (f32)
+(assert_return (invoke "f32.ceil" (f32.const -nan:0x200000)) (f32.const nan:0x400000))
+(assert_return (invoke "f32.floor" (f32.const -nan:0x200000)) (f32.const nan:0x400000))
+(assert_return (invoke "f32.trunc" (f32.const -nan:0x200000)) (f32.const nan:0x400000))
+(assert_return (invoke "f32.nearest" (f32.const -nan:0x200000)) (f32.const nan:0x400000))
+
+;; Rounding a non-canonical NaN (f64)
+(assert_return (invoke "f64.ceil" (f64.const -nan:0x4000000000000)) (f64.const nan:0x8000000000000))
+(assert_return (invoke "f64.floor" (f64.const -nan:0x4000000000000)) (f64.const nan:0x8000000000000))
+(assert_return (invoke "f64.trunc" (f64.const -nan:0x4000000000000)) (f64.const nan:0x8000000000000))
+(assert_return (invoke "f64.nearest" (f64.const -nan:0x4000000000000)) (f64.const nan:0x8000000000000))
+
+;; Demote/promote with non-canonical NaN bit patterns
+(assert_return (invoke "reinterpret-and-demote" (i64.const 0xfffefdfccccdcecf)) (i32.const 0x7fc00000))
+(assert_return (invoke "reinterpret-and-promote" (i32.const 0xfffefdfc)) (i64.const 0x7ff8000000000000))
+
+;; Normal values pass through unchanged
+(assert_return (invoke "f32.add" (f32.const 1) (f32.const 2)) (f32.const 3))
+(assert_return (invoke "f64.div" (f64.const 10) (f64.const 2)) (f64.const 5))
+(assert_return (invoke "f32.sqrt" (f32.const 4)) (f32.const 2))
diff --git a/winch/codegen/src/isa/aarch64/masm.rs b/winch/codegen/src/isa/aarch64/masm.rs
index 26948ed15fe7..cf8324489cfc 100644
--- a/winch/codegen/src/isa/aarch64/masm.rs
+++ b/winch/codegen/src/isa/aarch64/masm.rs
@@ -59,6 +59,8 @@ pub(crate) struct MacroAssembler {
     ptr_size: OperandSize,
     /// Scratch register scope.
     scratch_scope: RegAlloc,
+    /// Shared flags.
+    shared_flags: settings::Flags,
 }
 
 impl MacroAssembler {
@@ -71,10 +73,11 @@ impl MacroAssembler {
         Ok(Self {
             sp_max: 0,
             stack_max_use_add: None,
-            asm: Assembler::new(shared_flags, isa_flags),
+            asm: Assembler::new(shared_flags.clone(), isa_flags),
             sp_offset: 0u32,
             ptr_size: ptr_type_from_ptr_size(ptr_size.size()).try_into()?,
             scratch_scope: RegAlloc::from(scratch_gpr_bitset(), scratch_fpr_bitset()),
+            shared_flags,
         })
     }
 
@@ -713,6 +716,43 @@ impl Masm for MacroAssembler {
         Ok(())
     }
 
+    fn maybe_canonicalize_nan(&mut self, reg: WritableReg, size: OperandSize) -> Result<()> {
+        if !self.shared_flags.enable_nan_canonicalization() {
+            return Ok(());
+        }
+
+        // Reject unsupported sizes before emitting any instruction.
+        let canonical_nan = match size {
+            OperandSize::S32 => crate::masm::CANONICAL_NAN_F32,
+            OperandSize::S64 => crate::masm::CANONICAL_NAN_F64,
+            _ => bail!(CodeGenError::unexpected_operand_size()),
+        };
+
+        // Branch past the reload unless the self-compare was unordered (`reg` is NaN).
+        let done_label = self.asm.buffer_mut().get_label();
+        self.asm.fcmp(reg.to_reg(), reg.to_reg(), size);
+        self.asm.jmp_if(Cond::Vc, done_label);
+        let constant = self.asm.add_constant(canonical_nan);
+        self.asm.uload(
+            inst::AMode::Const { addr: constant },
+            reg,
+            size,
+            TRUSTED_FLAGS,
+        );
+        self.asm
+            .buffer_mut()
+            .bind_label(done_label, &mut Default::default());
+        Ok(())
+    }
+
+    fn maybe_canonicalize_v128_nan(&mut self, _: WritableReg, _: OperandSize) -> Result<()> {
+        // Only unimplemented when the setting actually requests canonicalization.
+        if !self.shared_flags.enable_nan_canonicalization() {
+            return Ok(());
+        }
+        bail!(CodeGenError::unimplemented_masm_instruction())
+    }
+
     fn and(&mut self, dst: WritableReg, lhs: Reg, rhs: RegImm, size: OperandSize) -> Result<()> {
         match (rhs, lhs, dst) {
             (RegImm::Imm(v), rn, rd) => {
diff --git a/winch/codegen/src/isa/x64/masm.rs b/winch/codegen/src/isa/x64/masm.rs
index 7902d3339b31..2294ad8c2109 100644
--- a/winch/codegen/src/isa/x64/masm.rs
+++ b/winch/codegen/src/isa/x64/masm.rs
@@ -686,6 +686,68 @@ impl Masm for MacroAssembler {
         Ok(())
     }
 
+    fn maybe_canonicalize_nan(&mut self, reg: WritableReg, size: OperandSize) -> Result<()> {
+        if !self.shared_flags.enable_nan_canonicalization() {
+            return Ok(());
+        }
+
+        // Reject unsupported sizes before emitting any instruction.
+        let canonical_nan = match size {
+            OperandSize::S32 => crate::masm::CANONICAL_NAN_F32,
+            OperandSize::S64 => crate::masm::CANONICAL_NAN_F64,
+            _ => bail!(CodeGenError::unexpected_operand_size()),
+        };
+
+        // Branch past the reload unless the self-compare was unordered (`reg` is NaN).
+        let done_label = self.asm.buffer_mut().get_label();
+        self.asm.ucomis(reg.to_reg(), reg.to_reg(), size);
+        self.asm.jmp_if(CC::NP, done_label);
+        self.asm.load_fp_const(reg, canonical_nan, size);
+        self.asm
+            .buffer_mut()
+            .bind_label(done_label, &mut Default::default());
+        Ok(())
+    }
+
+    fn maybe_canonicalize_v128_nan(
+        &mut self,
+        reg: WritableReg,
+        lane_size: OperandSize,
+    ) -> Result<()> {
+        if !self.shared_flags.enable_nan_canonicalization() {
+            return Ok(());
+        }
+
+        self.ensure_has_avx()?;
+        // Pick the splatted constant up front so a bad lane size fails before any emission.
+        let canon_nan = match lane_size {
+            OperandSize::S32 => &crate::masm::CANONICAL_NAN_F32X4[..],
+            OperandSize::S64 => &crate::masm::CANONICAL_NAN_F64X2[..],
+            _ => bail!(CodeGenError::unexpected_operand_size()),
+        };
+        self.with_scratch::<FloatScratch, _>(|masm, scratch| {
+            // scratch = NaN mask (all-1s for NaN lanes)
+            masm.asm.xmm_vcmpp_rrr(
+                scratch.writable(),
+                reg.to_reg(),
+                reg.to_reg(),
+                lane_size,
+                VcmpKind::Unord,
+            );
+            // reg = ~mask & original (zero out NaN lanes, keep non-NaN)
+            masm.asm
+                .xmm_vandnp_rrr(scratch.inner(), reg.to_reg(), reg, lane_size);
+            // scratch = mask & splatted canonical NaN = canonical NaN in NaN lanes only
+            let addr = masm.asm.add_constant(canon_nan);
+            masm.asm
+                .xmm_vandp_rrm(scratch.inner(), &addr, scratch.writable(), lane_size);
+            // reg = non-NaN values | canonical NaN for NaN lanes
+            masm.asm
+                .xmm_vorp_rrr(scratch.inner(), reg.to_reg(), reg, lane_size);
+            Ok(())
+        })
+    }
+
     fn and(&mut self, dst: WritableReg, lhs: Reg, rhs: RegImm, size: OperandSize) -> Result<()> {
         Self::ensure_two_argument_form(&dst.to_reg(), &lhs)?;
         match (rhs, dst) {
diff --git a/winch/codegen/src/masm.rs b/winch/codegen/src/masm.rs
index 0955bf46197e..1dc68778e706 100644
--- a/winch/codegen/src/masm.rs
+++ b/winch/codegen/src/masm.rs
@@ -13,6 +13,28 @@ use cranelift_codegen::{
 use std::{fmt::Debug, ops::Range};
 use wasmtime_environ::{PtrSize, WasmHeapType, WasmRefType, WasmValType};
 
+// Little-endian bytes of the canonical f32/f64 NaN bit patterns; all constants below derive from these.
+const NAN32: [u8; 4] = 0x7FC00000u32.to_le_bytes();
+const NAN64: [u8; 8] = 0x7FF8000000000000u64.to_le_bytes();
+pub(crate) const CANONICAL_NAN_F32: &[u8] = &NAN32;
+pub(crate) const CANONICAL_NAN_F64: &[u8] = &NAN64;
+
+/// Canonical `f32` NaN repeated in all four lanes of a 128-bit vector.
+pub(crate) const CANONICAL_NAN_F32X4: [u8; 16] = splat_lanes(NAN32);
+/// Canonical `f64` NaN repeated in both lanes of a 128-bit vector.
+pub(crate) const CANONICAL_NAN_F64X2: [u8; 16] = splat_lanes(NAN64);
+
+/// Fills a 16-byte vector by repeating the `N`-byte `lane` pattern.
+const fn splat_lanes<const N: usize>(lane: [u8; N]) -> [u8; 16] {
+    let mut bytes = [0u8; 16];
+    let mut i = 0;
+    while i < bytes.len() {
+        bytes[i] = lane[i % N];
+        i += 1;
+    }
+    bytes
+}
+
 pub(crate) use cranelift_codegen::ir::TrapCode;
 
 #[derive(Eq, PartialEq)]
@@ -1692,6 +1714,17 @@ pub(crate) trait MacroAssembler {
     /// Perform a floating point square root operation.
     fn float_sqrt(&mut self, dst: WritableReg, src: Reg, size: OperandSize) -> Result<()>;
 
+    /// Canonicalize a NaN in `reg` if the setting is enabled; `size` is S32 (f32) or S64 (f64).
+    fn maybe_canonicalize_nan(&mut self, reg: WritableReg, size: OperandSize) -> Result<()>;
+
+    /// Canonicalize NaN lanes in a v128 register if the setting is enabled.
+    /// `lane_size` is S32 for f32x4 or S64 for f64x2.
+    fn maybe_canonicalize_v128_nan(
+        &mut self,
+        reg: WritableReg,
+        lane_size: OperandSize,
+    ) -> Result<()>;
+
     /// Perform logical and operation.
     fn and(&mut self, dst: WritableReg, lhs: Reg, rhs: RegImm, size: OperandSize) -> Result<()>;
 
diff --git a/winch/codegen/src/visitor.rs b/winch/codegen/src/visitor.rs
index 4d643266ed48..934628bb9ed0 100644
--- a/winch/codegen/src/visitor.rs
+++ b/winch/codegen/src/visitor.rs
@@ -594,6 +594,7 @@ where
             OperandSize::S32,
             &mut |masm: &mut M, dst, src, size| {
                 masm.float_add(writable!(dst), dst, src, size)?;
+                masm.maybe_canonicalize_nan(writable!(dst), size)?;
                 Ok(TypedReg::f32(dst))
             },
         )
@@ -605,6 +606,7 @@ where
             OperandSize::S64,
             &mut |masm: &mut M, dst, src, size| {
                 masm.float_add(writable!(dst), dst, src, size)?;
+                masm.maybe_canonicalize_nan(writable!(dst), size)?;
                 Ok(TypedReg::f64(dst))
             },
         )
@@ -616,6 +618,7 @@ where
             OperandSize::S32,
             &mut |masm: &mut M, dst, src, size| {
                 masm.float_sub(writable!(dst), dst, src, size)?;
+                masm.maybe_canonicalize_nan(writable!(dst), size)?;
                 Ok(TypedReg::f32(dst))
             },
         )
@@ -627,6 +630,7 @@ where
             OperandSize::S64,
             &mut |masm: &mut M, dst, src, size| {
                 masm.float_sub(writable!(dst), dst, src, size)?;
+                masm.maybe_canonicalize_nan(writable!(dst), size)?;
                 Ok(TypedReg::f64(dst))
             },
         )
@@ -638,6 +642,7 @@ where
             OperandSize::S32,
             &mut |masm: &mut M, dst, src, size| {
                 masm.float_mul(writable!(dst), dst, src, size)?;
+                masm.maybe_canonicalize_nan(writable!(dst), size)?;
                 Ok(TypedReg::f32(dst))
             },
         )
@@ -649,6 +654,7 @@ where
             OperandSize::S64,
             &mut |masm: &mut M, dst, src, size| {
                 masm.float_mul(writable!(dst), dst, src, size)?;
+                masm.maybe_canonicalize_nan(writable!(dst), size)?;
                 Ok(TypedReg::f64(dst))
             },
         )
@@ -660,6 +666,7 @@ where
             OperandSize::S32,
             &mut |masm: &mut M, dst, src, size| {
                 masm.float_div(writable!(dst), dst, src, size)?;
+                masm.maybe_canonicalize_nan(writable!(dst), size)?;
                 Ok(TypedReg::f32(dst))
             },
         )
@@ -671,6 +678,7 @@ where
             OperandSize::S64,
             &mut |masm: &mut M, dst, src, size| {
                 masm.float_div(writable!(dst), dst, src, size)?;
+                masm.maybe_canonicalize_nan(writable!(dst), size)?;
                 Ok(TypedReg::f64(dst))
             },
         )
@@ -682,6 +690,7 @@ where
             OperandSize::S32,
             &mut |masm: &mut M, dst, src, size| {
                 masm.float_min(writable!(dst), dst, src, size)?;
+                masm.maybe_canonicalize_nan(writable!(dst), size)?;
                 Ok(TypedReg::f32(dst))
             },
         )
@@ -693,6 +702,7 @@ where
             OperandSize::S64,
             &mut |masm: &mut M, dst, src, size| {
                 masm.float_min(writable!(dst), dst, src, size)?;
+                masm.maybe_canonicalize_nan(writable!(dst), size)?;
                 Ok(TypedReg::f64(dst))
             },
         )
@@ -704,6 +714,7 @@ where
             OperandSize::S32,
             &mut |masm: &mut M, dst, src, size| {
                 masm.float_max(writable!(dst), dst, src, size)?;
+                masm.maybe_canonicalize_nan(writable!(dst), size)?;
                 Ok(TypedReg::f32(dst))
             },
         )
@@ -715,6 +726,7 @@ where
             OperandSize::S64,
             &mut |masm: &mut M, dst, src, size| {
                 masm.float_max(writable!(dst), dst, src, size)?;
+                masm.maybe_canonicalize_nan(writable!(dst), size)?;
                 Ok(TypedReg::f64(dst))
             },
         )
@@ -780,7 +792,12 @@ where
                 let builtin = env.builtins.floor_f32::<M::ABI>()?;
                 FnCall::emit::<M>(env, masm, cx, Callee::Builtin(builtin))
             },
-        )
+        )?;
+        let result = self.context.pop_to_reg(self.masm, None)?;
+        self.masm
+            .maybe_canonicalize_nan(writable!(result.into()), OperandSize::S32)?;
+        self.context.stack.push(result.into());
+        Ok(())
     }
 
     fn visit_f64_floor(&mut self) -> Self::Output {
@@ -793,7 +810,12 @@ where
                 let builtin = env.builtins.floor_f64::<M::ABI>()?;
                 FnCall::emit::<M>(env, masm, cx, Callee::Builtin(builtin))
             },
-        )
+        )?;
+        let result = self.context.pop_to_reg(self.masm, None)?;
+        self.masm
+            .maybe_canonicalize_nan(writable!(result.into()), OperandSize::S64)?;
+        self.context.stack.push(result.into());
+        Ok(())
     }
 
     fn visit_f32_ceil(&mut self) -> Self::Output {
@@ -806,7 +828,12 @@ where
                 let builtin = env.builtins.ceil_f32::<M::ABI>()?;
                 FnCall::emit::<M>(env, masm, cx, Callee::Builtin(builtin))
             },
-        )
+        )?;
+        let result = self.context.pop_to_reg(self.masm, None)?;
+        self.masm
+            .maybe_canonicalize_nan(writable!(result.into()), OperandSize::S32)?;
+        self.context.stack.push(result.into());
+        Ok(())
     }
 
     fn visit_f64_ceil(&mut self) -> Self::Output {
@@ -819,7 +846,12 @@ where
                 let builtin = env.builtins.ceil_f64::<M::ABI>()?;
                 FnCall::emit::<M>(env, masm, cx, Callee::Builtin(builtin))
             },
-        )
+        )?;
+        let result = self.context.pop_to_reg(self.masm, None)?;
+        self.masm
+            .maybe_canonicalize_nan(writable!(result.into()), OperandSize::S64)?;
+        self.context.stack.push(result.into());
+        Ok(())
     }
 
     fn visit_f32_nearest(&mut self) -> Self::Output {
@@ -832,7 +864,12 @@ where
                 let builtin = env.builtins.nearest_f32::<M::ABI>()?;
                 FnCall::emit::<M>(env, masm, cx, Callee::Builtin(builtin))
             },
-        )
+        )?;
+        let result = self.context.pop_to_reg(self.masm, None)?;
+        self.masm
+            .maybe_canonicalize_nan(writable!(result.into()), OperandSize::S32)?;
+        self.context.stack.push(result.into());
+        Ok(())
     }
 
     fn visit_f64_nearest(&mut self) -> Self::Output {
@@ -845,7 +882,12 @@ where
                 let builtin = env.builtins.nearest_f64::<M::ABI>()?;
                 FnCall::emit::<M>(env, masm, cx, Callee::Builtin(builtin))
             },
-        )
+        )?;
+        let result = self.context.pop_to_reg(self.masm, None)?;
+        self.masm
+            .maybe_canonicalize_nan(writable!(result.into()), OperandSize::S64)?;
+        self.context.stack.push(result.into());
+        Ok(())
     }
 
     fn visit_f32_trunc(&mut self) -> Self::Output {
@@ -858,7 +900,12 @@ where
                 let builtin = env.builtins.trunc_f32::<M::ABI>()?;
                 FnCall::emit::<M>(env, masm, cx, Callee::Builtin(builtin))
             },
-        )
+        )?;
+        let result = self.context.pop_to_reg(self.masm, None)?;
+        self.masm
+            .maybe_canonicalize_nan(writable!(result.into()), OperandSize::S32)?;
+        self.context.stack.push(result.into());
+        Ok(())
     }
 
     fn visit_f64_trunc(&mut self) -> Self::Output {
@@ -871,12 +918,18 @@ where
                 let builtin = env.builtins.trunc_f64::<M::ABI>()?;
                 FnCall::emit::<M>(env, masm, cx, Callee::Builtin(builtin))
             },
-        )
+        )?;
+        let result = self.context.pop_to_reg(self.masm, None)?;
+        self.masm
+            .maybe_canonicalize_nan(writable!(result.into()), OperandSize::S64)?;
+        self.context.stack.push(result.into());
+        Ok(())
     }
 
     fn visit_f32_sqrt(&mut self) -> Self::Output {
         self.context.unop(self.masm, |masm, reg| {
             masm.float_sqrt(writable!(reg), reg, OperandSize::S32)?;
+            masm.maybe_canonicalize_nan(writable!(reg), OperandSize::S32)?;
             Ok(TypedReg::f32(reg))
         })
     }
@@ -884,6 +937,7 @@ where
     fn visit_f64_sqrt(&mut self) -> Self::Output {
         self.context.unop(self.masm, |masm, reg| {
             masm.float_sqrt(writable!(reg), reg, OperandSize::S64)?;
+            masm.maybe_canonicalize_nan(writable!(reg), OperandSize::S64)?;
             Ok(TypedReg::f64(reg))
         })
     }
@@ -1097,6 +1151,7 @@ where
     fn visit_f32_demote_f64(&mut self) -> Self::Output {
         self.context.unop(self.masm, |masm, reg| {
             masm.demote(writable!(reg), reg)?;
+            masm.maybe_canonicalize_nan(writable!(reg), OperandSize::S32)?;
             Ok(TypedReg::f32(reg))
         })
     }
@@ -1104,6 +1159,7 @@ where
     fn visit_f64_promote_f32(&mut self) -> Self::Output {
         self.context.unop(self.masm, |masm, reg| {
             masm.promote(writable!(reg), reg)?;
+            masm.maybe_canonicalize_nan(writable!(reg), OperandSize::S64)?;
             Ok(TypedReg::f64(reg))
         })
     }
@@ -3724,6 +3780,7 @@ where
     fn visit_f32x4_demote_f64x2_zero(&mut self) -> Self::Output {
         self.context.unop(self.masm, |masm, reg| {
             masm.v128_demote(reg, writable!(reg))?;
+            masm.maybe_canonicalize_v128_nan(writable!(reg), OperandSize::S32)?;
             Ok(TypedReg::v128(reg))
         })
     }
@@ -3731,6 +3788,7 @@ where
     fn visit_f64x2_promote_low_f32x4(&mut self) -> Self::Output {
         self.context.unop(self.masm, |masm, reg| {
             masm.v128_promote(reg, writable!(reg))?;
+            masm.maybe_canonicalize_v128_nan(writable!(reg), OperandSize::S64)?;
             Ok(TypedReg::v128(reg))
         })
     }
@@ -4381,6 +4439,7 @@ where
         self.context
             .binop(self.masm, OperandSize::S32, |masm, dst, src, _size| {
                 masm.v128_add(dst, src, writable!(dst), V128AddKind::F32x4)?;
+                masm.maybe_canonicalize_v128_nan(writable!(dst), OperandSize::S32)?;
                 Ok(TypedReg::v128(dst))
             })
     }
@@ -4389,6 +4448,7 @@ where
         self.context
             .binop(self.masm, OperandSize::S64, |masm, dst, src, _size| {
                 masm.v128_add(dst, src, writable!(dst), V128AddKind::F64x2)?;
+                masm.maybe_canonicalize_v128_nan(writable!(dst), OperandSize::S64)?;
                 Ok(TypedReg::v128(dst))
             })
     }
@@ -4397,6 +4457,7 @@ where
         self.context
             .binop(self.masm, OperandSize::S32, |masm, dst, src, _size| {
                 masm.v128_sub(dst, src, writable!(dst), V128SubKind::F32x4)?;
+                masm.maybe_canonicalize_v128_nan(writable!(dst), OperandSize::S32)?;
                 Ok(TypedReg::v128(dst))
             })
     }
@@ -4405,22 +4466,34 @@ where
         self.context
             .binop(self.masm, OperandSize::S64, |masm, dst, src, _size| {
                 masm.v128_sub(dst, src, writable!(dst), V128SubKind::F64x2)?;
+                masm.maybe_canonicalize_v128_nan(writable!(dst), OperandSize::S64)?;
                 Ok(TypedReg::v128(dst))
             })
     }
 
     fn visit_f32x4_mul(&mut self) -> Self::Output {
-        self.masm.v128_mul(&mut self.context, V128MulKind::F32x4)
+        self.masm.v128_mul(&mut self.context, V128MulKind::F32x4)?;
+        let result = self.context.pop_to_reg(self.masm, None)?;
+        self.masm
+            .maybe_canonicalize_v128_nan(writable!(result.into()), OperandSize::S32)?;
+        self.context.stack.push(result.into());
+        Ok(())
     }
 
     fn visit_f64x2_mul(&mut self) -> Self::Output {
-        self.masm.v128_mul(&mut self.context, V128MulKind::F64x2)
+        self.masm.v128_mul(&mut self.context, V128MulKind::F64x2)?;
+        let result = self.context.pop_to_reg(self.masm, None)?;
+        self.masm
+            .maybe_canonicalize_v128_nan(writable!(result.into()), OperandSize::S64)?;
+        self.context.stack.push(result.into());
+        Ok(())
     }
 
     fn visit_f32x4_div(&mut self) -> Self::Output {
         self.context
             .binop(self.masm, OperandSize::S32, |masm, dst, src, size| {
                 masm.v128_div(dst, src, writable!(dst), size)?;
+                masm.maybe_canonicalize_v128_nan(writable!(dst), OperandSize::S32)?;
                 Ok(TypedReg::v128(dst))
             })
     }
@@ -4429,6 +4502,7 @@ where
         self.context
             .binop(self.masm, OperandSize::S64, |masm, dst, src, size| {
                 masm.v128_div(dst, src, writable!(dst), size)?;
+                masm.maybe_canonicalize_v128_nan(writable!(dst), OperandSize::S64)?;
                 Ok(TypedReg::v128(dst))
             })
     }
@@ -4443,6 +4517,7 @@ where
     fn visit_f32x4_ceil(&mut self) -> Self::Output {
         self.context.unop(self.masm, |masm, reg| {
             masm.v128_ceil(reg, writable!(reg), OperandSize::S32)?;
+            masm.maybe_canonicalize_v128_nan(writable!(reg), OperandSize::S32)?;
             Ok(TypedReg::v128(reg))
         })
     }
@@ -4457,6 +4532,7 @@ where
     fn visit_f64x2_ceil(&mut self) -> Self::Output {
         self.context.unop(self.masm, |masm, reg| {
             masm.v128_ceil(reg, writable!(reg), OperandSize::S64)?;
+            masm.maybe_canonicalize_v128_nan(writable!(reg), OperandSize::S64)?;
             Ok(TypedReg::v128(reg))
         })
     }
@@ -4464,6 +4540,7 @@ where
     fn visit_f32x4_sqrt(&mut self) -> Self::Output {
         self.context.unop(self.masm, |masm, reg| {
             masm.v128_sqrt(reg, writable!(reg), OperandSize::S32)?;
+            masm.maybe_canonicalize_v128_nan(writable!(reg), OperandSize::S32)?;
             Ok(TypedReg::v128(reg))
         })
     }
@@ -4471,6 +4548,7 @@ where
     fn visit_f32x4_floor(&mut self) -> Self::Output {
         self.context.unop(self.masm, |masm, reg| {
             masm.v128_floor(reg, writable!(reg), OperandSize::S32)?;
+            masm.maybe_canonicalize_v128_nan(writable!(reg), OperandSize::S32)?; // 32-bit lanes
             Ok(TypedReg::v128(reg))
         })
     }
@@ -4478,6 +4556,7 @@ where
     fn visit_f64x2_sqrt(&mut self) -> Self::Output {
         self.context.unop(self.masm, |masm, reg| {
             masm.v128_sqrt(reg, writable!(reg), OperandSize::S64)?;
+            masm.maybe_canonicalize_v128_nan(writable!(reg), OperandSize::S64)?; // 64-bit lanes
             Ok(TypedReg::v128(reg))
         })
     }
@@ -4485,6 +4564,7 @@ where
     fn visit_f64x2_floor(&mut self) -> Self::Output {
         self.context.unop(self.masm, |masm, reg| {
             masm.v128_floor(reg, writable!(reg), OperandSize::S64)?;
+            masm.maybe_canonicalize_v128_nan(writable!(reg), OperandSize::S64)?; // 64-bit lanes
             Ok(TypedReg::v128(reg))
         })
     }
@@ -4492,6 +4572,7 @@ where
     fn visit_f32x4_nearest(&mut self) -> Self::Output {
         self.context.unop(self.masm, |masm, reg| {
             masm.v128_nearest(reg, writable!(reg), OperandSize::S32)?;
+            masm.maybe_canonicalize_v128_nan(writable!(reg), OperandSize::S32)?; // 32-bit lanes
             Ok(TypedReg::v128(reg))
         })
     }
@@ -4499,18 +4580,29 @@ where
     fn visit_f64x2_nearest(&mut self) -> Self::Output {
         self.context.unop(self.masm, |masm, reg| {
             masm.v128_nearest(reg, writable!(reg), OperandSize::S64)?;
+            masm.maybe_canonicalize_v128_nan(writable!(reg), OperandSize::S64)?; // 64-bit lanes
             Ok(TypedReg::v128(reg))
         })
     }
 
     fn visit_f32x4_trunc(&mut self) -> Self::Output {
         self.masm
-            .v128_trunc(&mut self.context, V128TruncKind::F32x4)
+            .v128_trunc(&mut self.context, V128TruncKind::F32x4)?; // presumably pushes its result
+        let result = self.context.pop_to_reg(self.masm, None)?; // pop it into a register
+        self.masm
+            .maybe_canonicalize_v128_nan(writable!(result.into()), OperandSize::S32)?;
+        self.context.stack.push(result.into()); // push the same register back as the result
+        Ok(())
     }
 
     fn visit_f64x2_trunc(&mut self) -> Self::Output {
         self.masm
-            .v128_trunc(&mut self.context, V128TruncKind::F64x2)
+            .v128_trunc(&mut self.context, V128TruncKind::F64x2)?; // presumably pushes its result
+        let result = self.context.pop_to_reg(self.masm, None)?; // pop it into a register
+        self.masm
+            .maybe_canonicalize_v128_nan(writable!(result.into()), OperandSize::S64)?;
+        self.context.stack.push(result.into()); // push the same register back as the result
+        Ok(())
     }
 
     fn visit_v128_load32_zero(&mut self, memarg: MemArg) -> Self::Output {
@@ -4565,6 +4657,7 @@ where
         self.context
             .binop(self.masm, OperandSize::S32, |masm, dst, src, _size| {
                 masm.v128_min(dst, src, writable!(dst), V128MinKind::F32x4)?;
+                masm.maybe_canonicalize_v128_nan(writable!(dst), OperandSize::S32)?;
                 Ok(TypedReg::v128(dst))
             })
     }
@@ -4573,6 +4666,7 @@ where
         self.context
             .binop(self.masm, OperandSize::S64, |masm, dst, src, _size| {
                 masm.v128_min(dst, src, writable!(dst), V128MinKind::F64x2)?;
+                masm.maybe_canonicalize_v128_nan(writable!(dst), OperandSize::S64)?;
                 Ok(TypedReg::v128(dst))
             })
     }
@@ -4581,6 +4675,7 @@ where
         self.context
             .binop(self.masm, OperandSize::S32, |masm, dst, src, _size| {
                 masm.v128_max(dst, src, writable!(dst), V128MaxKind::F32x4)?;
+                masm.maybe_canonicalize_v128_nan(writable!(dst), OperandSize::S32)?;
                 Ok(TypedReg::v128(dst))
             })
     }
@@ -4589,6 +4684,7 @@ where
         self.context
             .binop(self.masm, OperandSize::S64, |masm, dst, src, _size| {
                 masm.v128_max(dst, src, writable!(dst), V128MaxKind::F64x2)?;
+                masm.maybe_canonicalize_v128_nan(writable!(dst), OperandSize::S64)?;
                 Ok(TypedReg::v128(dst))
             })
     }