From d1c7eaa16274bf30dc196336634b76683b1400bd Mon Sep 17 00:00:00 2001
From: Brian Cain <brian.cain@oss.qualcomm.com>
Date: Sat, 2 May 2026 19:44:41 -0700
Subject: [PATCH 1/7] Introduce WASM_CPU_SUPPORTS_UNALIGNED_SIMD_ACCESS for
 V128 load/store

Split V128 memory access (LOAD_V128/STORE_V128) into a separate
three-way guard independent of the scalar unaligned-access flag,
and add a target_supports_unaligned_simd field to AOTCompContext.
---
 core/config.h                                 |  13 ++
 core/iwasm/common/wasm_runtime_common.h       | 215 +++++++++++-------
 core/iwasm/compilation/aot_llvm.c             |   8 +
 core/iwasm/compilation/aot_llvm.h             |   5 +
 core/iwasm/compilation/simd/simd_load_store.c |  11 +-
 5 files changed, 166 insertions(+), 86 deletions(-)

diff --git a/core/config.h b/core/config.h
index 31404deb95..3f629c199a 100644
--- a/core/config.h
+++ b/core/config.h
@@ -276,6 +276,19 @@
 #endif
 #endif
 
+/* Whether the CPU supports unaligned SIMD/vector memory access.
+ * Some architectures have dedicated unaligned-load vector instructions,
+ * allowing V128 access at any alignment even when scalar loads require
+ * natural alignment. */
+#ifndef WASM_CPU_SUPPORTS_UNALIGNED_SIMD_ACCESS
+#if defined(BUILD_TARGET_X86_32) || defined(BUILD_TARGET_X86_64) \
+    || defined(BUILD_TARGET_AARCH64)
+#define WASM_CPU_SUPPORTS_UNALIGNED_SIMD_ACCESS 1
+#else
+#define WASM_CPU_SUPPORTS_UNALIGNED_SIMD_ACCESS 0
+#endif
+#endif
+
 /* WASM Interpreter labels-as-values feature */
 #ifndef WASM_ENABLE_LABELS_AS_VALUES
 #ifdef __GNUC__
diff --git a/core/iwasm/common/wasm_runtime_common.h b/core/iwasm/common/wasm_runtime_common.h
index 0fdece2663..e168668d51 100644
--- a/core/iwasm/common/wasm_runtime_common.h
+++ b/core/iwasm/common/wasm_runtime_common.h
@@ -272,93 +272,10 @@ STORE_U16(void *addr, uint16_t value)
     ((uint8_t *)(addr))[1] = u.u8[1];
 }
 
-static inline void
-STORE_V128(void *addr, V128 value)
-{
-    uintptr_t addr_ = (uintptr_t)(addr);
-    union {
-        V128 val;
-        uint64 u64[2];
-        uint32 u32[4];
-        uint16 u16[8];
-        uint8 u8[16];
-    } u;
-
-    if ((addr_ & (uintptr_t)15) == 0) {
-        *(V128 *)addr = value;
-    }
-    else if ((addr_ & (uintptr_t)7) == 0) {
-        u.val = value;
-        ((uint64 *)(addr))[0] = u.u64[0];
-        ((uint64 *)(addr))[1] = u.u64[1];
-    }
-    else if ((addr_ & (uintptr_t)3) == 0) {
-        u.val = value;
-        ((uint32 *)addr)[0] = u.u32[0];
-        ((uint32 *)addr)[1] = u.u32[1];
-        ((uint32 *)addr)[2] = u.u32[2];
-        ((uint32 *)addr)[3] = u.u32[3];
-    }
-    else if ((addr_ & (uintptr_t)1) == 0) {
-        u.val = value;
-        ((uint16 *)addr)[0] = u.u16[0];
-        ((uint16 *)addr)[1] = u.u16[1];
-        ((uint16 *)addr)[2] = u.u16[2];
-        ((uint16 *)addr)[3] = u.u16[3];
-        ((uint16 *)addr)[4] = u.u16[4];
-        ((uint16 *)addr)[5] = u.u16[5];
-        ((uint16 *)addr)[6] = u.u16[6];
-        ((uint16 *)addr)[7] = u.u16[7];
-    }
-    else {
-        u.val = value;
-        for (int i = 0; i < 16; i++)
-            ((uint8 *)addr)[i] = u.u8[i];
-    }
-}
+/* STORE_V128 / LOAD_V128 are defined separately below, guarded by
+ * WASM_CPU_SUPPORTS_UNALIGNED_SIMD_ACCESS (see Block after line 474). */
 
 /* For LOAD opcodes */
-static inline V128
-LOAD_V128(void *addr)
-{
-    uintptr_t addr1 = (uintptr_t)addr;
-    union {
-        V128 val;
-        uint64 u64[2];
-        uint32 u32[4];
-        uint16 u16[8];
-        uint8 u8[16];
-    } u;
-    if ((addr1 & (uintptr_t)15) == 0)
-        return *(V128 *)addr;
-
-    if ((addr1 & (uintptr_t)7) == 0) {
-        u.u64[0] = ((uint64 *)addr)[0];
-        u.u64[1] = ((uint64 *)addr)[1];
-    }
-    else if ((addr1 & (uintptr_t)3) == 0) {
-        u.u32[0] = ((uint32 *)addr)[0];
-        u.u32[1] = ((uint32 *)addr)[1];
-        u.u32[2] = ((uint32 *)addr)[2];
-        u.u32[3] = ((uint32 *)addr)[3];
-    }
-    else if ((addr1 & (uintptr_t)1) == 0) {
-        u.u16[0] = ((uint16 *)addr)[0];
-        u.u16[1] = ((uint16 *)addr)[1];
-        u.u16[2] = ((uint16 *)addr)[2];
-        u.u16[3] = ((uint16 *)addr)[3];
-        u.u16[4] = ((uint16 *)addr)[4];
-        u.u16[5] = ((uint16 *)addr)[5];
-        u.u16[6] = ((uint16 *)addr)[6];
-        u.u16[7] = ((uint16 *)addr)[7];
-    }
-    else {
-        for (int i = 0; i < 16; i++)
-            u.u8[i] = ((uint8 *)addr)[i];
-    }
-    return u.val;
-}
-
 static inline int64
 LOAD_I64(void *addr)
 {
@@ -473,6 +390,134 @@ LOAD_I16(void *addr)
 
 #endif /* WASM_CPU_SUPPORTS_UNALIGNED_ADDR_ACCESS != 0 */
 
+/*
+ * LOAD_V128 / STORE_V128 — WASM linear memory V128 access.
+ *
+ * These are guarded by WASM_CPU_SUPPORTS_UNALIGNED_SIMD_ACCESS rather than
+ * WASM_CPU_SUPPORTS_UNALIGNED_ADDR_ACCESS because some architectures have
+ * different alignment rules for scalar vs vector memory operations,
+ * e.g. architectures with dedicated unaligned-load vector instructions.
+ *
+ * PUT_V128_TO_ADDR / GET_V128_FROM_ADDR (frame-local access) remain
+ * guarded by the scalar flag above since frame locals are accessed via
+ * scalar C operations, not vector instructions.
+ */
+#if WASM_CPU_SUPPORTS_UNALIGNED_ADDR_ACCESS != 0
+
+/* Already defined: LOAD_V128, STORE_V128 as direct pointer casts */
+
+#elif WASM_CPU_SUPPORTS_UNALIGNED_SIMD_ACCESS != 0
+
+/* The target's SIMD unit supports unaligned vector access, but scalar loads
+ * require natural alignment. Use memcpy which is safe at any alignment and
+ * allows the compiler to select the best instruction sequence for the
+ * target. */
+static inline V128
+LOAD_V128(void *addr)
+{
+    V128 v;
+    memcpy(&v, addr, sizeof(V128));
+    return v;
+}
+
+static inline void
+STORE_V128(void *addr, V128 value)
+{
+    memcpy(addr, &value, sizeof(V128));
+}
+
+#else /* !UNALIGNED_ADDR_ACCESS && !UNALIGNED_SIMD_ACCESS */
+
+/* Neither scalar nor vector unaligned access is supported.
+ * Check alignment at runtime and use the widest safe access. */
+static inline void
+STORE_V128(void *addr, V128 value)
+{
+    uintptr_t addr_ = (uintptr_t)(addr);
+    union {
+        V128 val;
+        uint64 u64[2];
+        uint32 u32[4];
+        uint16 u16[8];
+        uint8 u8[16];
+    } u;
+
+    if ((addr_ & (uintptr_t)15) == 0) {
+        *(V128 *)addr = value;
+    }
+    else if ((addr_ & (uintptr_t)7) == 0) {
+        u.val = value;
+        ((uint64 *)(addr))[0] = u.u64[0];
+        ((uint64 *)(addr))[1] = u.u64[1];
+    }
+    else if ((addr_ & (uintptr_t)3) == 0) {
+        u.val = value;
+        ((uint32 *)addr)[0] = u.u32[0];
+        ((uint32 *)addr)[1] = u.u32[1];
+        ((uint32 *)addr)[2] = u.u32[2];
+        ((uint32 *)addr)[3] = u.u32[3];
+    }
+    else if ((addr_ & (uintptr_t)1) == 0) {
+        u.val = value;
+        ((uint16 *)addr)[0] = u.u16[0];
+        ((uint16 *)addr)[1] = u.u16[1];
+        ((uint16 *)addr)[2] = u.u16[2];
+        ((uint16 *)addr)[3] = u.u16[3];
+        ((uint16 *)addr)[4] = u.u16[4];
+        ((uint16 *)addr)[5] = u.u16[5];
+        ((uint16 *)addr)[6] = u.u16[6];
+        ((uint16 *)addr)[7] = u.u16[7];
+    }
+    else {
+        u.val = value;
+        for (int i = 0; i < 16; i++)
+            ((uint8 *)addr)[i] = u.u8[i];
+    }
+}
+
+static inline V128
+LOAD_V128(void *addr)
+{
+    uintptr_t addr1 = (uintptr_t)addr;
+    union {
+        V128 val;
+        uint64 u64[2];
+        uint32 u32[4];
+        uint16 u16[8];
+        uint8 u8[16];
+    } u;
+    if ((addr1 & (uintptr_t)15) == 0)
+        return *(V128 *)addr;
+
+    if ((addr1 & (uintptr_t)7) == 0) {
+        u.u64[0] = ((uint64 *)addr)[0];
+        u.u64[1] = ((uint64 *)addr)[1];
+    }
+    else if ((addr1 & (uintptr_t)3) == 0) {
+        u.u32[0] = ((uint32 *)addr)[0];
+        u.u32[1] = ((uint32 *)addr)[1];
+        u.u32[2] = ((uint32 *)addr)[2];
+        u.u32[3] = ((uint32 *)addr)[3];
+    }
+    else if ((addr1 & (uintptr_t)1) == 0) {
+        u.u16[0] = ((uint16 *)addr)[0];
+        u.u16[1] = ((uint16 *)addr)[1];
+        u.u16[2] = ((uint16 *)addr)[2];
+        u.u16[3] = ((uint16 *)addr)[3];
+        u.u16[4] = ((uint16 *)addr)[4];
+        u.u16[5] = ((uint16 *)addr)[5];
+        u.u16[6] = ((uint16 *)addr)[6];
+        u.u16[7] = ((uint16 *)addr)[7];
+    }
+    else {
+        for (int i = 0; i < 16; i++)
+            u.u8[i] = ((uint8 *)addr)[i];
+    }
+    return u.val;
+}
+
+#endif /* WASM_CPU_SUPPORTS_UNALIGNED_SIMD_ACCESS */
+
 #if WASM_ENABLE_SHARED_MEMORY != 0
 #define SHARED_MEMORY_LOCK(memory) shared_memory_lock(memory)
 #define SHARED_MEMORY_UNLOCK(memory) shared_memory_unlock(memory)
diff --git a/core/iwasm/compilation/aot_llvm.c b/core/iwasm/compilation/aot_llvm.c
index 1a9da63fac..f1ee1eecec 100644
--- a/core/iwasm/compilation/aot_llvm.c
+++ b/core/iwasm/compilation/aot_llvm.c
@@ -3413,6 +3413,14 @@ aot_create_comp_context(const AOTCompData *comp_data, aot_comp_option_t option)
         }
     }
 
+    /* Determine whether the target's SIMD/vector unit supports unaligned
+     * memory access.  x86_64 and aarch64 can handle unaligned vector
+     * loads/stores natively.  This informs alignment annotations emitted
+     * for SIMD load/store IR. */
+    comp_ctx->target_supports_unaligned_simd =
+        !strcmp(comp_ctx->target_arch, "x86_64")
+        || !strncmp(comp_ctx->target_arch, "aarch64", 7);
+
     if (!(target_data_ref =
               LLVMCreateTargetDataLayout(comp_ctx->target_machine))) {
         aot_set_last_error("create LLVM target data layout failed.");
diff --git a/core/iwasm/compilation/aot_llvm.h b/core/iwasm/compilation/aot_llvm.h
index 5bd75a38ce..d89e776eb4 100644
--- a/core/iwasm/compilation/aot_llvm.h
+++ b/core/iwasm/compilation/aot_llvm.h
@@ -494,6 +494,11 @@ typedef struct AOTCompContext {
     bool enable_segue_f64_store;
     bool enable_segue_v128_store;
 
+    /* Whether the target's SIMD/vector unit supports unaligned access.
+     * When true, SIMD load/store IR can use align 1 without the backend
+     * decomposing to byte-by-byte access. */
+    bool target_supports_unaligned_simd;
+
     /* Whether optimize the JITed code */
     bool optimize;
 
diff --git a/core/iwasm/compilation/simd/simd_load_store.c b/core/iwasm/compilation/simd/simd_load_store.c
index d3bbcc9650..ee787c95a2 100644
--- a/core/iwasm/compilation/simd/simd_load_store.c
+++ b/core/iwasm/compilation/simd/simd_load_store.c
@@ -35,7 +35,16 @@ simd_load(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx, uint32 align,
         return NULL;
     }
 
-    LLVMSetAlignment(data, 1);
+    /* WASM SIMD does not guarantee alignment for v128 loads.
+     * On targets whose SIMD unit handles unaligned access natively
+     * (x86 SSE, aarch64 NEON), align 1 is safe and the backend will
+     * select the right instruction.
+     * On other targets, use the WASM alignment hint so the backend
+     * can generate wider (aligned) loads instead of byte-by-byte. */
+    if (comp_ctx->target_supports_unaligned_simd)
+        LLVMSetAlignment(data, 1);
+    else
+        LLVMSetAlignment(data, 1 << align);
 
     return data;
 }

From 097329aa45277f0d0031c195be2b6669f2905858 Mon Sep 17 00:00:00 2001
From: Brian Cain <brian.cain@oss.qualcomm.com>
Date: Sat, 2 May 2026 19:44:41 -0700
Subject: [PATCH 2/7] feat(hexagon): add Hexagon ISA target support

Add interpreter and AOT execution support for Qualcomm Hexagon DSP:
invokeNative trampoline, ELF relocation handler with PLT stubs,
wamrc --target=hexagon with auto -small-data, SIMD enablement, and
cross-compilation platform cmake.
---
 build-scripts/config_common.cmake             |   2 +
 .../toolchains/hexagon-linux-musl.cmake       |  22 +
 core/config.h                                 |   7 +-
 core/iwasm/aot/arch/aot_reloc_hexagon.c       | 761 ++++++++++++++++++
 core/iwasm/aot/iwasm_aot.cmake                |   2 +
 core/iwasm/common/arch/invokeNative_hexagon.s | 137 ++++
 core/iwasm/common/iwasm_common.cmake          |   2 +
 core/iwasm/common/wasm_runtime_common.h       |  13 +-
 core/iwasm/compilation/aot_llvm.c             |  48 +-
 core/iwasm/compilation/simd/simd_load_store.c |   4 +-
 product-mini/platforms/hexagon/CMakeLists.txt | 144 ++++
 .../wamr-test-suites/spec-test-script/all.py  |  25 +
 12 files changed, 1150 insertions(+), 17 deletions(-)
 create mode 100644 build-scripts/toolchains/hexagon-linux-musl.cmake
 create mode 100644 core/iwasm/aot/arch/aot_reloc_hexagon.c
 create mode 100644 core/iwasm/common/arch/invokeNative_hexagon.s
 create mode 100644 product-mini/platforms/hexagon/CMakeLists.txt

diff --git a/build-scripts/config_common.cmake b/build-scripts/config_common.cmake
index ee00203b28..e9d9848e3e 100644
--- a/build-scripts/config_common.cmake
+++ b/build-scripts/config_common.cmake
@@ -45,6 +45,8 @@ elseif (WAMR_BUILD_TARGET STREQUAL "RISCV32_ILP32")
   add_definitions(-DBUILD_TARGET_RISCV32_ILP32)
 elseif (WAMR_BUILD_TARGET STREQUAL "ARC")
   add_definitions(-DBUILD_TARGET_ARC)
+elseif (WAMR_BUILD_TARGET STREQUAL "HEXAGON")
+  add_definitions(-DBUILD_TARGET_HEXAGON)
 else ()
   message (FATAL_ERROR "-- WAMR build target isn't set")
 endif ()
diff --git a/build-scripts/toolchains/hexagon-linux-musl.cmake b/build-scripts/toolchains/hexagon-linux-musl.cmake
new file mode 100644
index 0000000000..7465f80350
--- /dev/null
+++ b/build-scripts/toolchains/hexagon-linux-musl.cmake
@@ -0,0 +1,22 @@
+# Copyright (C) 2024 Qualcomm Innovation Center, Inc.  All rights reserved.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+# CMake toolchain file for cross-compiling to Hexagon Linux (musl)
+# Requires CodeLinaro hexagon-unknown-linux-musl toolchain and clang-22/lld-22.
+
+set(CMAKE_SYSTEM_NAME Linux)
+set(CMAKE_SYSTEM_PROCESSOR hexagon)
+
+set(CMAKE_C_COMPILER hexagon-unknown-linux-musl-clang)
+set(CMAKE_CXX_COMPILER hexagon-unknown-linux-musl-clang++)
+set(CMAKE_ASM_COMPILER hexagon-unknown-linux-musl-clang)
+set(CMAKE_AR llvm-ar-22)
+set(CMAKE_RANLIB llvm-ranlib-22)
+
+set(CMAKE_C_FLAGS_INIT "-mv68 -G0")
+set(CMAKE_CXX_FLAGS_INIT "-mv68 -G0")
+set(CMAKE_EXE_LINKER_FLAGS_INIT "-static")
+
+set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
+set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
+set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
diff --git a/core/config.h b/core/config.h
index 3f629c199a..6b2d8b6694 100644
--- a/core/config.h
+++ b/core/config.h
@@ -22,7 +22,8 @@
     && !defined(BUILD_TARGET_RISCV32_ILP32D) \
     && !defined(BUILD_TARGET_RISCV32_ILP32F) \
     && !defined(BUILD_TARGET_RISCV32_ILP32) \
-    && !defined(BUILD_TARGET_ARC)
+    && !defined(BUILD_TARGET_ARC) \
+    && !defined(BUILD_TARGET_HEXAGON)
 /* clang-format on */
 #if defined(__x86_64__) || defined(__x86_64)
 #define BUILD_TARGET_X86_64
@@ -52,6 +53,8 @@
 #define BUILD_TARGET_RISCV32_ILP32D
 #elif defined(__arc__)
 #define BUILD_TARGET_ARC
+#elif defined(__hexagon__)
+#define BUILD_TARGET_HEXAGON
 #else
 #error "Build target isn't set"
 #endif
@@ -282,7 +285,7 @@
  * natural alignment. */
 #ifndef WASM_CPU_SUPPORTS_UNALIGNED_SIMD_ACCESS
 #if defined(BUILD_TARGET_X86_32) || defined(BUILD_TARGET_X86_64) \
-    || defined(BUILD_TARGET_AARCH64)
+    || defined(BUILD_TARGET_AARCH64) || defined(BUILD_TARGET_HEXAGON)
 #define WASM_CPU_SUPPORTS_UNALIGNED_SIMD_ACCESS 1
 #else
 #define WASM_CPU_SUPPORTS_UNALIGNED_SIMD_ACCESS 0
diff --git a/core/iwasm/aot/arch/aot_reloc_hexagon.c b/core/iwasm/aot/arch/aot_reloc_hexagon.c
new file mode 100644
index 0000000000..4bcfa4984d
--- /dev/null
+++ b/core/iwasm/aot/arch/aot_reloc_hexagon.c
@@ -0,0 +1,761 @@
+/*
+ * Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ */
+
+#include "aot_reloc.h"
+
+/*
+ * Hexagon ELF relocation types.
+ * Reference: "Qualcomm Hexagon Application Binary Interface User Guide"
+ * https://docs.qualcomm.com/doc/80-N2040-23/80-N2040-23_REV_K_Qualcomm_Hexagon_Application_Binary_Interface_User_Guide.pdf
+ */
+#define R_HEX_NONE 0
+#define R_HEX_B22_PCREL 1
+#define R_HEX_B15_PCREL 2
+#define R_HEX_B7_PCREL 3
+#define R_HEX_LO16 4
+#define R_HEX_HI16 5
+#define R_HEX_32 6
+#define R_HEX_16 7
+#define R_HEX_8 8
+#define R_HEX_GPREL16_0 9
+#define R_HEX_GPREL16_1 10
+#define R_HEX_GPREL16_2 11
+#define R_HEX_GPREL16_3 12
+#define R_HEX_B13_PCREL 14
+#define R_HEX_B9_PCREL 15
+#define R_HEX_B32_PCREL_X 16
+#define R_HEX_32_6_X 17
+#define R_HEX_B22_PCREL_X 18
+#define R_HEX_B15_PCREL_X 19
+#define R_HEX_B13_PCREL_X 20
+#define R_HEX_B9_PCREL_X 21
+#define R_HEX_B7_PCREL_X 22
+#define R_HEX_16_X 23
+#define R_HEX_12_X 24
+#define R_HEX_11_X 25
+#define R_HEX_10_X 26
+#define R_HEX_9_X 27
+#define R_HEX_8_X 28
+#define R_HEX_7_X 29
+#define R_HEX_6_X 30
+#define R_HEX_32_PCREL 31
+#define R_HEX_6_PCREL_X 65
+
+/*
+ * Hexagon instruction bit-field masks for relocations.
+ * These masks identify which bits of a 32-bit instruction word
+ * carry the immediate value.  The apply_mask() function disperses
+ * data bits into these positions.
+ */
+#define MASK_B22 0x01ff3ffe  /* B22_PCREL: bits [24:16],[13:1] */
+#define MASK_B15 0x00df20fe  /* B15_PCREL: bits [23:21],[17],[13:1] */
+#define MASK_B13 0x00202ffe  /* B13_PCREL: bits [21],[13:1] */
+#define MASK_B9 0x003000fe   /* B9_PCREL:  bits [21:20],[7:1] */
+#define MASK_B7 0x00001f18   /* B7_PCREL:  bits [12:8],[4:3] */
+#define MASK_LO16 0x00c03fff /* LO16/HI16: bits [19:18],[13:0] */
+#define MASK_X26 0x0fff3fff  /* 32_6_X / B32_PCREL_X: bits [27:16],[13:0] */
+
+/*
+ * Hexagon compiler runtime helpers.
+ * Hexagon has no hardware divide instruction; LLVM emits calls to
+ * compiler-rt/libgcc helper functions for integer division, modulo,
+ * and certain floating-point operations.  These must be resolvable
+ * when loading AOT modules.
+ */
+/* clang-format off */
+void __hexagon_divsi3(void);
+void __hexagon_modsi3(void);
+void __hexagon_udivsi3(void);
+void __hexagon_umodsi3(void);
+void __hexagon_divdi3(void);
+void __hexagon_moddi3(void);
+void __hexagon_udivdi3(void);
+void __hexagon_umoddi3(void);
+void __hexagon_udivmodsi4(void);
+void __hexagon_udivmoddi4(void);
+
+void __hexagon_divsf3(void);
+void __hexagon_divdf3(void);
+void __hexagon_adddf3(void);
+void __hexagon_subdf3(void);
+void __hexagon_muldf3(void);
+void __hexagon_sqrtf(void);
+void __hexagon_sqrtdf2(void);
+void __hexagon_fmadf4(void);
+void __hexagon_fmadf5(void);
+
+void __hexagon_memcpy_likely_aligned_min32bytes_mult8bytes(void);
+/* clang-format on */
+
+/* clang-format off */
+static SymbolMap target_sym_map[] = {
+    REG_COMMON_SYMBOLS
+    /* Integer division/modulo helpers */
+    REG_SYM(__hexagon_divsi3),
+    REG_SYM(__hexagon_modsi3),
+    REG_SYM(__hexagon_udivsi3),
+    REG_SYM(__hexagon_umodsi3),
+    REG_SYM(__hexagon_divdi3),
+    REG_SYM(__hexagon_moddi3),
+    REG_SYM(__hexagon_udivdi3),
+    REG_SYM(__hexagon_umoddi3),
+    REG_SYM(__hexagon_udivmodsi4),
+    REG_SYM(__hexagon_udivmoddi4),
+    /* Floating-point helpers */
+    REG_SYM(__hexagon_divsf3),
+    REG_SYM(__hexagon_divdf3),
+    REG_SYM(__hexagon_adddf3),
+    REG_SYM(__hexagon_subdf3),
+    REG_SYM(__hexagon_muldf3),
+    REG_SYM(__hexagon_sqrtf),
+    REG_SYM(__hexagon_sqrtdf2),
+    REG_SYM(__hexagon_fmadf4),
+    REG_SYM(__hexagon_fmadf5),
+    /* Optimized memory operations */
+    REG_SYM(__hexagon_memcpy_likely_aligned_min32bytes_mult8bytes),
+};
+/* clang-format on */
+
+static void
+set_error_buf(char *error_buf, uint32 error_buf_size, const char *string)
+{
+    if (error_buf != NULL)
+        snprintf(error_buf, error_buf_size, "%s", string);
+}
+
+SymbolMap *
+get_target_symbol_map(uint32 *sym_num)
+{
+    *sym_num = sizeof(target_sym_map) / sizeof(SymbolMap);
+    return target_sym_map;
+}
+
+void
+get_current_target(char *target_buf, uint32 target_buf_size)
+{
+    snprintf(target_buf, target_buf_size, "hexagon");
+}
+
+/*
+ * Packet parse bits [15:14]: when both zero the word is a duplex.
+ */
+#define INST_PARSE_PACKET_END 0x0000c000
+
+static bool
+is_duplex(uint32 insn)
+{
+    return (INST_PARSE_PACKET_END & insn) == 0;
+}
+
+/* Instruction opcode → relocation mask table for R_HEX_6_X */
+/* clang-format off */
+static const struct { uint32 cmp_mask; uint32 reloc_mask; } r6_masks[] = {
+    { 0x38000000, 0x0000201f }, { 0x39000000, 0x0000201f },
+    { 0x3e000000, 0x00001f80 }, { 0x3f000000, 0x00001f80 },
+    { 0x40000000, 0x000020f8 }, { 0x41000000, 0x000007e0 },
+    { 0x42000000, 0x000020f8 }, { 0x43000000, 0x000007e0 },
+    { 0x44000000, 0x000020f8 }, { 0x45000000, 0x000007e0 },
+    { 0x46000000, 0x000020f8 }, { 0x47000000, 0x000007e0 },
+    { 0x6a000000, 0x00001f80 }, { 0x7c000000, 0x001f2000 },
+    { 0x9a000000, 0x00000f60 }, { 0x9b000000, 0x00000f60 },
+    { 0x9c000000, 0x00000f60 }, { 0x9d000000, 0x00000f60 },
+    { 0x9f000000, 0x001f0100 }, { 0xab000000, 0x0000003f },
+    { 0xad000000, 0x0000003f }, { 0xaf000000, 0x00030078 },
+    { 0xd7000000, 0x006020e0 }, { 0xd8000000, 0x006020e0 },
+    { 0xdb000000, 0x006020e0 }, { 0xdf000000, 0x006020e0 },
+};
+/* clang-format on */
+
+static uint32
+get_mask_r6(uint32 insn, char *error_buf, uint32 error_buf_size)
+{
+    uint32 i;
+
+    if (is_duplex(insn))
+        return 0x03f00000;
+
+    for (i = 0; i < sizeof(r6_masks) / sizeof(r6_masks[0]); i++) {
+        if ((insn & 0xff000000) == r6_masks[i].cmp_mask)
+            return r6_masks[i].reloc_mask;
+    }
+
+    set_error_buf(error_buf, error_buf_size,
+                  "AOT module load failed: "
+                  "unrecognized instruction for 6_X relocation.");
+    return 0;
+}
+
+static uint32
+get_mask_r8(uint32 insn)
+{
+    if ((0xff000000 & insn) == 0xde000000)
+        return 0x00e020e8;
+    if ((0xff000000 & insn) == 0x3c000000)
+        return 0x0000207f;
+    return 0x00001fe0;
+}
+
+static uint32
+get_mask_r11(uint32 insn)
+{
+    if (is_duplex(insn))
+        return 0x03f00000;
+    if ((0xff000000 & insn) == 0xa1000000)
+        return 0x060020ff;
+    return 0x06003fe0;
+}
+
+static uint32
+get_mask_r16(uint32 insn, char *error_buf, uint32 error_buf_size)
+{
+    uint32 i;
+
+    if (is_duplex(insn))
+        return 0x03f00000;
+
+    /* Clear end-packet-parse bits for matching */
+    insn = insn & ~INST_PARSE_PACKET_END;
+
+    if ((0xff000000 & insn) == 0x48000000)
+        return 0x061f20ff;
+    if ((0xff000000 & insn) == 0x49000000)
+        return 0x061f3fe0;
+    if ((0xff000000 & insn) == 0x78000000)
+        return 0x00df3fe0;
+    if ((0xff000000 & insn) == 0xb0000000)
+        return 0x0fe03fe0;
+
+    if ((0xff802000 & insn) == 0x74000000)
+        return 0x00001fe0;
+    if ((0xff802000 & insn) == 0x74002000)
+        return 0x00001fe0;
+    if ((0xff802000 & insn) == 0x74800000)
+        return 0x00001fe0;
+    if ((0xff802000 & insn) == 0x74802000)
+        return 0x00001fe0;
+
+    /* Fall back to r6 table */
+    for (i = 0; i < sizeof(r6_masks) / sizeof(r6_masks[0]); i++) {
+        if ((insn & 0xff000000) == r6_masks[i].cmp_mask)
+            return r6_masks[i].reloc_mask;
+    }
+
+    set_error_buf(error_buf, error_buf_size,
+                  "AOT module load failed: "
+                  "unrecognized instruction for 16_X relocation.");
+    return 0;
+}
+
+/* Scatter bits from 'data' into positions indicated by set bits in 'mask'. */
+static uint32
+apply_mask(uint32 mask, uint32 data)
+{
+    uint32 result = 0;
+    uint32 off = 0;
+    uint32 bit;
+
+    for (bit = 0; bit < 32; bit++) {
+        uint32 val_bit = (data >> off) & 1;
+        uint32 mask_bit = (mask >> bit) & 1;
+        if (mask_bit) {
+            result |= (val_bit << bit);
+            off++;
+        }
+    }
+    return result;
+}
+
+/*
+ * PLT trampoline for Hexagon: 12-byte entries using immext + r28=##addr
+ * + jumpr r28 to perform an absolute jump to the symbol address.
+ */
+#define PLT_ITEM_SIZE 12
+
+/* Instruction templates with address = 0 */
+#define PLT_IMMEXT_TEMPLATE 0x00004000
+#define PLT_R28_TEMPLATE 0x7800c01c
+#define PLT_JUMPR_R28 0x529cc000
+
+/* Mask for the lower 6 bits in r28=# (opcode 0x78) */
+#define MASK_R28_IMM 0x00df3fe0
+
+uint32
+get_plt_item_size(void)
+{
+    return PLT_ITEM_SIZE;
+}
+
+uint32
+get_plt_table_size(void)
+{
+    return get_plt_item_size() * (sizeof(target_sym_map) / sizeof(SymbolMap));
+}
+
+void
+init_plt_table(uint8 *plt)
+{
+    uint32 i, num = sizeof(target_sym_map) / sizeof(SymbolMap);
+
+    for (i = 0; i < num; i++) {
+        uint32 addr = (uint32)(uintptr_t)target_sym_map[i].symbol_addr;
+        uint32 *p = (uint32 *)plt;
+
+        /* immext(#addr) — upper 26 bits of address */
+        p[0] = PLT_IMMEXT_TEMPLATE | apply_mask(MASK_X26, addr >> 6);
+        /* r28 = ##addr — lower 6 bits of address */
+        p[1] = PLT_R28_TEMPLATE | apply_mask(MASK_R28_IMM, addr & 0x3F);
+        /* jumpr r28 */
+        p[2] = PLT_JUMPR_R28;
+
+        plt += PLT_ITEM_SIZE;
+    }
+}
+
+static bool
+check_reloc_offset(uint32 target_section_size, uint64 reloc_offset,
+                   uint32 reloc_data_size, char *error_buf,
+                   uint32 error_buf_size)
+{
+    if (!(reloc_offset < (uint64)target_section_size
+          && reloc_offset + reloc_data_size <= (uint64)target_section_size)) {
+        set_error_buf(error_buf, error_buf_size,
+                      "AOT module load failed: invalid relocation offset.");
+        return false;
+    }
+    return true;
+}
+
+bool
+apply_relocation(AOTModule *module, uint8 *target_section_addr,
+                 uint32 target_section_size, uint64 reloc_offset,
+                 int64 reloc_addend, uint32 reloc_type, void *symbol_addr,
+                 int32 symbol_index, char *error_buf, uint32 error_buf_size)
+{
+    switch (reloc_type) {
+        case R_HEX_32:
+        {
+            /* Direct 32-bit relocation: S + A */
+            uint32 val;
+            CHECK_RELOC_OFFSET(sizeof(uint32));
+            val = (uint32)(uintptr_t)symbol_addr + (int32)reloc_addend;
+            *(uint32 *)(target_section_addr + reloc_offset) = val;
+            break;
+        }
+
+        case R_HEX_32_PCREL:
+        {
+            /* 32-bit PC-relative: S + A - P */
+            int32 val;
+            CHECK_RELOC_OFFSET(sizeof(uint32));
+            val = (int32)((intptr_t)symbol_addr + (intptr_t)reloc_addend
+                          - (intptr_t)(target_section_addr + reloc_offset));
+            *(int32 *)(target_section_addr + reloc_offset) = val;
+            break;
+        }
+
+        case R_HEX_B22_PCREL:
+        {
+            /*
+             * 22-bit PC-relative branch: (S + A - P) >> 2
+             * 22-bit signed field, word-aligned: +-8MB byte range.
+             * For external symbols (symbol_index >= 0), use PLT
+             * trampoline if direct branch is out of range.
+             */
+            intptr_t result;
+            CHECK_RELOC_OFFSET(sizeof(uint32));
+
+            if (symbol_index >= 0) {
+                /* External symbol: redirect through PLT */
+                uint8 *plt = (uint8 *)module->code + module->code_size
+                             - get_plt_table_size()
+                             + get_plt_item_size() * symbol_index;
+                result = (intptr_t)((uintptr_t)plt + (intptr_t)reloc_addend
+                                    - (uintptr_t)(target_section_addr
+                                                  + reloc_offset));
+            }
+            else {
+                result =
+                    (intptr_t)((uintptr_t)symbol_addr + (intptr_t)reloc_addend
+                               - (uintptr_t)(target_section_addr
+                                             + reloc_offset));
+            }
+
+            if (result >= (8 * BH_MB) || result < -(8 * BH_MB)) {
+                set_error_buf(error_buf, error_buf_size,
+                              "AOT module load failed: "
+                              "B22_PCREL target out of range.");
+                return false;
+            }
+
+            *(uint32 *)(target_section_addr + reloc_offset) |=
+                apply_mask(MASK_B22, (uint32)(result >> 2));
+            break;
+        }
+
+        case R_HEX_B15_PCREL:
+        {
+            /* 15-bit PC-relative branch: (S + A - P) >> 2, +-64KB range */
+            intptr_t result;
+            CHECK_RELOC_OFFSET(sizeof(uint32));
+            result =
+                (intptr_t)((uintptr_t)symbol_addr + (intptr_t)reloc_addend
+                           - (uintptr_t)(target_section_addr + reloc_offset));
+
+            if (result >= 0x10000 || result < -0x10000) {
+                set_error_buf(error_buf, error_buf_size,
+                              "AOT module load failed: "
+                              "B15_PCREL target out of range.");
+                return false;
+            }
+
+            *(uint32 *)(target_section_addr + reloc_offset) |=
+                apply_mask(MASK_B15, (uint32)(result >> 2));
+            break;
+        }
+
+        case R_HEX_B13_PCREL:
+        {
+            /* 13-bit PC-relative branch: (S + A - P) >> 2, +-16KB range */
+            intptr_t result;
+            CHECK_RELOC_OFFSET(sizeof(uint32));
+            result =
+                (intptr_t)((uintptr_t)symbol_addr + (intptr_t)reloc_addend
+                           - (uintptr_t)(target_section_addr + reloc_offset));
+
+            if (result >= 0x4000 || result < -0x4000) {
+                set_error_buf(error_buf, error_buf_size,
+                              "AOT module load failed: "
+                              "B13_PCREL target out of range.");
+                return false;
+            }
+
+            *(uint32 *)(target_section_addr + reloc_offset) |=
+                apply_mask(MASK_B13, (uint32)(result >> 2));
+            break;
+        }
+
+        case R_HEX_B9_PCREL:
+        {
+            /* 9-bit PC-relative branch: (S + A - P) >> 2, +-1KB range */
+            intptr_t result;
+            CHECK_RELOC_OFFSET(sizeof(uint32));
+            result =
+                (intptr_t)((uintptr_t)symbol_addr + (intptr_t)reloc_addend
+                           - (uintptr_t)(target_section_addr + reloc_offset));
+
+            if (result >= 0x400 || result < -0x400) {
+                set_error_buf(error_buf, error_buf_size,
+                              "AOT module load failed: "
+                              "B9_PCREL target out of range.");
+                return false;
+            }
+
+            *(uint32 *)(target_section_addr + reloc_offset) |=
+                apply_mask(MASK_B9, (uint32)(result >> 2));
+            break;
+        }
+
+        case R_HEX_B7_PCREL:
+        {
+            /* 7-bit PC-relative branch: (S + A - P) >> 2, +-256B range */
+            intptr_t result;
+            CHECK_RELOC_OFFSET(sizeof(uint32));
+            result =
+                (intptr_t)((uintptr_t)symbol_addr + (intptr_t)reloc_addend
+                           - (uintptr_t)(target_section_addr + reloc_offset));
+
+            if (result >= 0x100 || result < -0x100) {
+                set_error_buf(error_buf, error_buf_size,
+                              "AOT module load failed: "
+                              "B7_PCREL target out of range.");
+                return false;
+            }
+
+            *(uint32 *)(target_section_addr + reloc_offset) |=
+                apply_mask(MASK_B7, (uint32)(result >> 2));
+            break;
+        }
+
+        case R_HEX_LO16:
+        {
+            /* Low 16 bits of absolute address: (S + A) & 0xFFFF */
+            uint32 val;
+            CHECK_RELOC_OFFSET(sizeof(uint32));
+            val = (uint32)(uintptr_t)symbol_addr + (int32)reloc_addend;
+            *(uint32 *)(target_section_addr + reloc_offset) |=
+                apply_mask(MASK_LO16, val & 0xFFFF);
+            break;
+        }
+
+        case R_HEX_HI16:
+        {
+            /* High 16 bits of absolute address: (S + A) >> 16 */
+            uint32 val;
+            CHECK_RELOC_OFFSET(sizeof(uint32));
+            val = (uint32)(uintptr_t)symbol_addr + (int32)reloc_addend;
+            *(uint32 *)(target_section_addr + reloc_offset) |=
+                apply_mask(MASK_LO16, val >> 16);
+            break;
+        }
+
+        case R_HEX_B32_PCREL_X:
+        {
+            /*
+             * Extended 32-bit PC-relative for constant extender (immext).
+             * Upper 26 bits: (S + A - P) >> 6
+             */
+            intptr_t result;
+            CHECK_RELOC_OFFSET(sizeof(uint32));
+            result =
+                (intptr_t)((uintptr_t)symbol_addr + (intptr_t)reloc_addend
+                           - (uintptr_t)(target_section_addr + reloc_offset));
+            *(uint32 *)(target_section_addr + reloc_offset) |=
+                apply_mask(MASK_X26, (uint32)(result >> 6));
+            break;
+        }
+
+        case R_HEX_32_6_X:
+        {
+            /*
+             * Extended 32-bit absolute for constant extender (immext).
+             * Upper 26 bits: (S + A) >> 6
+             */
+            uint32 val;
+            CHECK_RELOC_OFFSET(sizeof(uint32));
+            val = (uint32)(uintptr_t)symbol_addr + (int32)reloc_addend;
+            *(uint32 *)(target_section_addr + reloc_offset) |=
+                apply_mask(MASK_X26, val >> 6);
+            break;
+        }
+
+        case R_HEX_B22_PCREL_X:
+        {
+            /*
+             * Extended 22-bit PC-relative: low 6 bits of (S + A - P).
+             * Paired with R_HEX_B32_PCREL_X on the immext.
+             */
+            intptr_t result;
+            CHECK_RELOC_OFFSET(sizeof(uint32));
+            result =
+                (intptr_t)((uintptr_t)symbol_addr + (intptr_t)reloc_addend
+                           - (uintptr_t)(target_section_addr + reloc_offset));
+            *(uint32 *)(target_section_addr + reloc_offset) |=
+                apply_mask(MASK_B22, (uint32)result & 0x3F);
+            break;
+        }
+
+        case R_HEX_B15_PCREL_X:
+        {
+            /*
+             * Extended 15-bit PC-relative: low 6 bits of (S + A - P).
+             * Paired with R_HEX_B32_PCREL_X on the immext.
+             */
+            intptr_t result;
+            CHECK_RELOC_OFFSET(sizeof(uint32));
+            result =
+                (intptr_t)((uintptr_t)symbol_addr + (intptr_t)reloc_addend
+                           - (uintptr_t)(target_section_addr + reloc_offset));
+            *(uint32 *)(target_section_addr + reloc_offset) |=
+                apply_mask(MASK_B15, (uint32)result & 0x3F);
+            break;
+        }
+
+        case R_HEX_B13_PCREL_X:
+        {
+            /*
+             * Extended 13-bit PC-relative: low 6 bits of (S + A - P).
+             * Paired with R_HEX_B32_PCREL_X on the immext.
+             */
+            intptr_t result;
+            CHECK_RELOC_OFFSET(sizeof(uint32));
+            result =
+                (intptr_t)((uintptr_t)symbol_addr + (intptr_t)reloc_addend
+                           - (uintptr_t)(target_section_addr + reloc_offset));
+            *(uint32 *)(target_section_addr + reloc_offset) |=
+                apply_mask(MASK_B13, (uint32)result & 0x3F);
+            break;
+        }
+
+        case R_HEX_B9_PCREL_X:
+        {
+            /*
+             * Extended 9-bit PC-relative: low 6 bits of (S + A - P).
+             * Paired with R_HEX_B32_PCREL_X on the immext.
+             */
+            intptr_t result;
+            CHECK_RELOC_OFFSET(sizeof(uint32));
+            result =
+                (intptr_t)((uintptr_t)symbol_addr + (intptr_t)reloc_addend
+                           - (uintptr_t)(target_section_addr + reloc_offset));
+            *(uint32 *)(target_section_addr + reloc_offset) |=
+                apply_mask(MASK_B9, (uint32)result & 0x3F);
+            break;
+        }
+
+        case R_HEX_B7_PCREL_X:
+        {
+            /*
+             * Extended 7-bit PC-relative: low 6 bits of (S + A - P).
+             * Paired with R_HEX_B32_PCREL_X on the immext.
+             */
+            intptr_t result;
+            CHECK_RELOC_OFFSET(sizeof(uint32));
+            result =
+                (intptr_t)((uintptr_t)symbol_addr + (intptr_t)reloc_addend
+                           - (uintptr_t)(target_section_addr + reloc_offset));
+            *(uint32 *)(target_section_addr + reloc_offset) |=
+                apply_mask(MASK_B7, (uint32)result & 0x3F);
+            break;
+        }
+
+        case R_HEX_6_X:
+        {
+            /* Low 6 bits for constant-extended absolute */
+            uint32 val, insn, mask_r6;
+            CHECK_RELOC_OFFSET(sizeof(uint32));
+            insn = *(uint32 *)(target_section_addr + reloc_offset);
+            mask_r6 = get_mask_r6(insn, error_buf, error_buf_size);
+            if (!mask_r6)
+                return false;
+            val = (uint32)(uintptr_t)symbol_addr + (int32)reloc_addend;
+            *(uint32 *)(target_section_addr + reloc_offset) |=
+                apply_mask(mask_r6, val & 0x3F);
+            break;
+        }
+
+        case R_HEX_6_PCREL_X:
+        {
+            /* Low 6 bits for constant-extended PC-relative */
+            intptr_t result;
+            uint32 insn, mask_r6;
+            CHECK_RELOC_OFFSET(sizeof(uint32));
+            insn = *(uint32 *)(target_section_addr + reloc_offset);
+            mask_r6 = get_mask_r6(insn, error_buf, error_buf_size);
+            if (!mask_r6)
+                return false;
+            result =
+                (intptr_t)((uintptr_t)symbol_addr + (intptr_t)reloc_addend
+                           - (uintptr_t)(target_section_addr + reloc_offset));
+            *(uint32 *)(target_section_addr + reloc_offset) |=
+                apply_mask(mask_r6, (uint32)result & 0x3F);
+            break;
+        }
+
+        case R_HEX_16_X:
+        {
+            uint32 val, insn, mask_r16;
+            CHECK_RELOC_OFFSET(sizeof(uint32));
+            insn = *(uint32 *)(target_section_addr + reloc_offset);
+            mask_r16 = get_mask_r16(insn, error_buf, error_buf_size);
+            if (!mask_r16)
+                return false;
+            val = (uint32)(uintptr_t)symbol_addr + (int32)reloc_addend;
+            *(uint32 *)(target_section_addr + reloc_offset) |=
+                apply_mask(mask_r16, val & 0x3F);
+            break;
+        }
+
+        case R_HEX_12_X:
+        {
+            /* Extended 12-bit absolute: (S + A) with fixed mask */
+            uint32 val;
+            CHECK_RELOC_OFFSET(sizeof(uint32));
+            val = (uint32)(uintptr_t)symbol_addr + (int32)reloc_addend;
+            *(uint32 *)(target_section_addr + reloc_offset) |=
+                apply_mask(0x000007e0, val);
+            break;
+        }
+
+        case R_HEX_11_X:
+        {
+            /* Extended 11-bit absolute: low 6 bits of (S + A) */
+            uint32 val, insn;
+            CHECK_RELOC_OFFSET(sizeof(uint32));
+            insn = *(uint32 *)(target_section_addr + reloc_offset);
+            val = (uint32)(uintptr_t)symbol_addr + (int32)reloc_addend;
+            *(uint32 *)(target_section_addr + reloc_offset) |=
+                apply_mask(get_mask_r11(insn), val & 0x3F);
+            break;
+        }
+
+        case R_HEX_10_X:
+        {
+            /* Extended 10-bit absolute: low 6 bits of (S + A) */
+            uint32 val;
+            CHECK_RELOC_OFFSET(sizeof(uint32));
+            val = (uint32)(uintptr_t)symbol_addr + (int32)reloc_addend;
+            *(uint32 *)(target_section_addr + reloc_offset) |=
+                apply_mask(0x00203fe0, val & 0x3F);
+            break;
+        }
+
+        case R_HEX_9_X:
+        {
+            /* Extended 9-bit absolute: low 6 bits of (S + A) */
+            uint32 val;
+            CHECK_RELOC_OFFSET(sizeof(uint32));
+            val = (uint32)(uintptr_t)symbol_addr + (int32)reloc_addend;
+            *(uint32 *)(target_section_addr + reloc_offset) |=
+                apply_mask(0x00003fe0, val & 0x3F);
+            break;
+        }
+
+        case R_HEX_8_X:
+        {
+            /* Extended 8-bit absolute: (S + A) */
+            uint32 val, insn;
+            CHECK_RELOC_OFFSET(sizeof(uint32));
+            insn = *(uint32 *)(target_section_addr + reloc_offset);
+            val = (uint32)(uintptr_t)symbol_addr + (int32)reloc_addend;
+            *(uint32 *)(target_section_addr + reloc_offset) |=
+                apply_mask(get_mask_r8(insn), val);
+            break;
+        }
+
+        case R_HEX_7_X:
+        {
+            /* Extended 7-bit absolute: low 7 bits of (S + A) */
+            uint32 val;
+            CHECK_RELOC_OFFSET(sizeof(uint32));
+            val = (uint32)(uintptr_t)symbol_addr + (int32)reloc_addend;
+            *(uint32 *)(target_section_addr + reloc_offset) |=
+                apply_mask(0x00001f18, val & 0x7F);
+            break;
+        }
+
+        case R_HEX_16:
+        {
+            /* Direct 16-bit relocation */
+            uint16 val;
+            CHECK_RELOC_OFFSET(sizeof(uint16));
+            val = (uint16)((uintptr_t)symbol_addr + (int32)reloc_addend);
+            *(uint16 *)(target_section_addr + reloc_offset) = val;
+            break;
+        }
+
+        case R_HEX_8:
+        {
+            /* Direct 8-bit relocation: (S + A) truncated to 8 bits */
+            uint8 val;
+            CHECK_RELOC_OFFSET(sizeof(uint8));
+            val = (uint8)((uintptr_t)symbol_addr + (int32)reloc_addend);
+            *(uint8 *)(target_section_addr + reloc_offset) = val;
+            break;
+        }
+
+        case R_HEX_NONE:
+            break;
+
+        default:
+            if (error_buf != NULL)
+                snprintf(error_buf, error_buf_size,
+                         "Load relocation section failed: "
+                         "invalid relocation type %" PRIu32 ".",
+                         reloc_type);
+            return false;
+    }
+
+    return true;
+}
diff --git a/core/iwasm/aot/iwasm_aot.cmake b/core/iwasm/aot/iwasm_aot.cmake
index 1084681aca..b89173a41a 100644
--- a/core/iwasm/aot/iwasm_aot.cmake
+++ b/core/iwasm/aot/iwasm_aot.cmake
@@ -49,6 +49,8 @@ elseif (WAMR_BUILD_TARGET MATCHES "RISCV*")
   set (arch_source ${IWASM_AOT_DIR}/arch/aot_reloc_riscv.c)
 elseif (WAMR_BUILD_TARGET STREQUAL "ARC")
   set (arch_source ${IWASM_AOT_DIR}/arch/aot_reloc_arc.c)
+elseif (WAMR_BUILD_TARGET STREQUAL "HEXAGON")
+  set (arch_source ${IWASM_AOT_DIR}/arch/aot_reloc_hexagon.c)
 else ()
   message (FATAL_ERROR "Build target isn't set")
 endif ()
diff --git a/core/iwasm/common/arch/invokeNative_hexagon.s b/core/iwasm/common/arch/invokeNative_hexagon.s
new file mode 100644
index 0000000000..330ff2dd3d
--- /dev/null
+++ b/core/iwasm/common/arch/invokeNative_hexagon.s
@@ -0,0 +1,137 @@
+/*
+ * Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ */
+
+        .text
+        .align 2
+        .globl invokeNative
+        .type invokeNative, @function
+
+/*
+ * void invokeNative(void (*native_code)(), uint32 argv[], uint32 argc)
+ *
+ * r0 = native_code, r1 = argv, r2 = argc
+ * Loads up to 6 words into r0-r5; remaining args are pushed to stack.
+ */
+
+invokeNative:
+        {
+            allocframe(#16)
+        }
+        {
+            memd(r29+#0) = r17:16
+        }
+        {
+            memd(r29+#8) = r19:18
+        }
+
+        // Save arguments to callee-saved registers
+        {
+            r16 = r0                    // r16 = function_ptr
+            r17 = r1                    // r17 = argv
+        }
+        {
+            r18 = r2                    // r18 = argc
+            p0 = cmp.gt(r2, #0)
+        }
+        {
+            if (!p0) jump .Lcall        // argc == 0: skip arg loading
+        }
+
+        // Load register arguments from argv (up to 6 words in r0-r5)
+        {
+            r0 = memw(r17+#0)          // r0 = argv[0] (exec_env)
+            p0 = cmp.gt(r18, #1)
+        }
+        { if (!p0) jump .Lcall }
+
+        {
+            r1 = memw(r17+#4)          // r1 = argv[1]
+            p0 = cmp.gt(r18, #2)
+        }
+        { if (!p0) jump .Lcall }
+
+        {
+            r2 = memw(r17+#8)          // r2 = argv[2]
+            p0 = cmp.gt(r18, #3)
+        }
+        { if (!p0) jump .Lcall }
+
+        {
+            r3 = memw(r17+#12)         // r3 = argv[3]
+            p0 = cmp.gt(r18, #4)
+        }
+        { if (!p0) jump .Lcall }
+
+        {
+            r4 = memw(r17+#16)         // r4 = argv[4]
+            p0 = cmp.gt(r18, #5)
+        }
+        { if (!p0) jump .Lcall }
+
+        {
+            r5 = memw(r17+#20)         // r5 = argv[5]
+            p0 = cmp.gt(r18, #6)
+        }
+        { if (!p0) jump .Lcall }
+
+        // Stack arguments: argc > 6.
+        // Copy argv[6..argc-1] to the stack, maintaining 8-byte alignment.
+        {
+            r19 = add(r18, #-6)         // r19 = number of stack args
+        }
+        {
+            r7 = asl(r19, #2)           // r7 = stack_args * 4 (bytes)
+        }
+        {
+            r7 = add(r7, #7)            // round up to 8-byte alignment
+        }
+        {
+            r7 = and(r7, #-8)
+        }
+        {
+            r29 = sub(r29, r7)          // allocate aligned stack space
+        }
+        {
+            r6 = add(r17, #24)          // r6 = &argv[6] (source)
+            r8 = r29                    // r8 = stack destination
+        }
+
+.Lcopy_loop:
+        {
+            r9 = memw(r6++#4)           // load next arg from argv
+        }
+        {
+            memw(r8++#4) = r9            // store to stack
+            r19 = add(r19, #-1)         // decrement counter
+        }
+        {
+            p0 = cmp.gt(r19, #0)
+            if (p0.new) jump:t .Lcopy_loop
+        }
+
+.Lcall:
+        {
+            callr r16                   // call native function
+        }
+
+.Lreturn:
+        {
+            r17:16 = memd(r30+#-16)
+        }
+        {
+            r19:18 = memd(r30+#-8)
+        }
+        {
+            deallocframe
+        }
+        {
+            jumpr r31
+        }
+
+        .size invokeNative, .-invokeNative
+
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
diff --git a/core/iwasm/common/iwasm_common.cmake b/core/iwasm/common/iwasm_common.cmake
index c3653f156c..7d51bbfbde 100644
--- a/core/iwasm/common/iwasm_common.cmake
+++ b/core/iwasm/common/iwasm_common.cmake
@@ -155,6 +155,8 @@ elseif (WAMR_BUILD_TARGET MATCHES "RISCV*")
   set (source_all ${c_source_all} ${IWASM_COMMON_DIR}/arch/invokeNative_riscv.S)
 elseif (WAMR_BUILD_TARGET STREQUAL "ARC")
   set (source_all ${c_source_all} ${IWASM_COMMON_DIR}/arch/invokeNative_arc.s)
+elseif (WAMR_BUILD_TARGET STREQUAL "HEXAGON")
+  set (source_all ${c_source_all} ${IWASM_COMMON_DIR}/arch/invokeNative_hexagon.s)
 else ()
   message (FATAL_ERROR "Build target isn't set")
 endif ()
diff --git a/core/iwasm/common/wasm_runtime_common.h b/core/iwasm/common/wasm_runtime_common.h
index e168668d51..17a21da2fe 100644
--- a/core/iwasm/common/wasm_runtime_common.h
+++ b/core/iwasm/common/wasm_runtime_common.h
@@ -395,8 +395,9 @@ LOAD_I16(void *addr)
  *
  * These are guarded by WASM_CPU_SUPPORTS_UNALIGNED_SIMD_ACCESS rather than
  * WASM_CPU_SUPPORTS_UNALIGNED_ADDR_ACCESS because some architectures have
- * different alignment rules for scalar vs vector memory operations,
- * e.g. architectures with dedicated unaligned-load vector instructions.
+ * different alignment rules for scalar vs vector memory operations.
+ * For example, Hexagon scalar loads require natural alignment, but HVX
+ * vector loads support unaligned access (vmemu instruction).
  *
  * PUT_V128_TO_ADDR / GET_V128_FROM_ADDR (frame-local access) remain
  * guarded by the scalar flag above since frame locals are accessed via
@@ -408,10 +409,10 @@ LOAD_I16(void *addr)
 
 #elif WASM_CPU_SUPPORTS_UNALIGNED_SIMD_ACCESS != 0
 
-/* The target's SIMD unit supports unaligned vector access, but scalar loads
- * require natural alignment. Use memcpy which is safe at any alignment and
- * allows the compiler to select the best instruction sequence for the
- * target. */
+/* The target's SIMD unit supports unaligned vector access (e.g. Hexagon HVX
+ * vmemu), but scalar loads require natural alignment. Use memcpy which is
+ * safe at any alignment and allows the compiler to select the best
+ * instruction sequence for the target. */
 static inline V128
 LOAD_V128(void *addr)
 {
diff --git a/core/iwasm/compilation/aot_llvm.c b/core/iwasm/compilation/aot_llvm.c
index f1ee1eecec..3abf588cba 100644
--- a/core/iwasm/compilation/aot_llvm.c
+++ b/core/iwasm/compilation/aot_llvm.c
@@ -191,7 +191,7 @@ aot_target_precheck_can_use_musttail(const AOTCompContext *comp_ctx)
         return false;
     }
     /*
-     * x86-64/i386: true
+     * x86-64/i386/hexagon: true
      *
      * others: assume true for now
      */
@@ -2286,7 +2286,8 @@ static ArchItem valid_archs[] = {
     { "thumbv8.1m.main", true },
     { "riscv32", true },
     { "riscv64", true },
-    { "arc", true }
+    { "arc", true },
+    { "hexagon", false }
 };
 
 static const char *valid_abis[] = {
@@ -3169,6 +3170,37 @@ aot_create_comp_context(const AOTCompData *comp_data, aot_comp_option_t option)
         }
 #endif
 
+        /*
+         * Hexagon: disable GP-relative (small-data) addressing.
+         * AOT code has no GP register set up, so GP-relative relocations
+         * (R_HEX_GPREL16_*) would fail to resolve.  Force all data
+         * through absolute addressing with -small-data.
+         *
+         * Note: features_buf is only used by one code path at a time
+         * (either this block or the aarch64 #ifdef block above), so
+         * there is no conflict in reusing it here.
+         */
+        {
+            bool is_hexagon = false;
+            if (arch && !strcmp(arch, "hexagon"))
+                is_hexagon = true;
+            else if (triple_norm && strstr(triple_norm, "hexagon"))
+                is_hexagon = true;
+
+            if (is_hexagon) {
+                if (features[0] != '\0') {
+                    if (!strstr(features, "-small-data")) {
+                        snprintf(features_buf, sizeof(features_buf),
+                                 "%s,-small-data", features);
+                        features = features_buf;
+                    }
+                }
+                else {
+                    features = "-small-data";
+                }
+            }
+        }
+
         /* Get target with triple, note that LLVMGetTargetFromTriple()
            return 0 when success, but not true. */
         if (LLVMGetTargetFromTriple(triple_norm, &target, &err) != 0) {
@@ -3386,7 +3418,8 @@ aot_create_comp_context(const AOTCompData *comp_data, aot_comp_option_t option)
 
     if (option->enable_simd && strcmp(comp_ctx->target_arch, "x86_64") != 0
         && strncmp(comp_ctx->target_arch, "aarch64", 7) != 0
-        && strcmp(comp_ctx->target_arch, "arc") != 0) {
+        && strcmp(comp_ctx->target_arch, "arc") != 0
+        && strcmp(comp_ctx->target_arch, "hexagon") != 0) {
         /* Disable simd if it isn't supported by target arch */
         option->enable_simd = false;
     }
@@ -3414,12 +3447,13 @@ aot_create_comp_context(const AOTCompData *comp_data, aot_comp_option_t option)
     }
 
     /* Determine whether the target's SIMD/vector unit supports unaligned
-     * memory access.  x86_64 and aarch64 can handle unaligned vector
-     * loads/stores natively.  This informs alignment annotations emitted
-     * for SIMD load/store IR. */
+     * memory access.  x86_64, aarch64, and Hexagon (HVX vmemu) can handle
+     * unaligned vector loads/stores.  This informs alignment annotations
+     * emitted for SIMD load/store IR. */
     comp_ctx->target_supports_unaligned_simd =
         !strcmp(comp_ctx->target_arch, "x86_64")
-        || !strncmp(comp_ctx->target_arch, "aarch64", 7);
+        || !strncmp(comp_ctx->target_arch, "aarch64", 7)
+        || !strcmp(comp_ctx->target_arch, "hexagon");
 
     if (!(target_data_ref =
               LLVMCreateTargetDataLayout(comp_ctx->target_machine))) {
diff --git a/core/iwasm/compilation/simd/simd_load_store.c b/core/iwasm/compilation/simd/simd_load_store.c
index ee787c95a2..2b53b54729 100644
--- a/core/iwasm/compilation/simd/simd_load_store.c
+++ b/core/iwasm/compilation/simd/simd_load_store.c
@@ -37,8 +37,8 @@ simd_load(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx, uint32 align,
 
     /* WASM SIMD does not guarantee alignment for v128 loads.
      * On targets whose SIMD unit handles unaligned access natively
-     * (x86 SSE, aarch64 NEON), align 1 is safe and the backend will
-     * select the right instruction.
+     * (x86 SSE, aarch64 NEON, Hexagon HVX vmemu), align 1 is safe
+     * and the backend will select the right instruction.
      * On other targets, use the WASM alignment hint so the backend
      * can generate wider (aligned) loads instead of byte-by-byte. */
     if (comp_ctx->target_supports_unaligned_simd)
diff --git a/product-mini/platforms/hexagon/CMakeLists.txt b/product-mini/platforms/hexagon/CMakeLists.txt
new file mode 100644
index 0000000000..dc2c46a161
--- /dev/null
+++ b/product-mini/platforms/hexagon/CMakeLists.txt
@@ -0,0 +1,144 @@
+# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+cmake_minimum_required (VERSION 3.14)
+
+include(CheckPIESupported)
+
+project (iwasm)
+
+option(BUILD_SHARED_LIBS "Build using shared libraries" OFF)
+
+set (CMAKE_VERBOSE_MAKEFILE OFF)
+
+# Hexagon uses the Linux kernel
+set (WAMR_BUILD_PLATFORM "linux")
+set (WAMR_BUILD_TARGET "HEXAGON")
+
+# Reset default linker flags
+set (CMAKE_SHARED_LIBRARY_LINK_C_FLAGS "")
+set (CMAKE_SHARED_LIBRARY_LINK_CXX_FLAGS "")
+
+set (CMAKE_C_STANDARD 99)
+set (CMAKE_CXX_STANDARD 17)
+
+if (NOT CMAKE_BUILD_TYPE)
+  set(CMAKE_BUILD_TYPE Release)
+endif ()
+
+if (NOT DEFINED WAMR_BUILD_INTERP)
+  # Enable Interpreter by default
+  set (WAMR_BUILD_INTERP 1)
+endif ()
+
+if (NOT DEFINED WAMR_BUILD_AOT)
+  # Enable AOT by default
+  set (WAMR_BUILD_AOT 1)
+endif ()
+
+if (NOT DEFINED WAMR_BUILD_JIT)
+  # Disable JIT by default
+  set (WAMR_BUILD_JIT 0)
+endif ()
+
+if (NOT DEFINED WAMR_BUILD_FAST_JIT)
+  # Fast JIT not supported on Hexagon
+  set (WAMR_BUILD_FAST_JIT 0)
+endif ()
+
+if (NOT DEFINED WAMR_BUILD_LIBC_BUILTIN)
+  set (WAMR_BUILD_LIBC_BUILTIN 1)
+endif ()
+
+if (NOT DEFINED WAMR_BUILD_LIBC_WASI)
+  set (WAMR_BUILD_LIBC_WASI 1)
+endif ()
+
+if (NOT DEFINED WAMR_BUILD_FAST_INTERP)
+  set (WAMR_BUILD_FAST_INTERP 1)
+endif ()
+
+if (NOT DEFINED WAMR_BUILD_MULTI_MODULE)
+  set (WAMR_BUILD_MULTI_MODULE 0)
+endif ()
+
+if (NOT DEFINED WAMR_BUILD_LIB_PTHREAD)
+  set (WAMR_BUILD_LIB_PTHREAD 0)
+endif ()
+
+if (NOT DEFINED WAMR_BUILD_LIB_WASI_THREADS)
+  set (WAMR_BUILD_LIB_WASI_THREADS 0)
+endif()
+
+if (NOT DEFINED WAMR_BUILD_MINI_LOADER)
+  set (WAMR_BUILD_MINI_LOADER 0)
+endif ()
+
+if (NOT DEFINED WAMR_BUILD_SIMD)
+  set (WAMR_BUILD_SIMD 1)
+endif ()
+
+if (NOT DEFINED WAMR_BUILD_REF_TYPES)
+  set (WAMR_BUILD_REF_TYPES 1)
+endif ()
+
+if (NOT DEFINED WAMR_BUILD_DEBUG_INTERP)
+  set (WAMR_BUILD_DEBUG_INTERP 0)
+endif ()
+
+if (WAMR_BUILD_DEBUG_INTERP EQUAL 1)
+  set (WAMR_BUILD_FAST_INTERP 0)
+  set (WAMR_BUILD_MINI_LOADER 0)
+  set (WAMR_BUILD_SIMD 0)
+endif ()
+
+set (WAMR_ROOT_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../../..)
+
+include (${WAMR_ROOT_DIR}/build-scripts/runtime_lib.cmake)
+
+check_pie_supported()
+
+set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Wl,--gc-sections")
+
+include (${SHARED_DIR}/utils/uncommon/shared_uncommon.cmake)
+
+# Share main.c with Linux platform since Hexagon runs Linux kernel
+add_executable (iwasm
+  ${CMAKE_CURRENT_SOURCE_DIR}/../linux/main.c
+  ${UNCOMMON_SHARED_SOURCE}
+)
+
+set_version_info (iwasm)
+
+target_link_libraries(iwasm vmlib)
+
+install (TARGETS iwasm DESTINATION bin)
+
+add_library (vmlib ${WAMR_RUNTIME_LIB_SOURCE})
+
+set_version_info (vmlib)
+
+target_include_directories(vmlib INTERFACE
+  $<INSTALL_INTERFACE:include>
+)
+
+set (WAMR_PUBLIC_HEADERS
+  ${WAMR_ROOT_DIR}/core/iwasm/include/wasm_c_api.h
+  ${WAMR_ROOT_DIR}/core/iwasm/include/wasm_export.h
+  ${WAMR_ROOT_DIR}/core/iwasm/include/lib_export.h
+)
+
+set_target_properties (vmlib PROPERTIES
+  OUTPUT_NAME iwasm
+  PUBLIC_HEADER "${WAMR_PUBLIC_HEADERS}"
+)
+
+target_link_libraries (vmlib ${LLVM_AVAILABLE_LIBS} ${UV_A_LIBS} -lm -lpthread)
+
+install (TARGETS vmlib
+  EXPORT iwasmTargets
+  DESTINATION lib
+  PUBLIC_HEADER DESTINATION include
+)
+
+install_iwasm_package ()
diff --git a/tests/wamr-test-suites/spec-test-script/all.py b/tests/wamr-test-suites/spec-test-script/all.py
index 970127d0b2..0cf9dc4931 100644
--- a/tests/wamr-test-suites/spec-test-script/all.py
+++ b/tests/wamr-test-suites/spec-test-script/all.py
@@ -58,6 +58,7 @@ def get_iwasm_cmd(platform: str) -> str:
     "AARCH64_VFP",
     "ARMV7",
     "ARMV7_VFP",
+    "HEXAGON",
     "RISCV32",
     "RISCV32_ILP32F",
     "RISCV32_ILP32D",
@@ -95,6 +96,19 @@ def ignore_the_case(
     if "i386" == target and case_name in ["float_exprs", "conversions"]:
         return True
 
+    # TODO: investigate Hexagon-specific failures:
+    # - float_exprs/conversions: Hexagon does not canonicalize NaN payloads
+    #   (sNaN propagation differs from spec expectations)
+    # - i32/i64: Hexagon asl/asr instructions use signed shift amounts,
+    #   causing clang to miscompile rotl/rotr when upper bits are set
+    # - simd_*: NaN propagation in pmin/pmax and lane/splat edge cases
+    if "hexagon" == target and case_name in [
+        "float_exprs", "conversions", "f32_bitwise", "i32", "i64",
+        "simd_f32x4_pmin_pmax", "simd_f64x2_pmin_pmax",
+        "simd_lane", "simd_splat",
+    ]:
+        return True
+
     # esp32s3 qemu doesn't have PSRAM emulation
     if qemu_flag and target == 'xtensa' and case_name in ["memory_size"]:
         return True
@@ -598,9 +612,20 @@ def main():
     )
     parser.add_argument('--no-pty', action='store_true',
         help="Use direct pipes instead of pseudo-tty")
+    parser.add_argument(
+        "--interpreter",
+        default="",
+        dest="interpreter",
+        help="Specify the iwasm interpreter path (overrides the default)",
+    )
 
     options = parser.parse_args()
 
+    # Override global IWASM_CMD if --interpreter is specified
+    global IWASM_CMD
+    if options.interpreter:
+        IWASM_CMD = options.interpreter
+
     # Convert target to lower case for internal use, e.g. X86_64 -> x86_64
     # target is always exist, so no need to check it
     options.target = options.target.lower()

From 9c87a7e60682dc1dee083bb4406bf0b92968217e Mon Sep 17 00:00:00 2001
From: Brian Cain <brian.cain@oss.qualcomm.com>
Date: Sat, 2 May 2026 19:44:41 -0700
Subject: [PATCH 3/7] ci(hexagon): add spec test workflow under qemu-hexagon
 user-mode

---
 .github/workflows/compilation_on_hexagon.yml  | 204 ++++++++++++++++++
 ...ux-musl.cmake => hexagon_linux_musl.cmake} |   2 +-
 core/iwasm/common/wasm_runtime_common.c       |   2 +-
 .../spec-test-script/runtest.py               |   1 +
 tests/wamr-test-suites/test_wamr.sh           |   2 +-
 5 files changed, 208 insertions(+), 3 deletions(-)
 create mode 100644 .github/workflows/compilation_on_hexagon.yml
 rename build-scripts/toolchains/{hexagon-linux-musl.cmake => hexagon_linux_musl.cmake} (90%)

diff --git a/.github/workflows/compilation_on_hexagon.yml b/.github/workflows/compilation_on_hexagon.yml
new file mode 100644
index 0000000000..79fb27edb8
--- /dev/null
+++ b/.github/workflows/compilation_on_hexagon.yml
@@ -0,0 +1,204 @@
+# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+name: compilation on hexagon (qemu user-mode)
+
+on:
+  pull_request:
+    types:
+      - opened
+      - synchronize
+    paths:
+      - ".github/workflows/build_llvm_libraries.yml"
+      - ".github/workflows/compilation_on_hexagon.yml"
+      - "build-scripts/**"
+      - "core/**"
+      - "!core/deps/**"
+      - "product-mini/**"
+      - "tests/wamr-test-suites/**"
+      - "wamr-compiler/**"
+  push:
+    branches:
+      - main
+      - "dev/**"
+    paths:
+      - ".github/workflows/build_llvm_libraries.yml"
+      - ".github/workflows/compilation_on_hexagon.yml"
+      - "build-scripts/**"
+      - "core/**"
+      - "!core/deps/**"
+      - "product-mini/**"
+      - "tests/wamr-test-suites/**"
+      - "wamr-compiler/**"
+  workflow_dispatch:
+
+# Cancel any in-flight jobs for the same PR/branch so there's only one active
+# at a time
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+permissions:
+  contents: read
+
+jobs:
+  build_llvm_libraries:
+    permissions:
+      contents: read
+      actions: write
+    uses: ./.github/workflows/build_llvm_libraries.yml
+    with:
+      os: "ubuntu-22.04"
+      arch: "X86 Hexagon"
+
+  build_wamrc:
+    needs: [build_llvm_libraries]
+    runs-on: ubuntu-22.04
+    steps:
+      - name: checkout
+        uses: actions/checkout@v6.0.2
+
+      - name: Get LLVM libraries
+        id: retrieve_llvm_libs
+        uses: actions/cache@v5
+        with:
+          path: |
+            ./core/deps/llvm/build/bin
+            ./core/deps/llvm/build/include
+            ./core/deps/llvm/build/lib
+            ./core/deps/llvm/build/libexec
+            ./core/deps/llvm/build/share
+          key: ${{ needs.build_llvm_libraries.outputs.cache_key }}
+
+      - name: Quit if cache miss
+        if: steps.retrieve_llvm_libs.outputs.cache-hit != 'true'
+        run: echo "::error::can not get prebuilt llvm libraries" && exit 1
+
+      - name: Build wamrc
+        run: |
+          mkdir build && cd build
+          cmake .. -DCMAKE_BUILD_TYPE=Release
+          cmake --build . --config Release --parallel $(nproc)
+        working-directory: wamr-compiler
+
+      - name: Upload wamrc
+        uses: actions/upload-artifact@v7.0.1
+        with:
+          name: wamrc-hexagon
+          path: wamr-compiler/build/wamrc
+
+  build_iwasm:
+    runs-on: ubuntu-22.04
+    steps:
+      - name: checkout
+        uses: actions/checkout@v6.0.2
+
+      - name: Install QEMU user-mode
+        run: sudo apt-get update && sudo apt-get install -y qemu-user
+
+      - name: Install clang-22 and lld-22
+        run: |
+          wget -qO- https://apt.llvm.org/llvm-snapshot.gpg.key | sudo tee /etc/apt/trusted.gpg.d/apt.llvm.org.asc
+          echo "deb http://apt.llvm.org/jammy/ llvm-toolchain-jammy-22 main" | sudo tee /etc/apt/sources.list.d/llvm-22.list
+          sudo apt-get update
+          sudo apt-get install -y clang-22 lld-22
+
+      - name: Install CodeLinaro Hexagon toolchain
+        run: |
+          wget -q https://artifacts.codelinaro.org/artifactory/codelinaro-toolchain-for-hexagon/22.1.4_/hexagon-debs-22.1.4_.tar.gz
+          mkdir hexagon-debs
+          tar xf hexagon-debs-22.1.4_.tar.gz -C hexagon-debs
+          sudo dpkg -i hexagon-debs/*.deb
+
+      - name: Cross-compile iwasm for Hexagon
+        run: |
+          cmake -S product-mini/platforms/hexagon \
+                -B build-hexagon \
+                -DCMAKE_TOOLCHAIN_FILE=${GITHUB_WORKSPACE}/build-scripts/toolchains/hexagon_linux_musl.cmake \
+                -DCMAKE_BUILD_TYPE=Release \
+                -DWAMR_DISABLE_HW_BOUND_CHECK=1 \
+                -DWAMR_BUILD_SHRUNK_MEMORY=0 \
+                -DWAMR_BUILD_SPEC_TEST=1 \
+                -DWAMR_BUILD_LIBC_WASI=0
+          cmake --build build-hexagon --parallel $(nproc)
+
+      - name: Verify iwasm binary
+        run: |
+          file build-hexagon/iwasm
+          qemu-hexagon -cpu v67 build-hexagon/iwasm --version
+
+      - name: Upload iwasm
+        uses: actions/upload-artifact@v7.0.1
+        with:
+          name: iwasm-hexagon
+          path: build-hexagon/iwasm
+
+  spec_test:
+    needs: [build_iwasm, build_wamrc]
+    runs-on: ubuntu-22.04
+    strategy:
+      fail-fast: false
+      matrix:
+        test_option:
+          - "-s spec -b -P"
+          - "-s spec -b -S -P"
+        running_mode:
+          - "classic-interp"
+          - "fast-interp"
+          - "aot"
+        exclude:
+          # SIMD AOT needs further validation
+          - test_option: "-s spec -b -S -P"
+            running_mode: "aot"
+    steps:
+      - name: checkout
+        uses: actions/checkout@v6.0.2
+
+      - name: Install QEMU user-mode
+        run: sudo apt-get update && sudo apt-get install -y qemu-user
+
+      - name: Download iwasm artifact
+        uses: actions/download-artifact@v4.2.0
+        with:
+          name: iwasm-hexagon
+          path: ./
+
+      - name: Download wamrc artifact
+        if: matrix.running_mode == 'aot'
+        uses: actions/download-artifact@v4.2.0
+        with:
+          name: wamrc-hexagon
+          path: ./
+
+      - name: Set up iwasm wrapper
+        run: |
+          chmod +x ./iwasm
+          # Place the real Hexagon binary at a known path
+          mv ./iwasm ./iwasm-hexagon-real
+          # Create a wrapper that invokes iwasm via qemu-hexagon user-mode.
+          # Use the linux platform path since Hexagon runs Linux and this
+          # avoids needing to teach every host-tool lookup about "hexagon".
+          mkdir -p product-mini/platforms/linux/build
+          printf '#!/bin/sh\nexec qemu-hexagon -cpu v67 %s/iwasm-hexagon-real "$@"\n' \
+            "${GITHUB_WORKSPACE}" > product-mini/platforms/linux/build/iwasm
+          chmod +x product-mini/platforms/linux/build/iwasm
+          # Verify it works
+          product-mini/platforms/linux/build/iwasm --version
+
+      - name: Set up wamrc
+        if: matrix.running_mode == 'aot'
+        run: |
+          chmod +x ./wamrc
+          mkdir -p wamr-compiler/build
+          cp ./wamrc wamr-compiler/build/wamrc
+
+      - name: Run spec tests
+        timeout-minutes: 60
+        run: |
+          ./test_wamr.sh ${{ matrix.test_option }} \
+            -m HEXAGON \
+            -t ${{ matrix.running_mode }} \
+            -j linux \
+            -Q \
+            ${{ matrix.running_mode == 'aot' && format('-A {0}/wamr-compiler/build/wamrc', github.workspace) || '' }}
+        working-directory: ./tests/wamr-test-suites
diff --git a/build-scripts/toolchains/hexagon-linux-musl.cmake b/build-scripts/toolchains/hexagon_linux_musl.cmake
similarity index 90%
rename from build-scripts/toolchains/hexagon-linux-musl.cmake
rename to build-scripts/toolchains/hexagon_linux_musl.cmake
index 7465f80350..06b5b2a977 100644
--- a/build-scripts/toolchains/hexagon-linux-musl.cmake
+++ b/build-scripts/toolchains/hexagon_linux_musl.cmake
@@ -1,4 +1,4 @@
-# Copyright (C) 2024 Qualcomm Innovation Center, Inc.  All rights reserved.
+# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 # CMake toolchain file for cross-compiling to Hexagon Linux (musl)
diff --git a/core/iwasm/common/wasm_runtime_common.c b/core/iwasm/common/wasm_runtime_common.c
index 1a16af4196..3f6fad1db2 100644
--- a/core/iwasm/common/wasm_runtime_common.c
+++ b/core/iwasm/common/wasm_runtime_common.c
@@ -5715,7 +5715,7 @@ wasm_runtime_invoke_native(WASMExecEnv *exec_env, void *func_ptr,
 
 #if defined(BUILD_TARGET_X86_32) || defined(BUILD_TARGET_ARM)    \
     || defined(BUILD_TARGET_THUMB) || defined(BUILD_TARGET_MIPS) \
-    || defined(BUILD_TARGET_XTENSA)
+    || defined(BUILD_TARGET_XTENSA) || defined(BUILD_TARGET_HEXAGON)
 typedef void (*GenericFunctionPointer)(void);
 void
 invokeNative(GenericFunctionPointer f, uint32 *args, uint32 sz);
diff --git a/tests/wamr-test-suites/spec-test-script/runtest.py b/tests/wamr-test-suites/spec-test-script/runtest.py
index fa9f5eb7da..78526095b7 100755
--- a/tests/wamr-test-suites/spec-test-script/runtest.py
+++ b/tests/wamr-test-suites/spec-test-script/runtest.py
@@ -62,6 +62,7 @@
     "riscv64_lp64f": ["--target=riscv64", "--target-abi=lp64f", "--cpu=generic-rv64", "--cpu-features=+m,+a,+c,+f", "--size-level=1"],
     "riscv64_lp64d": ["--target=riscv64", "--target-abi=lp64d", "--cpu=generic-rv64", "--cpu-features=+m,+a,+c,+f,+d", "--size-level=1"],
     "xtensa": ["--target=xtensa"],
+    "hexagon": ["--target=hexagon"],
 }
 
 # AOT compilation options mapping for XIP mode
diff --git a/tests/wamr-test-suites/test_wamr.sh b/tests/wamr-test-suites/test_wamr.sh
index 97dc84d548..48ad006b47 100755
--- a/tests/wamr-test-suites/test_wamr.sh
+++ b/tests/wamr-test-suites/test_wamr.sh
@@ -632,7 +632,7 @@ function spec_test()
         fi
     fi
 
-    if [[ ${ENABLE_QEMU} == 1 ]]; then
+    if [[ ${ENABLE_QEMU} == 1 ]] && [[ -n ${QEMU_FIRMWARE} ]]; then
         ARGS_FOR_SPEC_TEST+="--qemu "
         ARGS_FOR_SPEC_TEST+="--qemu-firmware ${QEMU_FIRMWARE} "
     fi

From 0b0dd17531a88d4a83154d93feff459eb1794f35 Mon Sep 17 00:00:00 2001
From: Brian Cain <brian.cain@oss.qualcomm.com>
Date: Sun, 3 May 2026 19:38:44 -0700
Subject: [PATCH 4/7] fixup! feat(hexagon): add Hexagon ISA target support

Add E_MACHINE_HEXAGON (164) to the AOT loader's machine type switch
so that Hexagon AOT modules are recognized during loading.
---
 core/iwasm/aot/aot_loader.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/core/iwasm/aot/aot_loader.c b/core/iwasm/aot/aot_loader.c
index 121708d669..d7efed490b 100644
--- a/core/iwasm/aot/aot_loader.c
+++ b/core/iwasm/aot/aot_loader.c
@@ -275,6 +275,7 @@ GET_U16_FROM_ADDR(const uint8 *p)
 #define E_MACHINE_ARC_COMPACT 93    /* ARC International ARCompact */
 #define E_MACHINE_ARC_COMPACT2 195  /* Synopsys ARCompact V2 */
 #define E_MACHINE_XTENSA 94         /* Tensilica Xtensa Architecture */
+#define E_MACHINE_HEXAGON 164       /* Qualcomm Hexagon */
 #define E_MACHINE_RISCV 243         /* RISC-V 32/64 */
 #define E_MACHINE_WIN_I386 0x14c    /* Windows i386 architecture */
 #define E_MACHINE_WIN_X86_64 0x8664 /* Windows x86-64 architecture */
@@ -432,6 +433,9 @@ get_aot_file_target(AOTTargetInfo *target_info, char *target_buf,
         case E_MACHINE_ARC_COMPACT2:
             machine_type = "arc";
             break;
+        case E_MACHINE_HEXAGON:
+            machine_type = "hexagon";
+            break;
         default:
             set_error_buf_v(error_buf, error_buf_size,
                             "unknown machine type %d", target_info->e_machine);

From 7ea497fab7f1b72e852cb7c1dc5cf684a7a3808c Mon Sep 17 00:00:00 2001
From: Brian Cain <brian.cain@oss.qualcomm.com>
Date: Sun, 3 May 2026 19:38:54 -0700
Subject: [PATCH 5/7] fixup! ci(hexagon): add spec test workflow under
 qemu-hexagon user-mode

Fix three issues in the spec test harness for qemu-hexagon user-mode:

- runtest.py: handle None return from invoke() on timeout instead of
  crashing with TypeError, and fix undefined variable r.buf -> runner.buf
- all.py: only pass --qemu/--qemu-firmware to runtest.py when firmware
  is set (user-mode QEMU has no firmware); always increase timeouts to
  120s when qemu_flag is active
- test_wamr.sh: always pass --qemu to all.py when -Q is set, not only
  when -F (firmware) is also provided
---
 tests/wamr-test-suites/spec-test-script/all.py     | 12 +++++++++---
 tests/wamr-test-suites/spec-test-script/runtest.py |  4 +++-
 tests/wamr-test-suites/test_wamr.sh                |  6 ++++--
 3 files changed, 16 insertions(+), 6 deletions(-)

diff --git a/tests/wamr-test-suites/spec-test-script/all.py b/tests/wamr-test-suites/spec-test-script/all.py
index 0cf9dc4931..31c97d0850 100644
--- a/tests/wamr-test-suites/spec-test-script/all.py
+++ b/tests/wamr-test-suites/spec-test-script/all.py
@@ -234,9 +234,15 @@ def test_case(
         CMD.append("--eh")
 
     if qemu_flag:
-        CMD.append("--qemu")
-        CMD.append("--qemu-firmware")
-        CMD.append(qemu_firmware)
+        if qemu_firmware:
+            CMD.append("--qemu")
+            CMD.append("--qemu-firmware")
+            CMD.append(qemu_firmware)
+        # Increase timeouts for QEMU emulation (default: 30s start, 20s test)
+        CMD.append("--start-timeout")
+        CMD.append("120")
+        CMD.append("--test-timeout")
+        CMD.append("120")
 
     if not clean_up_flag:
         CMD.append("--no_cleanup")
diff --git a/tests/wamr-test-suites/spec-test-script/runtest.py b/tests/wamr-test-suites/spec-test-script/runtest.py
index 78526095b7..5f56c15e17 100755
--- a/tests/wamr-test-suites/spec-test-script/runtest.py
+++ b/tests/wamr-test-suites/spec-test-script/runtest.py
@@ -268,7 +268,7 @@ def assert_prompt(runner, prompts, timeout, is_need_execute_result):
             log("Started with:\n%s" % header)
     else:
         log("Did not one of following prompt(s): %s" % repr(prompts))
-        log("    Got      : %s" % repr(r.buf))
+        log("    Got      : %s" % repr(runner.buf))
         raise Exception("Did not one of following prompt(s)")
 
 
@@ -781,6 +781,8 @@ def is_result_match_expected(out, expected):
 def test_assert(r, opts, mode, cmd, expected):
     log("Testing(%s) %s = %s" % (mode, cmd, expected))
     out = invoke(r, opts, cmd)
+    if out is None:
+        raise Exception("Timed out waiting for response to: %s" % cmd)
     if '\n' in out or ' ' in out:
         outs = [''] + out.split('\n')[1:]
         out = outs[-1]
diff --git a/tests/wamr-test-suites/test_wamr.sh b/tests/wamr-test-suites/test_wamr.sh
index 48ad006b47..eae0c5934d 100755
--- a/tests/wamr-test-suites/test_wamr.sh
+++ b/tests/wamr-test-suites/test_wamr.sh
@@ -632,9 +632,11 @@ function spec_test()
         fi
     fi
 
-    if [[ ${ENABLE_QEMU} == 1 ]] && [[ -n ${QEMU_FIRMWARE} ]]; then
+    if [[ ${ENABLE_QEMU} == 1 ]]; then
         ARGS_FOR_SPEC_TEST+="--qemu "
-        ARGS_FOR_SPEC_TEST+="--qemu-firmware ${QEMU_FIRMWARE} "
+        if [[ -n ${QEMU_FIRMWARE} ]]; then
+            ARGS_FOR_SPEC_TEST+="--qemu-firmware ${QEMU_FIRMWARE} "
+        fi
     fi
 
     if [[ ${PLATFORM} == "windows" ]]; then

From fe7bf096ffa00bae9a9d77170f86298e5803ab1d Mon Sep 17 00:00:00 2001
From: Brian Cain <brian.cain@oss.qualcomm.com>
Date: Wed, 6 May 2026 22:05:26 -0700
Subject: [PATCH 6/7] fixup! ci(hexagon): add spec test workflow under
 qemu-hexagon user-mode

Fix IWASM_QEMU_CMD to use IWASM_CMD path instead of bare "iwasm".

When qemu_flag=True without firmware (user-mode QEMU), all.py passed
"iwasm" as the --interpreter argument to runtest.py.  Since "iwasm" is
not on PATH (the wrapper lives at product-mini/platforms/linux/build/),
every spec test failed with FileNotFoundError.

Use IWASM_CMD (the standard relative path to the build directory) so
runtest.py can find the QEMU wrapper script.
---
 tests/wamr-test-suites/spec-test-script/all.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/tests/wamr-test-suites/spec-test-script/all.py b/tests/wamr-test-suites/spec-test-script/all.py
index 31c97d0850..cd60c51eb9 100644
--- a/tests/wamr-test-suites/spec-test-script/all.py
+++ b/tests/wamr-test-suites/spec-test-script/all.py
@@ -45,7 +45,7 @@ def get_iwasm_cmd(platform: str) -> str:
 PLATFORM_NAME = platform.uname().system.lower()
 IWASM_CMD = get_iwasm_cmd(PLATFORM_NAME)
 IWASM_SGX_CMD = "../../../product-mini/platforms/linux-sgx/enclave-sample/iwasm"
-IWASM_QEMU_CMD = "iwasm"
+IWASM_QEMU_CMD = IWASM_CMD
 SPEC_TEST_DIR = "spec/test/core"
 WAST2WASM_CMD = exe_file_path("./wabt/out/gcc/Release/wat2wasm")
 SPEC_INTERPRETER_CMD = "spec/interpreter/wasm"
@@ -628,9 +628,10 @@ def main():
     options = parser.parse_args()
 
     # Override global IWASM_CMD if --interpreter is specified
-    global IWASM_CMD
+    global IWASM_CMD, IWASM_QEMU_CMD
     if options.interpreter:
         IWASM_CMD = options.interpreter
+        IWASM_QEMU_CMD = options.interpreter
 
     # Convert target to lower case for internal use, e.g. X86_64 -> x86_64
     # target is always exist, so no need to check it

From 0ccab0761c40957b0f029e03ad53df856c6fc489 Mon Sep 17 00:00:00 2001
From: Brian Cain <brian.cain@oss.qualcomm.com>
Date: Wed, 6 May 2026 22:06:22 -0700
Subject: [PATCH 7/7] fixup! Introduce WASM_CPU_SUPPORTS_UNALIGNED_SIMD_ACCESS
 for V128 load/store

Make simd_store alignment consistent with simd_load.

The simd_load function was updated to use conditional alignment based on
target_supports_unaligned_simd, but simd_store was missed.  Apply the
same logic: targets with native unaligned SIMD support (x86_64, aarch64,
hexagon) use align 1, others use the WASM alignment hint.
---
 core/iwasm/compilation/simd/simd_load_store.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/core/iwasm/compilation/simd/simd_load_store.c b/core/iwasm/compilation/simd/simd_load_store.c
index 2b53b54729..638ba5e2e8 100644
--- a/core/iwasm/compilation/simd/simd_load_store.c
+++ b/core/iwasm/compilation/simd/simd_load_store.c
@@ -310,7 +310,12 @@ simd_store(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx, uint32 align,
         return false;
     }
 
-    LLVMSetAlignment(result, 1);
+    /* Mirror the alignment logic in simd_load: targets with native
+     * unaligned SIMD support use align 1, others use the WASM hint. */
+    if (comp_ctx->target_supports_unaligned_simd)
+        LLVMSetAlignment(result, 1);
+    else
+        LLVMSetAlignment(result, 1 << align);
 
     return true;
 }