diff --git a/be/benchmark/benchmark_fmod.hpp b/be/benchmark/benchmark_fmod.hpp
new file mode 100644
index 00000000000000..ca9718fa94b4db
--- /dev/null
+++ b/be/benchmark/benchmark_fmod.hpp
@@ -0,0 +1,442 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <benchmark/benchmark.h>
+
+#include <algorithm>
+#include <cmath>
+#include <cstdint>
+#include <cstring>
+#include <random>
+#include <vector>
+
+#include "exprs/function/fmod_fast.h"
+
+namespace doris {
+namespace {
+
+// Data shapes produced by fill_actual_load_sample(). The numeric value is what the benchmarks
+// receive as their first argument (see benchmark_args), which is why each one is spelled out.
+enum FmodBenchCase {
+    DB_DB = 0,                 // lhs = db, rhs = db (identical operands)
+    IN_ONE_DB = 1,             // lhs = in_one, rhs = db
+    DB_IN_ONE = 2,             // lhs = db, rhs = in_one
+    DB_IN_TEN = 3,             // lhs = db, rhs = in_ten
+    MIXED_SIGNS_AND_ZEROS = 4, // random signed lhs; random small signed rhs, zero every 97th row
+};
+
+// Fills `lhs` and `rhs` with `size` deterministic double operands for the given FmodBenchCase.
+//
+// Index i is mapped onto a virtual row in [0, 50'000'000), which is split into ten blocks of
+// 5'000'000 rows; each block has its own scale factor. From the 1-based row number inside the
+// block three magnitudes are derived: db (row * block scale), in_one (row * 2e-7) and in_ten
+// (row * 2e-6). MIXED_SIGNS_AND_ZEROS ignores them and draws from a fixed-seed generator
+// instead, forcing rhs to zero on every 97th row. Unknown case ids behave like DB_IN_ONE.
+void fill_actual_load_sample(int case_id, size_t size, std::vector<double>* lhs,
+                             std::vector<double>* rhs) {
+    constexpr uint64_t kRowsPerLoad = 5'000'000;
+    constexpr uint64_t kTotalRows = 50'000'000;
+    constexpr double kDbScales[] = {1234.4500, 1876.2222, 8945.7353, 5612.6245, 4646.7853,
+                                    6523.5285, 1000.1575, 6555.5678, 2587.8535, 3754.2575};
+
+    std::vector<double>& dividends = *lhs;
+    std::vector<double>& divisors = *rhs;
+    dividends.resize(size);
+    divisors.resize(size);
+
+    // Fixed seed keeps the random case reproducible between runs.
+    std::mt19937_64 engine(0xadc83b19ULL);
+    std::uniform_real_distribution<double> large_dist(-4.5e10, 4.5e10);
+    std::uniform_real_distribution<double> small_dist(-10.0, 10.0);
+
+    for (size_t idx = 0; idx < size; ++idx) {
+        if (case_id == MIXED_SIGNS_AND_ZEROS) {
+            // The dividend is drawn first; the divisor draw is skipped on zero rows.
+            dividends[idx] = large_dist(engine);
+            divisors[idx] = (idx % 97 == 0) ? 0.0 : small_dist(engine);
+            continue;
+        }
+
+        const uint64_t virtual_row = (static_cast<uint64_t>(idx) * kTotalRows) / size;
+        const uint64_t block = std::min<uint64_t>(virtual_row / kRowsPerLoad, 9);
+        const double row_num = static_cast<double>((virtual_row % kRowsPerLoad) + 1);
+        const double db = row_num * kDbScales[block];
+        const double in_one = row_num * 2e-7;
+        const double in_ten = row_num * 2e-6;
+
+        // DB_IN_ONE and any unrecognised case id use (db, in_one).
+        double dividend = db;
+        double divisor = in_one;
+        if (case_id == DB_DB) {
+            divisor = db;
+        } else if (case_id == IN_ONE_DB) {
+            dividend = in_one;
+            divisor = db;
+        } else if (case_id == DB_IN_TEN) {
+            divisor = in_ten;
+        }
+        dividends[idx] = dividend;
+        divisors[idx] = divisor;
+    }
+}
+
+// Float flavour of fill_actual_load_sample(): generates the double sample for `case_id` and
+// narrows every element to float, so both precisions are derived from the same values.
+void fill_actual_load_sample_float(int case_id, size_t size, std::vector<float>* lhs,
+                                   std::vector<float>* rhs) {
+    std::vector<double> wide_lhs;
+    std::vector<double> wide_rhs;
+    fill_actual_load_sample(case_id, size, &wide_lhs, &wide_rhs);
+
+    const auto narrow = [](double value) { return static_cast<float>(value); };
+
+    lhs->resize(size);
+    rhs->resize(size);
+    std::transform(wide_lhs.begin(), wide_lhs.end(), lhs->begin(), narrow);
+    std::transform(wide_rhs.begin(), wide_rhs.end(), rhs->begin(), narrow);
+}
+
+// Reference kernel (double): element-wise std::fmod over two arrays.
+// null_map[i] is set to 1 where rhs[i] compares equal to zero; on those rows the divisor is
+// bumped by 1 so std::fmod is still evaluated with a non-zero divisor.
+void std_vector_vector(const double* lhs, const double* rhs, double* result, uint8_t* null_map,
+                       size_t size) {
+    for (size_t row = 0; row != size; ++row) {
+        const uint8_t zero_divisor = rhs[row] == 0.0;
+        null_map[row] = zero_divisor;
+        const double divisor = rhs[row] + static_cast<double>(zero_divisor);
+        result[row] = std::fmod(lhs[row], divisor);
+    }
+}
+
+// Reference kernel (float): element-wise modulo over two arrays, evaluated through the double
+// overload of std::fmod and narrowed back to float.
+// null_map[i] is set to 1 where rhs[i] compares equal to zero; on those rows the divisor is
+// bumped by 1 before the call.
+void std_vector_vector(const float* lhs, const float* rhs, float* result, uint8_t* null_map,
+                       size_t size) {
+    for (size_t row = 0; row != size; ++row) {
+        const uint8_t zero_divisor = rhs[row] == 0.0F;
+        null_map[row] = zero_divisor;
+        const float divisor = rhs[row] + static_cast<float>(zero_divisor);
+        const double wide = std::fmod(static_cast<double>(lhs[row]), static_cast<double>(divisor));
+        result[row] = static_cast<float>(wide);
+    }
+}
+
+// Reference kernel (double): array modulo a single divisor via std::fmod.
+// Every null_map entry is set to (rhs == 0). With a zero divisor `result` is left untouched.
+void std_vector_constant(const double* lhs, double rhs, double* result, uint8_t* null_map,
+                         size_t size) {
+    const uint8_t zero_divisor = rhs == 0.0;
+    memset(null_map, zero_divisor, size);
+    if (!zero_divisor) {
+        for (size_t row = 0; row != size; ++row) {
+            result[row] = std::fmod(lhs[row], rhs);
+        }
+    }
+}
+
+// Reference kernel (float): array modulo a single divisor, evaluated through the double
+// overload of std::fmod and narrowed back to float.
+// Every null_map entry is set to (rhs == 0). With a zero divisor `result` is left untouched.
+void std_vector_constant(const float* lhs, float rhs, float* result, uint8_t* null_map,
+                         size_t size) {
+    const uint8_t zero_divisor = rhs == 0.0F;
+    memset(null_map, zero_divisor, size);
+    if (!zero_divisor) {
+        for (size_t row = 0; row != size; ++row) {
+            const double wide = std::fmod(static_cast<double>(lhs[row]), static_cast<double>(rhs));
+            result[row] = static_cast<float>(wide);
+        }
+    }
+}
+
+// Reference kernel (double): a single dividend modulo each element of `rhs` via std::fmod.
+// null_map[i] is set to 1 where rhs[i] compares equal to zero; on those rows the divisor is
+// bumped by 1 so std::fmod is still evaluated with a non-zero divisor.
+void std_constant_vector(double lhs, const double* rhs, double* result, uint8_t* null_map,
+                         size_t size) {
+    for (size_t row = 0; row != size; ++row) {
+        const uint8_t zero_divisor = rhs[row] == 0.0;
+        null_map[row] = zero_divisor;
+        const double divisor = rhs[row] + static_cast<double>(zero_divisor);
+        result[row] = std::fmod(lhs, divisor);
+    }
+}
+
+// Reference kernel (float): a single dividend modulo each element of `rhs`, evaluated through
+// the double overload of std::fmod and narrowed back to float.
+// null_map[i] is set to 1 where rhs[i] compares equal to zero; on those rows the divisor is
+// bumped by 1 before the call.
+void std_constant_vector(float lhs, const float* rhs, float* result, uint8_t* null_map,
+                         size_t size) {
+    for (size_t row = 0; row != size; ++row) {
+        const uint8_t zero_divisor = rhs[row] == 0.0F;
+        null_map[row] = zero_divisor;
+        const float divisor = rhs[row] + static_cast<float>(zero_divisor);
+        const double wide = std::fmod(static_cast<double>(lhs), static_cast<double>(divisor));
+        result[row] = static_cast<float>(wide);
+    }
+}
+
+// Registers one (case, rows) argument pair per FmodBenchCase, each with 1 << 20 rows, and
+// applies the shared reporting settings: millisecond unit, wall-clock time, five repetitions,
+// aggregates only.
+void benchmark_args(benchmark::internal::Benchmark* b) {
+    constexpr int64_t kRows = 1 << 20;
+    for (const int bench_case : {DB_DB, IN_ONE_DB, DB_IN_ONE, DB_IN_TEN, MIXED_SIGNS_AND_ZEROS}) {
+        b->Args({bench_case, kRows});
+    }
+    b->Unit(benchmark::kMillisecond);
+    b->UseRealTime();
+    b->Repetitions(5);
+    b->DisplayAggregatesOnly();
+}
+
+// Registers a single row-count argument (1 << 20) and applies the shared reporting settings:
+// millisecond unit, wall-clock time, five repetitions, aggregates only.
+void benchmark_const_args(benchmark::internal::Benchmark* b) {
+    constexpr int64_t kRows = 1 << 20;
+    b->Arg(kRows);
+    b->Unit(benchmark::kMillisecond);
+    b->UseRealTime();
+    b->Repetitions(5);
+    b->DisplayAggregatesOnly();
+}
+
+// Baseline: double column % double column through the std::fmod reference kernel.
+// state.range(0) selects the FmodBenchCase data shape, state.range(1) the number of rows.
+static void BM_FmodDoubleVectorVectorStd(benchmark::State& state) {
+    std::vector<double> lhs;
+    std::vector<double> rhs;
+    fill_actual_load_sample(static_cast<int>(state.range(0)), state.range(1), &lhs, &rhs);
+    std::vector<double> result(lhs.size());
+    std::vector<uint8_t> null_map(lhs.size());
+
+    // Expose the output buffers to the optimizer as escaped pointers before timing starts.
+    // ClobberMemory() only pins stores to memory the compiler considers reachable, so without
+    // this the kernel's writes to these local buffers could be discarded as dead.
+    double* result_data = result.data();
+    uint8_t* null_map_data = null_map.data();
+    benchmark::DoNotOptimize(result_data);
+    benchmark::DoNotOptimize(null_map_data);
+
+    for (auto _ : state) {
+        std_vector_vector(lhs.data(), rhs.data(), result_data, null_map_data, lhs.size());
+        benchmark::ClobberMemory();
+    }
+    state.SetItemsProcessed(state.iterations() * static_cast<int64_t>(lhs.size()));
+}
+
+// Candidate: double column % double column through fmod_fast::vector_vector.
+// state.range(0) selects the FmodBenchCase data shape, state.range(1) the number of rows.
+static void BM_FmodDoubleVectorVectorFast(benchmark::State& state) {
+    std::vector<double> lhs;
+    std::vector<double> rhs;
+    fill_actual_load_sample(static_cast<int>(state.range(0)), state.range(1), &lhs, &rhs);
+    std::vector<double> result(lhs.size());
+    std::vector<uint8_t> null_map(lhs.size());
+
+    // Expose the output buffers to the optimizer as escaped pointers before timing starts.
+    // ClobberMemory() only pins stores to memory the compiler considers reachable, so without
+    // this the kernel's writes to these local buffers could be discarded as dead.
+    double* result_data = result.data();
+    uint8_t* null_map_data = null_map.data();
+    benchmark::DoNotOptimize(result_data);
+    benchmark::DoNotOptimize(null_map_data);
+
+    for (auto _ : state) {
+        fmod_fast::vector_vector(lhs.data(), rhs.data(), result_data, null_map_data, lhs.size());
+        benchmark::ClobberMemory();
+    }
+    state.SetItemsProcessed(state.iterations() * static_cast<int64_t>(lhs.size()));
+}
+
+// Baseline: double column % constant 0.9999998 through the std::fmod reference kernel.
+// state.range(0) is the number of rows; the dividends come from the DB_IN_ONE sample (the
+// generated rhs vector is not used).
+static void BM_FmodDoubleVectorConstantStd(benchmark::State& state) {
+    std::vector<double> lhs;
+    std::vector<double> rhs;
+    fill_actual_load_sample(DB_IN_ONE, state.range(0), &lhs, &rhs);
+    std::vector<double> result(lhs.size());
+    std::vector<uint8_t> null_map(lhs.size());
+
+    // Expose the output buffers to the optimizer as escaped pointers before timing starts.
+    // ClobberMemory() only pins stores to memory the compiler considers reachable, so without
+    // this the kernel's writes to these local buffers could be discarded as dead.
+    double* result_data = result.data();
+    uint8_t* null_map_data = null_map.data();
+    benchmark::DoNotOptimize(result_data);
+    benchmark::DoNotOptimize(null_map_data);
+
+    for (auto _ : state) {
+        std_vector_constant(lhs.data(), 0.9999998, result_data, null_map_data, lhs.size());
+        benchmark::ClobberMemory();
+    }
+    state.SetItemsProcessed(state.iterations() * static_cast<int64_t>(lhs.size()));
+}
+
+// Candidate: double column % constant 0.9999998 through fmod_fast::vector_constant.
+// state.range(0) is the number of rows; the dividends come from the DB_IN_ONE sample (the
+// generated rhs vector is not used).
+static void BM_FmodDoubleVectorConstantFast(benchmark::State& state) {
+    std::vector<double> lhs;
+    std::vector<double> rhs;
+    fill_actual_load_sample(DB_IN_ONE, state.range(0), &lhs, &rhs);
+    std::vector<double> result(lhs.size());
+    std::vector<uint8_t> null_map(lhs.size());
+
+    // Expose the output buffers to the optimizer as escaped pointers before timing starts.
+    // ClobberMemory() only pins stores to memory the compiler considers reachable, so without
+    // this the kernel's writes to these local buffers could be discarded as dead.
+    double* result_data = result.data();
+    uint8_t* null_map_data = null_map.data();
+    benchmark::DoNotOptimize(result_data);
+    benchmark::DoNotOptimize(null_map_data);
+
+    for (auto _ : state) {
+        fmod_fast::vector_constant(lhs.data(), 0.9999998, result_data, null_map_data, lhs.size());
+        benchmark::ClobberMemory();
+    }
+    state.SetItemsProcessed(state.iterations() * static_cast<int64_t>(lhs.size()));
+}
+
+// Baseline: double column % constant zero through the std::fmod reference kernel.
+// With a zero divisor the kernel only fills null_map and returns, so this measures the
+// all-null path; `result` keeps its -777.0 fill. state.range(0) is the number of rows.
+static void BM_FmodDoubleConstZeroStd(benchmark::State& state) {
+    std::vector<double> lhs;
+    std::vector<double> rhs;
+    fill_actual_load_sample(DB_IN_ONE, state.range(0), &lhs, &rhs);
+    std::vector<double> result(lhs.size(), -777.0);
+    std::vector<uint8_t> null_map(lhs.size());
+
+    // Expose the output buffers to the optimizer as escaped pointers before timing starts.
+    // ClobberMemory() only pins stores to memory the compiler considers reachable, so without
+    // this the kernel's writes to these local buffers could be discarded as dead.
+    double* result_data = result.data();
+    uint8_t* null_map_data = null_map.data();
+    benchmark::DoNotOptimize(result_data);
+    benchmark::DoNotOptimize(null_map_data);
+
+    for (auto _ : state) {
+        std_vector_constant(lhs.data(), 0.0, result_data, null_map_data, lhs.size());
+        benchmark::ClobberMemory();
+    }
+    state.SetItemsProcessed(state.iterations() * static_cast<int64_t>(lhs.size()));
+}
+
+// Candidate: double column % constant zero through fmod_fast::vector_constant.
+// With a zero divisor the kernel only fills null_map and returns, so this measures the
+// all-null path; `result` keeps its -777.0 fill. state.range(0) is the number of rows.
+static void BM_FmodDoubleConstZeroFast(benchmark::State& state) {
+    std::vector<double> lhs;
+    std::vector<double> rhs;
+    fill_actual_load_sample(DB_IN_ONE, state.range(0), &lhs, &rhs);
+    std::vector<double> result(lhs.size(), -777.0);
+    std::vector<uint8_t> null_map(lhs.size());
+
+    // Expose the output buffers to the optimizer as escaped pointers before timing starts.
+    // ClobberMemory() only pins stores to memory the compiler considers reachable, so without
+    // this the kernel's writes to these local buffers could be discarded as dead.
+    double* result_data = result.data();
+    uint8_t* null_map_data = null_map.data();
+    benchmark::DoNotOptimize(result_data);
+    benchmark::DoNotOptimize(null_map_data);
+
+    for (auto _ : state) {
+        fmod_fast::vector_constant(lhs.data(), 0.0, result_data, null_map_data, lhs.size());
+        benchmark::ClobberMemory();
+    }
+    state.SetItemsProcessed(state.iterations() * static_cast<int64_t>(lhs.size()));
+}
+
+// Baseline: constant 12345.678 % double column through the std::fmod reference kernel.
+// state.range(0) is the number of rows; the divisors come from the IN_ONE_DB sample (the
+// generated lhs vector only provides the buffer sizes).
+static void BM_FmodDoubleConstantVectorStd(benchmark::State& state) {
+    std::vector<double> lhs;
+    std::vector<double> rhs;
+    fill_actual_load_sample(IN_ONE_DB, state.range(0), &lhs, &rhs);
+    std::vector<double> result(lhs.size());
+    std::vector<uint8_t> null_map(lhs.size());
+
+    // Expose the output buffers to the optimizer as escaped pointers before timing starts.
+    // ClobberMemory() only pins stores to memory the compiler considers reachable, so without
+    // this the kernel's writes to these local buffers could be discarded as dead.
+    double* result_data = result.data();
+    uint8_t* null_map_data = null_map.data();
+    benchmark::DoNotOptimize(result_data);
+    benchmark::DoNotOptimize(null_map_data);
+
+    for (auto _ : state) {
+        std_constant_vector(12345.678, rhs.data(), result_data, null_map_data, rhs.size());
+        benchmark::ClobberMemory();
+    }
+    state.SetItemsProcessed(state.iterations() * static_cast<int64_t>(rhs.size()));
+}
+
+// Candidate: constant 12345.678 % double column through fmod_fast::constant_vector.
+// state.range(0) is the number of rows; the divisors come from the IN_ONE_DB sample (the
+// generated lhs vector only provides the buffer sizes).
+static void BM_FmodDoubleConstantVectorFast(benchmark::State& state) {
+    std::vector<double> lhs;
+    std::vector<double> rhs;
+    fill_actual_load_sample(IN_ONE_DB, state.range(0), &lhs, &rhs);
+    std::vector<double> result(lhs.size());
+    std::vector<uint8_t> null_map(lhs.size());
+
+    // Expose the output buffers to the optimizer as escaped pointers before timing starts.
+    // ClobberMemory() only pins stores to memory the compiler considers reachable, so without
+    // this the kernel's writes to these local buffers could be discarded as dead.
+    double* result_data = result.data();
+    uint8_t* null_map_data = null_map.data();
+    benchmark::DoNotOptimize(result_data);
+    benchmark::DoNotOptimize(null_map_data);
+
+    for (auto _ : state) {
+        fmod_fast::constant_vector(12345.678, rhs.data(), result_data, null_map_data, rhs.size());
+        benchmark::ClobberMemory();
+    }
+    state.SetItemsProcessed(state.iterations() * static_cast<int64_t>(rhs.size()));
+}
+
+// Baseline: float column % float column through the std::fmod reference kernel.
+// state.range(0) selects the FmodBenchCase data shape, state.range(1) the number of rows.
+static void BM_FmodFloatVectorVectorStd(benchmark::State& state) {
+    std::vector<float> lhs;
+    std::vector<float> rhs;
+    fill_actual_load_sample_float(static_cast<int>(state.range(0)), state.range(1), &lhs, &rhs);
+    std::vector<float> result(lhs.size());
+    std::vector<uint8_t> null_map(lhs.size());
+
+    // Expose the output buffers to the optimizer as escaped pointers before timing starts.
+    // ClobberMemory() only pins stores to memory the compiler considers reachable, so without
+    // this the kernel's writes to these local buffers could be discarded as dead.
+    float* result_data = result.data();
+    uint8_t* null_map_data = null_map.data();
+    benchmark::DoNotOptimize(result_data);
+    benchmark::DoNotOptimize(null_map_data);
+
+    for (auto _ : state) {
+        std_vector_vector(lhs.data(), rhs.data(), result_data, null_map_data, lhs.size());
+        benchmark::ClobberMemory();
+    }
+    state.SetItemsProcessed(state.iterations() * static_cast<int64_t>(lhs.size()));
+}
+
+// Candidate: float column % float column through fmod_fast::vector_vector.
+// state.range(0) selects the FmodBenchCase data shape, state.range(1) the number of rows.
+static void BM_FmodFloatVectorVectorFast(benchmark::State& state) {
+    std::vector<float> lhs;
+    std::vector<float> rhs;
+    fill_actual_load_sample_float(static_cast<int>(state.range(0)), state.range(1), &lhs, &rhs);
+    std::vector<float> result(lhs.size());
+    std::vector<uint8_t> null_map(lhs.size());
+
+    // Expose the output buffers to the optimizer as escaped pointers before timing starts.
+    // ClobberMemory() only pins stores to memory the compiler considers reachable, so without
+    // this the kernel's writes to these local buffers could be discarded as dead.
+    float* result_data = result.data();
+    uint8_t* null_map_data = null_map.data();
+    benchmark::DoNotOptimize(result_data);
+    benchmark::DoNotOptimize(null_map_data);
+
+    for (auto _ : state) {
+        fmod_fast::vector_vector(lhs.data(), rhs.data(), result_data, null_map_data, lhs.size());
+        benchmark::ClobberMemory();
+    }
+    state.SetItemsProcessed(state.iterations() * static_cast<int64_t>(lhs.size()));
+}
+
+// Baseline: float column % constant 0.9999998F through the std::fmod reference kernel.
+// state.range(0) is the number of rows; the dividends come from the DB_IN_ONE sample (the
+// generated rhs vector is not used).
+static void BM_FmodFloatVectorConstantStd(benchmark::State& state) {
+    std::vector<float> lhs;
+    std::vector<float> rhs;
+    fill_actual_load_sample_float(DB_IN_ONE, state.range(0), &lhs, &rhs);
+    std::vector<float> result(lhs.size());
+    std::vector<uint8_t> null_map(lhs.size());
+
+    // Expose the output buffers to the optimizer as escaped pointers before timing starts.
+    // ClobberMemory() only pins stores to memory the compiler considers reachable, so without
+    // this the kernel's writes to these local buffers could be discarded as dead.
+    float* result_data = result.data();
+    uint8_t* null_map_data = null_map.data();
+    benchmark::DoNotOptimize(result_data);
+    benchmark::DoNotOptimize(null_map_data);
+
+    for (auto _ : state) {
+        std_vector_constant(lhs.data(), 0.9999998F, result_data, null_map_data, lhs.size());
+        benchmark::ClobberMemory();
+    }
+    state.SetItemsProcessed(state.iterations() * static_cast<int64_t>(lhs.size()));
+}
+
+// Candidate: float column % constant 0.9999998F through fmod_fast::vector_constant.
+// state.range(0) is the number of rows; the dividends come from the DB_IN_ONE sample (the
+// generated rhs vector is not used).
+static void BM_FmodFloatVectorConstantFast(benchmark::State& state) {
+    std::vector<float> lhs;
+    std::vector<float> rhs;
+    fill_actual_load_sample_float(DB_IN_ONE, state.range(0), &lhs, &rhs);
+    std::vector<float> result(lhs.size());
+    std::vector<uint8_t> null_map(lhs.size());
+
+    // Expose the output buffers to the optimizer as escaped pointers before timing starts.
+    // ClobberMemory() only pins stores to memory the compiler considers reachable, so without
+    // this the kernel's writes to these local buffers could be discarded as dead.
+    float* result_data = result.data();
+    uint8_t* null_map_data = null_map.data();
+    benchmark::DoNotOptimize(result_data);
+    benchmark::DoNotOptimize(null_map_data);
+
+    for (auto _ : state) {
+        fmod_fast::vector_constant(lhs.data(), 0.9999998F, result_data, null_map_data, lhs.size());
+        benchmark::ClobberMemory();
+    }
+    state.SetItemsProcessed(state.iterations() * static_cast<int64_t>(lhs.size()));
+}
+
+// Baseline: float column % constant zero through the std::fmod reference kernel.
+// With a zero divisor the kernel only fills null_map and returns, so this measures the
+// all-null path; `result` keeps its -777.0F fill. state.range(0) is the number of rows.
+static void BM_FmodFloatConstZeroStd(benchmark::State& state) {
+    std::vector<float> lhs;
+    std::vector<float> rhs;
+    fill_actual_load_sample_float(DB_IN_ONE, state.range(0), &lhs, &rhs);
+    std::vector<float> result(lhs.size(), -777.0F);
+    std::vector<uint8_t> null_map(lhs.size());
+
+    // Expose the output buffers to the optimizer as escaped pointers before timing starts.
+    // ClobberMemory() only pins stores to memory the compiler considers reachable, so without
+    // this the kernel's writes to these local buffers could be discarded as dead.
+    float* result_data = result.data();
+    uint8_t* null_map_data = null_map.data();
+    benchmark::DoNotOptimize(result_data);
+    benchmark::DoNotOptimize(null_map_data);
+
+    for (auto _ : state) {
+        std_vector_constant(lhs.data(), 0.0F, result_data, null_map_data, lhs.size());
+        benchmark::ClobberMemory();
+    }
+    state.SetItemsProcessed(state.iterations() * static_cast<int64_t>(lhs.size()));
+}
+
+// Candidate: float column % constant zero through fmod_fast::vector_constant.
+// With a zero divisor the kernel only fills null_map and returns, so this measures the
+// all-null path; `result` keeps its -777.0F fill. state.range(0) is the number of rows.
+static void BM_FmodFloatConstZeroFast(benchmark::State& state) {
+    std::vector<float> lhs;
+    std::vector<float> rhs;
+    fill_actual_load_sample_float(DB_IN_ONE, state.range(0), &lhs, &rhs);
+    std::vector<float> result(lhs.size(), -777.0F);
+    std::vector<uint8_t> null_map(lhs.size());
+
+    // Expose the output buffers to the optimizer as escaped pointers before timing starts.
+    // ClobberMemory() only pins stores to memory the compiler considers reachable, so without
+    // this the kernel's writes to these local buffers could be discarded as dead.
+    float* result_data = result.data();
+    uint8_t* null_map_data = null_map.data();
+    benchmark::DoNotOptimize(result_data);
+    benchmark::DoNotOptimize(null_map_data);
+
+    for (auto _ : state) {
+        fmod_fast::vector_constant(lhs.data(), 0.0F, result_data, null_map_data, lhs.size());
+        benchmark::ClobberMemory();
+    }
+    state.SetItemsProcessed(state.iterations() * static_cast<int64_t>(lhs.size()));
+}
+
+// Baseline: constant 12345.678F % float column through the std::fmod reference kernel.
+// state.range(0) is the number of rows; the divisors come from the IN_ONE_DB sample (the
+// generated lhs vector only provides the buffer sizes).
+static void BM_FmodFloatConstantVectorStd(benchmark::State& state) {
+    std::vector<float> lhs;
+    std::vector<float> rhs;
+    fill_actual_load_sample_float(IN_ONE_DB, state.range(0), &lhs, &rhs);
+    std::vector<float> result(lhs.size());
+    std::vector<uint8_t> null_map(lhs.size());
+
+    // Expose the output buffers to the optimizer as escaped pointers before timing starts.
+    // ClobberMemory() only pins stores to memory the compiler considers reachable, so without
+    // this the kernel's writes to these local buffers could be discarded as dead.
+    float* result_data = result.data();
+    uint8_t* null_map_data = null_map.data();
+    benchmark::DoNotOptimize(result_data);
+    benchmark::DoNotOptimize(null_map_data);
+
+    for (auto _ : state) {
+        std_constant_vector(12345.678F, rhs.data(), result_data, null_map_data, rhs.size());
+        benchmark::ClobberMemory();
+    }
+    state.SetItemsProcessed(state.iterations() * static_cast<int64_t>(rhs.size()));
+}
+
+// Candidate: constant 12345.678F % float column through fmod_fast::constant_vector.
+// state.range(0) is the number of rows; the divisors come from the IN_ONE_DB sample (the
+// generated lhs vector only provides the buffer sizes).
+static void BM_FmodFloatConstantVectorFast(benchmark::State& state) {
+    std::vector<float> lhs;
+    std::vector<float> rhs;
+    fill_actual_load_sample_float(IN_ONE_DB, state.range(0), &lhs, &rhs);
+    std::vector<float> result(lhs.size());
+    std::vector<uint8_t> null_map(lhs.size());
+
+    // Expose the output buffers to the optimizer as escaped pointers before timing starts.
+    // ClobberMemory() only pins stores to memory the compiler considers reachable, so without
+    // this the kernel's writes to these local buffers could be discarded as dead.
+    float* result_data = result.data();
+    uint8_t* null_map_data = null_map.data();
+    benchmark::DoNotOptimize(result_data);
+    benchmark::DoNotOptimize(null_map_data);
+
+    for (auto _ : state) {
+        fmod_fast::constant_vector(12345.678F, rhs.data(), result_data, null_map_data, rhs.size());
+        benchmark::ClobberMemory();
+    }
+    state.SetItemsProcessed(state.iterations() * static_cast<int64_t>(rhs.size()));
+}
+
+} // namespace
+
+// Registration. Each Std benchmark is listed directly before its Fast counterpart.
+// Vector/vector benchmarks take (case, rows) pairs from benchmark_args; the constant-operand
+// and zero-divisor benchmarks take only a row count from benchmark_const_args.
+BENCHMARK(BM_FmodDoubleVectorVectorStd)->Apply(benchmark_args);
+BENCHMARK(BM_FmodDoubleVectorVectorFast)->Apply(benchmark_args);
+BENCHMARK(BM_FmodDoubleVectorConstantStd)->Apply(benchmark_const_args);
+BENCHMARK(BM_FmodDoubleVectorConstantFast)->Apply(benchmark_const_args);
+BENCHMARK(BM_FmodDoubleConstZeroStd)->Apply(benchmark_const_args);
+BENCHMARK(BM_FmodDoubleConstZeroFast)->Apply(benchmark_const_args);
+BENCHMARK(BM_FmodDoubleConstantVectorStd)->Apply(benchmark_const_args);
+BENCHMARK(BM_FmodDoubleConstantVectorFast)->Apply(benchmark_const_args);
+BENCHMARK(BM_FmodFloatVectorVectorStd)->Apply(benchmark_args);
+BENCHMARK(BM_FmodFloatVectorVectorFast)->Apply(benchmark_args);
+BENCHMARK(BM_FmodFloatVectorConstantStd)->Apply(benchmark_const_args);
+BENCHMARK(BM_FmodFloatVectorConstantFast)->Apply(benchmark_const_args);
+BENCHMARK(BM_FmodFloatConstZeroStd)->Apply(benchmark_const_args);
+BENCHMARK(BM_FmodFloatConstZeroFast)->Apply(benchmark_const_args);
+BENCHMARK(BM_FmodFloatConstantVectorStd)->Apply(benchmark_const_args);
+BENCHMARK(BM_FmodFloatConstantVectorFast)->Apply(benchmark_const_args);
+
+} // namespace doris
diff --git a/be/benchmark/benchmark_main.cpp b/be/benchmark/benchmark_main.cpp
index fa268fa2d0e759..d5bb21ba8e41b7 100644
--- a/be/benchmark/benchmark_main.cpp
+++ b/be/benchmark/benchmark_main.cpp
@@ -24,6 +24,7 @@
 #include "benchmark_column_array_view_distance.hpp"
 #include "benchmark_column_view.hpp"
 #include "benchmark_fastunion.hpp"
+#include "benchmark_fmod.hpp"
 #include "benchmark_hll_merge.hpp"
 #include "benchmark_hybrid_set.hpp"
 #include "benchmark_string.hpp"
diff --git a/be/src/exprs/function/fmod_fast.cpp b/be/src/exprs/function/fmod_fast.cpp
new file mode 100644
index 00000000000000..0f3673a90b20e3
--- /dev/null
+++ b/be/src/exprs/function/fmod_fast.cpp
@@ -0,0 +1,160 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "exprs/function/fmod_fast.h"
+
+#include <string.h>
+
+#include <cmath>
+
+#include "common/compiler_util.h"
+
+namespace doris::fmod_fast {
+namespace {
+
+// The x87 fast path is compiled only for x86-64 builds with GCC or Clang (GNU inline asm).
+#if defined(__x86_64__) && (defined(__GNUC__) || defined(__clang__))
+#define DORIS_HAS_X87_FMOD_FAST 1
+
+// Computes the remainder of a / b with the x87 `fprem` instruction.
+//
+// Both operands are loaded from memory onto the x87 stack (b first, so a ends up in st(0) and
+// b in st(1)). `fprem` may perform only a partial reduction; it reports that through the C2
+// status flag, which is bit 2 (mask 4) of %ah after `fnstsw %ax`, so the loop repeats until
+// that bit is clear. `fstp %st(1)` then drops b and `fstpl` stores the remainder, leaving the
+// x87 stack at its original depth.
+//
+// The only caller in this file (fmod_double) invokes this for finite operands with a non-zero
+// divisor and |a| > |b|.
+ALWAYS_INLINE inline double fmod_x87_fprem(double a, double b) {
+    double r;
+    asm volatile(
+            "fldl %[b]\n\t"
+            "fldl %[a]\n\t"
+            "1:\n\t"
+            "fprem\n\t"
+            "fnstsw %%ax\n\t"
+            "testb $4, %%ah\n\t"
+            "jne 1b\n\t"
+            "fstp %%st(1)\n\t"
+            "fstpl %[r]\n\t"
+            : [r] "=m"(r)
+            : [a] "m"(a), [b] "m"(b)
+            // The two loads need two free x87 stack slots, so both st and st(1) are declared
+            // clobbered; %ax and the flags are overwritten by fnstsw/testb.
+            : "ax", "cc", "st", "st(1)");
+    return r;
+}
+#else
+#define DORIS_HAS_X87_FMOD_FAST 0
+#endif
+
+// fmod(a, b) for doubles.
+//
+// When the x87 path is compiled in and both operands are finite with b != 0, the two cheap
+// cases are answered without any division: |a| < |b| yields a unchanged and |a| == |b| yields
+// a zero carrying a's sign. Only |a| > |b| reaches the fprem loop. Everything else (NaN,
+// infinities, zero divisor, or a build without the x87 path) is delegated to std::fmod.
+ALWAYS_INLINE inline double fmod_double(double a, double b) {
+#if DORIS_HAS_X87_FMOD_FAST
+    const bool fast_path_ok = std::isfinite(a) && std::isfinite(b) && b != 0.0;
+    if (fast_path_ok) {
+        const double mag_a = std::fabs(a);
+        const double mag_b = std::fabs(b);
+        if (mag_a > mag_b) {
+            return fmod_x87_fprem(a, b);
+        }
+        return mag_a == mag_b ? std::copysign(0.0, a) : a;
+    }
+#endif
+    return std::fmod(a, b);
+}
+
+// fmod(a, b) for floats: widens both operands to double, reuses fmod_double, and narrows the
+// remainder back to float.
+ALWAYS_INLINE inline float fmod_float(float a, float b) {
+    return static_cast<float>(fmod_double(static_cast<double>(a), static_cast<double>(b)));
+}
+
+// Overload set that lets the templated kernels below pick the implementation by element type.
+// double -> fmod_double.
+ALWAYS_INLINE inline double fmod_value(double a, double b) {
+    return fmod_double(a, b);
+}
+
+// float -> fmod_float.
+ALWAYS_INLINE inline float fmod_value(float a, float b) {
+    return fmod_float(a, b);
+}
+
+// Element-wise lhs[i] % rhs[i] for `size` rows.
+// null_map[i] is set to 1 where rhs[i] compares equal to zero; on those rows the divisor is
+// bumped by 1 so fmod_value is still called with a non-zero divisor.
+template <typename T>
+ALWAYS_INLINE inline void vector_vector_impl(const T* lhs, const T* rhs, T* result,
+                                             uint8_t* null_map, size_t size) {
+    for (size_t row = 0; row != size; ++row) {
+        const uint8_t zero_divisor = rhs[row] == T(0);
+        null_map[row] = zero_divisor;
+        const T divisor = rhs[row] + static_cast<T>(zero_divisor);
+        result[row] = fmod_value(lhs[row], divisor);
+    }
+}
+
+// lhs[i] % rhs for `size` rows with one shared divisor.
+// Every null_map entry is set to (rhs == 0). With a zero divisor `result` is left untouched.
+template <typename T>
+ALWAYS_INLINE inline void vector_constant_impl(const T* lhs, T rhs, T* result, uint8_t* null_map,
+                                               size_t size) {
+    const uint8_t zero_divisor = rhs == T(0);
+    memset(null_map, zero_divisor, size);
+    if (!zero_divisor) {
+        for (size_t row = 0; row != size; ++row) {
+            result[row] = fmod_value(lhs[row], rhs);
+        }
+    }
+}
+
+// lhs % rhs[i] for `size` rows with one shared dividend.
+// null_map[i] is set to 1 where rhs[i] compares equal to zero; on those rows the divisor is
+// bumped by 1 so fmod_value is still called with a non-zero divisor.
+template <typename T>
+ALWAYS_INLINE inline void constant_vector_impl(T lhs, const T* rhs, T* result, uint8_t* null_map,
+                                               size_t size) {
+    for (size_t row = 0; row != size; ++row) {
+        const uint8_t zero_divisor = rhs[row] == T(0);
+        null_map[row] = zero_divisor;
+        const T divisor = rhs[row] + static_cast<T>(zero_divisor);
+        result[row] = fmod_value(lhs, divisor);
+    }
+}
+
+} // namespace
+
+// Returns true when this translation unit was built with the x87 `fprem` path, i.e. when
+// DORIS_HAS_X87_FMOD_FAST is 1.
+bool is_x87_fast_path_enabled() {
+    return DORIS_HAS_X87_FMOD_FAST;
+}
+
+// Single-value fmod for doubles; exported entry point for fmod_double.
+double scalar(double a, double b) {
+    return fmod_double(a, b);
+}
+
+// Single-value fmod for floats; exported entry point for fmod_float.
+float scalar(float a, float b) {
+    return fmod_float(a, b);
+}
+
+// Column % column for doubles; instantiates vector_vector_impl<double>.
+void vector_vector(const double* lhs, const double* rhs, double* result, uint8_t* null_map,
+                   size_t size) {
+    vector_vector_impl(lhs, rhs, result, null_map, size);
+}
+
+// Column % column for floats; instantiates vector_vector_impl<float>.
+void vector_vector(const float* lhs, const float* rhs, float* result, uint8_t* null_map,
+                   size_t size) {
+    vector_vector_impl(lhs, rhs, result, null_map, size);
+}
+
+// Column % constant for doubles; instantiates vector_constant_impl<double>.
+void vector_constant(const double* lhs, double rhs, double* result, uint8_t* null_map,
+                     size_t size) {
+    vector_constant_impl(lhs, rhs, result, null_map, size);
+}
+
+// Column % constant for floats; instantiates vector_constant_impl<float>.
+void vector_constant(const float* lhs, float rhs, float* result, uint8_t* null_map, size_t size) {
+    vector_constant_impl(lhs, rhs, result, null_map, size);
+}
+
+// Constant % column for doubles; instantiates constant_vector_impl<double>.
+void constant_vector(double lhs, const double* rhs, double* result, uint8_t* null_map,
+                     size_t size) {
+    constant_vector_impl(lhs, rhs, result, null_map, size);
+}
+
+// Constant % column for floats; instantiates constant_vector_impl<float>.
+void constant_vector(float lhs, const float* rhs, float* result, uint8_t* null_map, size_t size) {
+    constant_vector_impl(lhs, rhs, result, null_map, size);
+}
+
+} // namespace doris::fmod_fast
diff --git a/be/src/exprs/function/fmod_fast.h b/be/src/exprs/function/fmod_fast.h
new file mode 100644
index 00000000000000..6bcc7481165443
--- /dev/null
+++ b/be/src/exprs/function/fmod_fast.h
@@ -0,0 +1,41 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <stddef.h>
+#include <stdint.h>
+
+namespace doris::fmod_fast {
+
+// Returns true when the library was built with the x87 `fprem` fast path (x86-64 with GCC or
+// Clang); when false every call is served by std::fmod.
+bool is_x87_fast_path_enabled();
+
+// Floating-point remainder of a / b for a single pair of values. The float overload computes
+// in double and narrows the result. A zero divisor is not special-cased here; it is passed on
+// to std::fmod.
+double scalar(double a, double b);
+float scalar(float a, float b);
+
+// Column % column. For each i in [0, size): null_map[i] = (rhs[i] == 0) and
+// result[i] = fmod(lhs[i], rhs[i]), where a zero divisor is replaced by 1 before the call.
+// All four arrays must hold at least `size` elements.
+void vector_vector(const double* lhs, const double* rhs, double* result, uint8_t* null_map,
+                   size_t size);
+void vector_vector(const float* lhs, const float* rhs, float* result, uint8_t* null_map,
+                   size_t size);
+
+// Column % constant. Every null_map entry is set to (rhs == 0). When rhs is zero `result` is
+// not written; otherwise result[i] = fmod(lhs[i], rhs) for each i in [0, size).
+void vector_constant(const double* lhs, double rhs, double* result, uint8_t* null_map, size_t size);
+void vector_constant(const float* lhs, float rhs, float* result, uint8_t* null_map, size_t size);
+
+// Constant % column. For each i in [0, size): null_map[i] = (rhs[i] == 0) and
+// result[i] = fmod(lhs, rhs[i]), where a zero divisor is replaced by 1 before the call.
+void constant_vector(double lhs, const double* rhs, double* result, uint8_t* null_map, size_t size);
+void constant_vector(float lhs, const float* rhs, float* result, uint8_t* null_map, size_t size);
+
+} // namespace doris::fmod_fast
diff --git a/be/src/exprs/function/modulo.cpp b/be/src/exprs/function/modulo.cpp
index e6f93659387432..8942475e75b467 100644
--- a/be/src/exprs/function/modulo.cpp
+++ b/be/src/exprs/function/modulo.cpp
@@ -33,6 +33,7 @@
 #include "core/types.h"
 #include "core/value/decimalv2_value.h"
 #include "exprs/function/cast_type_to_either.h"
+#include "exprs/function/fmod_fast.h"
 #include "exprs/function/simple_function_factory.h"
 
 namespace doris {
@@ -397,9 +398,13 @@ struct ModNumericImpl {
         auto& b = column_right_ptr->get_data();
         auto& c = column_result->get_data();
         auto& n = null_map->get_data();
-        size_t size = b.size();
-        for (size_t i = 0; i < size; ++i) {
-            c[i] = Impl::apply(a, b[i], n[i]);
+        if constexpr (requires { Impl::apply(a, b, c, n); }) {
+            Impl::apply(a, b, c, n);
+        } else {
+            size_t size = b.size();
+            for (size_t i = 0; i < size; ++i) {
+                c[i] = Impl::apply(a, b[i], n[i]);
+            }
         }
         return ColumnNullable::create(std::move(column_result), std::move(null_map));
     }
@@ -416,9 +421,13 @@ struct ModNumericImpl {
         auto& b = column_right_ptr->get_data();
         auto& c = column_result->get_data();
         auto& n = null_map->get_data();
-        size_t size = a.size();
-        for (size_t i = 0; i < size; ++i) {
-            c[i] = Impl::apply(a[i], b[i], n[i]);
+        if constexpr (requires { Impl::apply(a, b, c, n); }) {
+            Impl::apply(a, b, c, n);
+        } else {
+            size_t size = a.size();
+            for (size_t i = 0; i < size; ++i) {
+                c[i] = Impl::apply(a[i], b[i], n[i]);
+            }
         }
         return ColumnNullable::create(std::move(column_result), std::move(null_map));
     }
@@ -441,17 +450,42 @@ struct ModuloNumericImpl {
+    // Column % constant. Floating-point types are forwarded to fmod_fast::vector_constant, which
+    // fills null_map itself. Other types set every null_map entry to (b == 0), return early on a
+    // zero divisor, and otherwise apply `%` per row after throw_if_division_leads_to_FPE.
     static void apply(const typename ColumnType::Container& a, ArgB b,
                       typename ColumnType::Container& c, PaddedPODArray<UInt8>& null_map) {
         size_t size = c.size();
-        UInt8 is_null = b == 0;
-        memset(null_map.data(), is_null, sizeof(UInt8) * size);
-
-        if (!is_null) {
+        if constexpr (is_float_or_double(Type)) {
+            fmod_fast::vector_constant(a.data(), b, c.data(), null_map.data(), size);
+        } else {
+            UInt8 is_null = b == 0;
+            memset(null_map.data(), is_null, sizeof(UInt8) * size);
+            if (is_null) {
+                return;
+            }
             for (size_t i = 0; i < size; i++) {
-                if constexpr (is_float_or_double(Type)) {
-                    c[i] = std::fmod((double)a[i], (double)b);
-                } else {
-                    throw_if_division_leads_to_FPE(a[i], b);
-                    c[i] = a[i] % b;
-                }
+                throw_if_division_leads_to_FPE(a[i], b);
+                c[i] = a[i] % b;
+            }
+        }
+    }
+
+    // Constant % column: `a` is the single dividend, `b` holds one divisor per row and the row
+    // count is taken from `c`. Floating-point types go through fmod_fast::constant_vector; every
+    // other type calls the scalar apply(a, b[i], null_map[i]) row by row.
+    static void apply(ArgA a, const typename ColumnType::Container& b,
+                      typename ColumnType::Container& c, PaddedPODArray<UInt8>& null_map) {
+        size_t size = c.size();
+        if constexpr (is_float_or_double(Type)) {
+            fmod_fast::constant_vector(a, b.data(), c.data(), null_map.data(), size);
+        } else {
+            for (size_t i = 0; i < size; ++i) {
+                c[i] = apply(a, b[i], null_map[i]);
+            }
+        }
+    }
+
+    // Column % column; the row count is taken from `c`. Floating-point types go through
+    // fmod_fast::vector_vector; every other type calls the scalar
+    // apply(a[i], b[i], null_map[i]) row by row.
+    static void apply(const typename ColumnType::Container& a,
+                      const typename ColumnType::Container& b, typename ColumnType::Container& c,
+                      PaddedPODArray<UInt8>& null_map) {
+        size_t size = c.size();
+        if constexpr (is_float_or_double(Type)) {
+            fmod_fast::vector_vector(a.data(), b.data(), c.data(), null_map.data(), size);
+        } else {
+            for (size_t i = 0; i < size; ++i) {
+                c[i] = apply(a[i], b[i], null_map[i]);
             }
         }
     }
@@ -462,7 +496,7 @@ struct ModuloNumericImpl {
         b += is_null;
 
         if constexpr (is_float_or_double(Type)) {
-            return std::fmod((double)a, (double)b);
+            return fmod_fast::scalar(a, b);
         } else {
             throw_if_division_leads_to_FPE(a, b);
             return a % b;
diff --git a/be/test/exprs/function/function_fmod_fast_test.cpp b/be/test/exprs/function/function_fmod_fast_test.cpp
new file mode 100644
index 00000000000000..ac4f7c0dd57f69
--- /dev/null
+++ b/be/test/exprs/function/function_fmod_fast_test.cpp
@@ -0,0 +1,353 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <bit>
+#include <cmath>
+#include <cstdint>
+#include <limits>
+#include <random>
+#include <string>
+#include <vector>
+
+#include "core/data_type/data_type_number.h"
+#include "core/types.h"
+#include "exprs/function/fmod_fast.h"
+#include "exprs/function/function_test_util.h"
+#include "testutil/any_type.h"
+
+namespace doris {
+namespace {
+
+uint64_t bits(double v) { // raw 64-bit pattern of `v`, used for bit-exact comparisons
+    return std::bit_cast<uint64_t>(v);
+}
+
+uint32_t bits(float v) { // raw 32-bit pattern of `v`, used for bit-exact comparisons
+    return std::bit_cast<uint32_t>(v);
+}
+
+void expect_same_double(double actual, double expected, double lhs, double rhs) {
+    if (!std::isnan(expected)) { // non-NaN reference: the result must match bit for bit
+        ASSERT_EQ(bits(expected), bits(actual)) << "lhs=" << lhs << " rhs=" << rhs
+                                                << " expected=" << expected << " actual=" << actual;
+        return;
+    }
+    ASSERT_TRUE(std::isnan(actual)) << "lhs=" << lhs << " rhs=" << rhs; // NaN reference: any NaN passes
+}
+
+void expect_same_float(float actual, float expected, float lhs, float rhs) {
+    if (!std::isnan(expected)) { // non-NaN reference: the result must match bit for bit
+        ASSERT_EQ(bits(expected), bits(actual)) << "lhs=" << lhs << " rhs=" << rhs
+                                                << " expected=" << expected << " actual=" << actual;
+        return;
+    }
+    ASSERT_TRUE(std::isnan(actual)) << "lhs=" << lhs << " rhs=" << rhs; // NaN reference: any NaN passes
+}
+
+double reference_fmod(double lhs, double rhs) { // double oracle: plain std::fmod
+    return std::fmod(lhs, rhs);
+}
+
+float reference_fmod(float lhs, float rhs) { // float oracle: std::fmod in double, then narrowed
+    return static_cast<float>(std::fmod(static_cast<double>(lhs), static_cast<double>(rhs)));
+}
+
+template <typename T>
+std::vector<T> interesting_values(); // declaration only; defined by the double/float specializations
+
+template <>
+std::vector<double> interesting_values<double>() { // edge-case operands, each in both signs, plus NaN
+    const double nan = std::numeric_limits<double>::quiet_NaN();
+    const double inf = std::numeric_limits<double>::infinity();
+    return {0.0,
+            -0.0, // negative zero
+            1.0,
+            -1.0,
+            2.0,
+            -2.0,
+            2.5,
+            -2.5,
+            1000.1575,
+            -1000.1575,
+            44'728'676'500.0, // large magnitude
+            -44'728'676'500.0,
+            std::numeric_limits<double>::min(), // smallest positive normal
+            -std::numeric_limits<double>::min(),
+            std::numeric_limits<double>::denorm_min(), // smallest positive subnormal
+            -std::numeric_limits<double>::denorm_min(),
+            std::numeric_limits<double>::max(), // largest finite
+            -std::numeric_limits<double>::max(),
+            inf,
+            -inf,
+            nan};
+}
+
+template <>
+std::vector<float> interesting_values<float>() { // edge-case operands, each in both signs, plus NaN
+    const float nan = std::numeric_limits<float>::quiet_NaN();
+    const float inf = std::numeric_limits<float>::infinity();
+    return {0.0F,
+            -0.0F, // negative zero
+            1.0F,
+            -1.0F,
+            2.0F,
+            -2.0F,
+            2.5F,
+            -2.5F,
+            1000.1575F,
+            -1000.1575F,
+            1.0e10F, // large magnitude
+            -1.0e10F,
+            std::numeric_limits<float>::min(), // smallest positive normal
+            -std::numeric_limits<float>::min(),
+            std::numeric_limits<float>::denorm_min(), // smallest positive subnormal
+            -std::numeric_limits<float>::denorm_min(),
+            std::numeric_limits<float>::max(), // largest finite
+            -std::numeric_limits<float>::max(),
+            inf,
+            -inf,
+            nan};
+}
+
+template <typename T>
+void check_scalar_pair(T lhs, T rhs); // declaration only; defined by the double/float specializations
+
+template <>
+void check_scalar_pair<double>(double lhs, double rhs) { // fmod_fast::scalar vs reference_fmod
+    expect_same_double(fmod_fast::scalar(lhs, rhs), reference_fmod(lhs, rhs), lhs, rhs);
+}
+
+template <>
+void check_scalar_pair<float>(float lhs, float rhs) { // fmod_fast::scalar vs reference_fmod
+    expect_same_float(fmod_fast::scalar(lhs, rhs), reference_fmod(lhs, rhs), lhs, rhs);
+}
+
+template <typename T>
+void check_scalar_corner_cases() { // checks every ordered pair drawn from interesting_values<T>()
+    const auto values = interesting_values<T>();
+    for (T lhs : values) {
+        for (T rhs : values) {
+            ASSERT_NO_FATAL_FAILURE(check_scalar_pair(lhs, rhs)); // stop at the first fatal mismatch
+        }
+    }
+}
+
+template <typename T>
+void check_actual_load_distribution() { // scalar checks on db / in_one / in_ten operand shapes
+    constexpr double db_scales[] = {1234.4500, 1876.2222, 8945.7353, 5612.6245, 4646.7853,
+                                    6523.5285, 1000.1575, 6555.5678, 2587.8535, 3754.2575};
+    for (double scale : db_scales) {
+        for (int64_t row = 1; row <= 5'000'000; row += 9973) { // strided sample of the row range
+            T db = static_cast<T>(static_cast<double>(row) * scale);
+            T in_one = static_cast<T>(static_cast<double>(row) * 2e-7); // at most 1.0 before the cast
+            T in_ten = static_cast<T>(static_cast<double>(row) * 2e-6); // at most 10.0 before the cast
+            ASSERT_NO_FATAL_FAILURE(check_scalar_pair(db, db)); // stop at the first fatal mismatch
+            ASSERT_NO_FATAL_FAILURE(check_scalar_pair(in_one, db));
+            ASSERT_NO_FATAL_FAILURE(check_scalar_pair(db, in_one));
+            ASSERT_NO_FATAL_FAILURE(check_scalar_pair(db, in_ten));
+        }
+    }
+}
+
+template <typename T>
+void check_random_finite_distribution() { // fixed-seed random finite operands vs the reference
+    std::mt19937_64 rng(0x9e3779b97f4a7c15ULL);
+    std::uniform_real_distribution<double> large(-4.5e10, 4.5e10);
+    std::uniform_real_distribution<double> small(-10.0, 10.0);
+    std::uniform_real_distribution<double> tiny(-1e-200, 1e-200);
+    for (int i = 0; i < 20000; ++i) {
+        T lhs = static_cast<T>(large(rng));
+        T rhs = static_cast<T>(small(rng));
+        rhs = (rhs == T(0)) ? static_cast<T>(0.125) : rhs; // keep the divisor non-zero
+        const T nonzero_lhs = (lhs == T(0)) ? static_cast<T>(1) : lhs; // divisor of the second check
+        // A fatal ASSERT_* inside the helper only exits the helper, so propagate it to stop here.
+        ASSERT_NO_FATAL_FAILURE(check_scalar_pair(lhs, rhs));
+        ASSERT_NO_FATAL_FAILURE(check_scalar_pair(static_cast<T>(small(rng)), nonzero_lhs));
+        ASSERT_NO_FATAL_FAILURE(check_scalar_pair(static_cast<T>(tiny(rng)), rhs));
+    }
+}
+
+template <typename T>
+void fill_batch_inputs(std::vector<T>* lhs, std::vector<T>* rhs) { // appends paired operands
+    const auto values = interesting_values<T>();
+    for (const T left : values) { // every ordered pair of edge-case values
+        for (const T right : values) {
+            lhs->push_back(left);
+            rhs->push_back(right);
+        }
+    }
+
+    constexpr double db_scales[] = {1234.4500, 1876.2222, 8945.7353, 5612.6245, 4646.7853,
+                                    6523.5285, 1000.1575, 6555.5678, 2587.8535, 3754.2575};
+    for (double scale : db_scales) {
+        for (int64_t row = 1; row <= 5'000'000; row += 1543) { // strided sample of the row range
+            T db = static_cast<T>(static_cast<double>(row) * scale);
+            T in_one = static_cast<T>(static_cast<double>(row) * 2e-7);
+            T in_ten = static_cast<T>(static_cast<double>(row) * 2e-6);
+            lhs->push_back(db); // db % db
+            rhs->push_back(db);
+            lhs->push_back(in_one); // in_one % db (in_one is already T, no cast needed)
+            rhs->push_back(db);
+            lhs->push_back(db); // db % in_one
+            rhs->push_back(in_one);
+            lhs->push_back(db); // db % in_ten
+            rhs->push_back(in_ten);
+        }
+    }
+}
+
+template <typename T>
+void check_batch_vector_vector(); // declaration only; defined by the double/float specializations
+
+template <>
+void check_batch_vector_vector<double>() { // vector_vector kernel vs per-row reference, double
+    std::vector<double> lhs;
+    std::vector<double> rhs;
+    fill_batch_inputs(&lhs, &rhs);
+    std::vector<double> result(lhs.size());
+    std::vector<uint8_t> null_map(lhs.size());
+    fmod_fast::vector_vector(lhs.data(), rhs.data(), result.data(), null_map.data(), lhs.size());
+    for (size_t i = 0; i < lhs.size(); ++i) {
+        uint8_t expected_null = rhs[i] == 0.0; // true for +0.0 and -0.0 divisors
+        ASSERT_EQ(expected_null, null_map[i]) << i;
+        double adjusted_rhs = rhs[i] + static_cast<double>(expected_null); // zero divisor becomes 1
+        const double expected = reference_fmod(lhs[i], adjusted_rhs);
+        ASSERT_NO_FATAL_FAILURE(expect_same_double(result[i], expected, lhs[i], adjusted_rhs));
+    }
+}
+
+template <>
+void check_batch_vector_vector<float>() { // vector_vector kernel vs per-row reference, float
+    std::vector<float> lhs;
+    std::vector<float> rhs;
+    fill_batch_inputs(&lhs, &rhs);
+    std::vector<float> result(lhs.size());
+    std::vector<uint8_t> null_map(lhs.size());
+    fmod_fast::vector_vector(lhs.data(), rhs.data(), result.data(), null_map.data(), lhs.size());
+    for (size_t i = 0; i < lhs.size(); ++i) {
+        uint8_t expected_null = rhs[i] == 0.0F; // true for +0.0 and -0.0 divisors
+        ASSERT_EQ(expected_null, null_map[i]) << i;
+        float adjusted_rhs = rhs[i] + static_cast<float>(expected_null); // zero divisor becomes 1
+        const float expected = reference_fmod(lhs[i], adjusted_rhs);
+        ASSERT_NO_FATAL_FAILURE(expect_same_float(result[i], expected, lhs[i], adjusted_rhs));
+    }
+}
+
+template <typename T>
+void check_batch_const_shapes(); // declaration only; defined by the double/float specializations
+
+template <>
+void check_batch_const_shapes<double>() { // vector_constant and constant_vector kernels, double
+    std::vector<double> lhs;
+    std::vector<double> rhs;
+    fill_batch_inputs(&lhs, &rhs);
+    std::vector<double> result(lhs.size(), -777.0); // sentinel: reveals any write to the output
+    std::vector<uint8_t> null_map(lhs.size());
+    fmod_fast::vector_constant(lhs.data(), 0.0, result.data(), null_map.data(), lhs.size());
+    for (size_t i = 0; i < lhs.size(); ++i) { // zero constant divisor: all null, output untouched
+        ASSERT_EQ(1, null_map[i]) << i;
+        ASSERT_EQ(bits(-777.0), bits(result[i])) << i;
+    }
+    // Non-zero constant divisor: no row is null and each result matches the reference.
+    fmod_fast::vector_constant(lhs.data(), 0.125, result.data(), null_map.data(), lhs.size());
+    for (size_t i = 0; i < lhs.size(); ++i) {
+        ASSERT_EQ(0, null_map[i]) << i;
+        ASSERT_NO_FATAL_FAILURE(
+                expect_same_double(result[i], reference_fmod(lhs[i], 0.125), lhs[i], 0.125));
+    }
+    // Constant dividend: a row is null exactly when its divisor compares equal to zero.
+    fmod_fast::constant_vector(12345.678, rhs.data(), result.data(), null_map.data(), rhs.size());
+    for (size_t i = 0; i < rhs.size(); ++i) {
+        uint8_t expected_null = rhs[i] == 0.0;
+        ASSERT_EQ(expected_null, null_map[i]) << i;
+        double adjusted_rhs = rhs[i] + static_cast<double>(expected_null); // zero divisor becomes 1
+        ASSERT_NO_FATAL_FAILURE(expect_same_double(
+                result[i], reference_fmod(12345.678, adjusted_rhs), 12345.678, adjusted_rhs));
+    }
+}
+
+template <>
+void check_batch_const_shapes<float>() { // vector_constant and constant_vector kernels, float
+    std::vector<float> lhs;
+    std::vector<float> rhs;
+    fill_batch_inputs(&lhs, &rhs);
+    std::vector<float> result(lhs.size(), -777.0F); // sentinel: reveals any write to the output
+    std::vector<uint8_t> null_map(lhs.size());
+    fmod_fast::vector_constant(lhs.data(), 0.0F, result.data(), null_map.data(), lhs.size());
+    for (size_t i = 0; i < lhs.size(); ++i) { // zero constant divisor: all null, output untouched
+        ASSERT_EQ(1, null_map[i]) << i;
+        ASSERT_EQ(bits(-777.0F), bits(result[i])) << i;
+    }
+    // Non-zero constant divisor: no row is null and each result matches the reference.
+    fmod_fast::vector_constant(lhs.data(), 0.125F, result.data(), null_map.data(), lhs.size());
+    for (size_t i = 0; i < lhs.size(); ++i) {
+        ASSERT_EQ(0, null_map[i]) << i;
+        ASSERT_NO_FATAL_FAILURE(
+                expect_same_float(result[i], reference_fmod(lhs[i], 0.125F), lhs[i], 0.125F));
+    }
+    // Constant dividend: a row is null exactly when its divisor compares equal to zero.
+    fmod_fast::constant_vector(12345.678F, rhs.data(), result.data(), null_map.data(), rhs.size());
+    for (size_t i = 0; i < rhs.size(); ++i) {
+        uint8_t expected_null = rhs[i] == 0.0F;
+        ASSERT_EQ(expected_null, null_map[i]) << i;
+        float adjusted_rhs = rhs[i] + static_cast<float>(expected_null); // zero divisor becomes 1
+        ASSERT_NO_FATAL_FAILURE(expect_same_float(
+                result[i], reference_fmod(12345.678F, adjusted_rhs), 12345.678F, adjusted_rhs));
+    }
+}
+
+} // namespace
+
+TEST(FunctionFmodFastTest, ScalarCornerCasesMatchStdFmod) { // edge-value pairs, double then float
+    check_scalar_corner_cases<double>();
+    check_scalar_corner_cases<float>();
+}
+
+TEST(FunctionFmodFastTest, ActualLoadDistributionMatchesStdFmod) { // scaled-row operands, both widths
+    check_actual_load_distribution<double>();
+    check_actual_load_distribution<float>();
+}
+
+TEST(FunctionFmodFastTest, RandomFiniteDistributionMatchesStdFmod) { // seeded random operands
+    check_random_finite_distribution<double>();
+    check_random_finite_distribution<float>();
+}
+
+TEST(FunctionFmodFastTest, BatchVectorVectorMatchesStdFmod) { // column % column batch kernel
+    check_batch_vector_vector<double>();
+    check_batch_vector_vector<float>();
+}
+
+TEST(FunctionFmodFastTest, BatchConstShapesMatchStdFmod) { // batch kernels with one constant operand
+    check_batch_const_shapes<double>();
+    check_batch_const_shapes<float>();
+}
+
+TEST(FunctionFmodFastTest, DorisFunctionNullSemanticsStayUnchanged) { // rows run via check_function
+    InputTypeSet input_types = {PrimitiveType::TYPE_DOUBLE, PrimitiveType::TYPE_DOUBLE};
+    DataSet data_set = {
+            {{5.5, 2.0}, reference_fmod(5.5, 2.0)},
+            {{-5.5, 2.0}, reference_fmod(-5.5, 2.0)},
+            {{5.5, -2.0}, reference_fmod(5.5, -2.0)},
+            {{1.0, 0.0}, Null()}, // zero divisor: expected result is Null
+            {{0.0, 0.0}, Null()},
+            {{44'728'676'500.0, 0.9999998}, reference_fmod(44'728'676'500.0, 0.9999998)}};
+    static_cast<void>(check_function<DataTypeFloat64, true>("fmod", input_types, data_set)); // result discarded
+    static_cast<void>(check_function<DataTypeFloat64, true>("mod", input_types, data_set)); // same rows via "mod"
+}
+
+} // namespace doris