diff --git a/be/benchmark/benchmark_fmod.hpp b/be/benchmark/benchmark_fmod.hpp new file mode 100644 index 00000000000000..ca9718fa94b4db --- /dev/null +++ b/be/benchmark/benchmark_fmod.hpp @@ -0,0 +1,442 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include + +#include +#include +#include +#include +#include +#include + +#include "exprs/function/fmod_fast.h" + +namespace doris { +namespace { + +enum FmodBenchCase { + DB_DB = 0, + IN_ONE_DB = 1, + DB_IN_ONE = 2, + DB_IN_TEN = 3, + MIXED_SIGNS_AND_ZEROS = 4, +}; + +void fill_actual_load_sample(int case_id, size_t size, std::vector* lhs, + std::vector* rhs) { + lhs->resize(size); + rhs->resize(size); + constexpr double db_scales[] = {1234.4500, 1876.2222, 8945.7353, 5612.6245, 4646.7853, + 6523.5285, 1000.1575, 6555.5678, 2587.8535, 3754.2575}; + constexpr uint64_t rows_per_load = 5'000'000; + constexpr uint64_t total_rows = 50'000'000; + + std::mt19937_64 rng(0xadc83b19ULL); + std::uniform_real_distribution signed_large(-4.5e10, 4.5e10); + std::uniform_real_distribution signed_small(-10.0, 10.0); + + for (size_t i = 0; i < size; ++i) { + uint64_t virtual_row = (static_cast(i) * total_rows) / size; + uint64_t block = std::min(virtual_row / rows_per_load, 9); + double row_num = static_cast((virtual_row % rows_per_load) + 1); + double db = row_num * db_scales[block]; + double in_one = row_num * 2e-7; + double in_ten = row_num * 2e-6; + + switch (case_id) { + case DB_DB: + (*lhs)[i] = db; + (*rhs)[i] = db; + break; + case IN_ONE_DB: + (*lhs)[i] = in_one; + (*rhs)[i] = db; + break; + case DB_IN_ONE: + (*lhs)[i] = db; + (*rhs)[i] = in_one; + break; + case DB_IN_TEN: + (*lhs)[i] = db; + (*rhs)[i] = in_ten; + break; + case MIXED_SIGNS_AND_ZEROS: + (*lhs)[i] = signed_large(rng); + (*rhs)[i] = i % 97 == 0 ? 0.0 : signed_small(rng); + break; + default: + (*lhs)[i] = db; + (*rhs)[i] = in_one; + break; + } + } +} + +void fill_actual_load_sample_float(int case_id, size_t size, std::vector* lhs, + std::vector* rhs) { + std::vector lhs_double; + std::vector rhs_double; + fill_actual_load_sample(case_id, size, &lhs_double, &rhs_double); + + lhs->resize(size); + rhs->resize(size); + for (size_t i = 0; i < size; ++i) { + (*lhs)[i] = static_cast(lhs_double[i]); + (*rhs)[i] = static_cast(rhs_double[i]); + } +} + +void std_vector_vector(const double* lhs, const double* rhs, double* result, uint8_t* null_map, + size_t size) { + for (size_t i = 0; i < size; ++i) { + uint8_t is_null = rhs[i] == 0.0; + null_map[i] = is_null; + result[i] = std::fmod(lhs[i], rhs[i] + static_cast(is_null)); + } +} + +void std_vector_vector(const float* lhs, const float* rhs, float* result, uint8_t* null_map, + size_t size) { + for (size_t i = 0; i < size; ++i) { + uint8_t is_null = rhs[i] == 0.0F; + null_map[i] = is_null; + float adjusted_rhs = rhs[i] + static_cast(is_null); + result[i] = static_cast( + std::fmod(static_cast(lhs[i]), static_cast(adjusted_rhs))); + } +} + +void std_vector_constant(const double* lhs, double rhs, double* result, uint8_t* null_map, + size_t size) { + uint8_t is_null = rhs == 0.0; + memset(null_map, is_null, size); + if (is_null) { + return; + } + for (size_t i = 0; i < size; ++i) { + result[i] = std::fmod(lhs[i], rhs); + } +} + +void std_vector_constant(const float* lhs, float rhs, float* result, uint8_t* null_map, + size_t size) { + uint8_t is_null = rhs == 0.0F; + memset(null_map, is_null, size); + if (is_null) { + return; + } + for (size_t i = 0; i < size; ++i) { + result[i] = static_cast( + std::fmod(static_cast(lhs[i]), static_cast(rhs))); + } +} + +void std_constant_vector(double lhs, const double* rhs, double* result, uint8_t* null_map, + size_t size) { + for (size_t i = 0; i < size; ++i) { + uint8_t is_null = rhs[i] == 0.0; + null_map[i] = is_null; + result[i] = std::fmod(lhs, rhs[i] + static_cast(is_null)); + } +} + +void std_constant_vector(float lhs, const float* rhs, float* result, uint8_t* null_map, + size_t size) { + for (size_t i = 0; i < size; ++i) { + uint8_t is_null = rhs[i] == 0.0F; + null_map[i] = is_null; + float adjusted_rhs = rhs[i] + static_cast(is_null); + result[i] = static_cast( + std::fmod(static_cast(lhs), static_cast(adjusted_rhs))); + } +} + +void benchmark_args(benchmark::internal::Benchmark* b) { + constexpr int64_t rows = 1 << 20; + b->Args({DB_DB, rows}) + ->Args({IN_ONE_DB, rows}) + ->Args({DB_IN_ONE, rows}) + ->Args({DB_IN_TEN, rows}) + ->Args({MIXED_SIGNS_AND_ZEROS, rows}) + ->Unit(benchmark::kMillisecond) + ->UseRealTime() + ->Repetitions(5) + ->DisplayAggregatesOnly(); +} + +void benchmark_const_args(benchmark::internal::Benchmark* b) { + constexpr int64_t rows = 1 << 20; + b->Arg(rows) + ->Unit(benchmark::kMillisecond) + ->UseRealTime() + ->Repetitions(5) + ->DisplayAggregatesOnly(); +} + +static void BM_FmodDoubleVectorVectorStd(benchmark::State& state) { + std::vector lhs; + std::vector rhs; + fill_actual_load_sample(static_cast(state.range(0)), state.range(1), &lhs, &rhs); + std::vector result(lhs.size()); + std::vector null_map(lhs.size()); + + for (auto _ : state) { + std_vector_vector(lhs.data(), rhs.data(), result.data(), null_map.data(), lhs.size()); + benchmark::ClobberMemory(); + } + state.SetItemsProcessed(state.iterations() * static_cast(lhs.size())); +} + +static void BM_FmodDoubleVectorVectorFast(benchmark::State& state) { + std::vector lhs; + std::vector rhs; + fill_actual_load_sample(static_cast(state.range(0)), state.range(1), &lhs, &rhs); + std::vector result(lhs.size()); + std::vector null_map(lhs.size()); + + for (auto _ : state) { + fmod_fast::vector_vector(lhs.data(), rhs.data(), result.data(), null_map.data(), + lhs.size()); + benchmark::ClobberMemory(); + } + state.SetItemsProcessed(state.iterations() * static_cast(lhs.size())); +} + +static void BM_FmodDoubleVectorConstantStd(benchmark::State& state) { + std::vector lhs; + std::vector rhs; + fill_actual_load_sample(DB_IN_ONE, state.range(0), &lhs, &rhs); + std::vector result(lhs.size()); + std::vector null_map(lhs.size()); + + for (auto _ : state) { + std_vector_constant(lhs.data(), 0.9999998, result.data(), null_map.data(), lhs.size()); + benchmark::ClobberMemory(); + } + state.SetItemsProcessed(state.iterations() * static_cast(lhs.size())); +} + +static void BM_FmodDoubleVectorConstantFast(benchmark::State& state) { + std::vector lhs; + std::vector rhs; + fill_actual_load_sample(DB_IN_ONE, state.range(0), &lhs, &rhs); + std::vector result(lhs.size()); + std::vector null_map(lhs.size()); + + for (auto _ : state) { + fmod_fast::vector_constant(lhs.data(), 0.9999998, result.data(), null_map.data(), + lhs.size()); + benchmark::ClobberMemory(); + } + state.SetItemsProcessed(state.iterations() * static_cast(lhs.size())); +} + +static void BM_FmodDoubleConstZeroStd(benchmark::State& state) { + std::vector lhs; + std::vector rhs; + fill_actual_load_sample(DB_IN_ONE, state.range(0), &lhs, &rhs); + std::vector result(lhs.size(), -777.0); + std::vector null_map(lhs.size()); + + for (auto _ : state) { + std_vector_constant(lhs.data(), 0.0, result.data(), null_map.data(), lhs.size()); + benchmark::ClobberMemory(); + } + state.SetItemsProcessed(state.iterations() * static_cast(lhs.size())); +} + +static void BM_FmodDoubleConstZeroFast(benchmark::State& state) { + std::vector lhs; + std::vector rhs; + fill_actual_load_sample(DB_IN_ONE, state.range(0), &lhs, &rhs); + std::vector result(lhs.size(), -777.0); + std::vector null_map(lhs.size()); + + for (auto _ : state) { + fmod_fast::vector_constant(lhs.data(), 0.0, result.data(), null_map.data(), lhs.size()); + benchmark::ClobberMemory(); + } + state.SetItemsProcessed(state.iterations() * static_cast(lhs.size())); +} + +static void BM_FmodDoubleConstantVectorStd(benchmark::State& state) { + std::vector lhs; + std::vector rhs; + fill_actual_load_sample(IN_ONE_DB, state.range(0), &lhs, &rhs); + std::vector result(lhs.size()); + std::vector null_map(lhs.size()); + + for (auto _ : state) { + std_constant_vector(12345.678, rhs.data(), result.data(), null_map.data(), rhs.size()); + benchmark::ClobberMemory(); + } + state.SetItemsProcessed(state.iterations() * static_cast(rhs.size())); +} + +static void BM_FmodDoubleConstantVectorFast(benchmark::State& state) { + std::vector lhs; + std::vector rhs; + fill_actual_load_sample(IN_ONE_DB, state.range(0), &lhs, &rhs); + std::vector result(lhs.size()); + std::vector null_map(lhs.size()); + + for (auto _ : state) { + fmod_fast::constant_vector(12345.678, rhs.data(), result.data(), null_map.data(), + rhs.size()); + benchmark::ClobberMemory(); + } + state.SetItemsProcessed(state.iterations() * static_cast(rhs.size())); +} + +static void BM_FmodFloatVectorVectorStd(benchmark::State& state) { + std::vector lhs; + std::vector rhs; + fill_actual_load_sample_float(static_cast(state.range(0)), state.range(1), &lhs, &rhs); + std::vector result(lhs.size()); + std::vector null_map(lhs.size()); + + for (auto _ : state) { + std_vector_vector(lhs.data(), rhs.data(), result.data(), null_map.data(), lhs.size()); + benchmark::ClobberMemory(); + } + state.SetItemsProcessed(state.iterations() * static_cast(lhs.size())); +} + +static void BM_FmodFloatVectorVectorFast(benchmark::State& state) { + std::vector lhs; + std::vector rhs; + fill_actual_load_sample_float(static_cast(state.range(0)), state.range(1), &lhs, &rhs); + std::vector result(lhs.size()); + std::vector null_map(lhs.size()); + + for (auto _ : state) { + fmod_fast::vector_vector(lhs.data(), rhs.data(), result.data(), null_map.data(), + lhs.size()); + benchmark::ClobberMemory(); + } + state.SetItemsProcessed(state.iterations() * static_cast(lhs.size())); +} + +static void BM_FmodFloatVectorConstantStd(benchmark::State& state) { + std::vector lhs; + std::vector rhs; + fill_actual_load_sample_float(DB_IN_ONE, state.range(0), &lhs, &rhs); + std::vector result(lhs.size()); + std::vector null_map(lhs.size()); + + for (auto _ : state) { + std_vector_constant(lhs.data(), 0.9999998F, result.data(), null_map.data(), lhs.size()); + benchmark::ClobberMemory(); + } + state.SetItemsProcessed(state.iterations() * static_cast(lhs.size())); +} + +static void BM_FmodFloatVectorConstantFast(benchmark::State& state) { + std::vector lhs; + std::vector rhs; + fill_actual_load_sample_float(DB_IN_ONE, state.range(0), &lhs, &rhs); + std::vector result(lhs.size()); + std::vector null_map(lhs.size()); + + for (auto _ : state) { + fmod_fast::vector_constant(lhs.data(), 0.9999998F, result.data(), null_map.data(), + lhs.size()); + benchmark::ClobberMemory(); + } + state.SetItemsProcessed(state.iterations() * static_cast(lhs.size())); +} + +static void BM_FmodFloatConstZeroStd(benchmark::State& state) { + std::vector lhs; + std::vector rhs; + fill_actual_load_sample_float(DB_IN_ONE, state.range(0), &lhs, &rhs); + std::vector result(lhs.size(), -777.0F); + std::vector null_map(lhs.size()); + + for (auto _ : state) { + std_vector_constant(lhs.data(), 0.0F, result.data(), null_map.data(), lhs.size()); + benchmark::ClobberMemory(); + } + state.SetItemsProcessed(state.iterations() * static_cast(lhs.size())); +} + +static void BM_FmodFloatConstZeroFast(benchmark::State& state) { + std::vector lhs; + std::vector rhs; + fill_actual_load_sample_float(DB_IN_ONE, state.range(0), &lhs, &rhs); + std::vector result(lhs.size(), -777.0F); + std::vector null_map(lhs.size()); + + for (auto _ : state) { + fmod_fast::vector_constant(lhs.data(), 0.0F, result.data(), null_map.data(), lhs.size()); + benchmark::ClobberMemory(); + } + state.SetItemsProcessed(state.iterations() * static_cast(lhs.size())); +} + +static void BM_FmodFloatConstantVectorStd(benchmark::State& state) { + std::vector lhs; + std::vector rhs; + fill_actual_load_sample_float(IN_ONE_DB, state.range(0), &lhs, &rhs); + std::vector result(lhs.size()); + std::vector null_map(lhs.size()); + + for (auto _ : state) { + std_constant_vector(12345.678F, rhs.data(), result.data(), null_map.data(), rhs.size()); + benchmark::ClobberMemory(); + } + state.SetItemsProcessed(state.iterations() * static_cast(rhs.size())); +} + +static void BM_FmodFloatConstantVectorFast(benchmark::State& state) { + std::vector lhs; + std::vector rhs; + fill_actual_load_sample_float(IN_ONE_DB, state.range(0), &lhs, &rhs); + std::vector result(lhs.size()); + std::vector null_map(lhs.size()); + + for (auto _ : state) { + fmod_fast::constant_vector(12345.678F, rhs.data(), result.data(), null_map.data(), + rhs.size()); + benchmark::ClobberMemory(); + } + state.SetItemsProcessed(state.iterations() * static_cast(rhs.size())); +} + +} // namespace + +BENCHMARK(BM_FmodDoubleVectorVectorStd)->Apply(benchmark_args); +BENCHMARK(BM_FmodDoubleVectorVectorFast)->Apply(benchmark_args); +BENCHMARK(BM_FmodDoubleVectorConstantStd)->Apply(benchmark_const_args); +BENCHMARK(BM_FmodDoubleVectorConstantFast)->Apply(benchmark_const_args); +BENCHMARK(BM_FmodDoubleConstZeroStd)->Apply(benchmark_const_args); +BENCHMARK(BM_FmodDoubleConstZeroFast)->Apply(benchmark_const_args); +BENCHMARK(BM_FmodDoubleConstantVectorStd)->Apply(benchmark_const_args); +BENCHMARK(BM_FmodDoubleConstantVectorFast)->Apply(benchmark_const_args); +BENCHMARK(BM_FmodFloatVectorVectorStd)->Apply(benchmark_args); +BENCHMARK(BM_FmodFloatVectorVectorFast)->Apply(benchmark_args); +BENCHMARK(BM_FmodFloatVectorConstantStd)->Apply(benchmark_const_args); +BENCHMARK(BM_FmodFloatVectorConstantFast)->Apply(benchmark_const_args); +BENCHMARK(BM_FmodFloatConstZeroStd)->Apply(benchmark_const_args); +BENCHMARK(BM_FmodFloatConstZeroFast)->Apply(benchmark_const_args); +BENCHMARK(BM_FmodFloatConstantVectorStd)->Apply(benchmark_const_args); +BENCHMARK(BM_FmodFloatConstantVectorFast)->Apply(benchmark_const_args); + +} // namespace doris diff --git a/be/benchmark/benchmark_main.cpp b/be/benchmark/benchmark_main.cpp index fa268fa2d0e759..d5bb21ba8e41b7 100644 --- a/be/benchmark/benchmark_main.cpp +++ b/be/benchmark/benchmark_main.cpp @@ -24,6 +24,7 @@ #include "benchmark_column_array_view_distance.hpp" #include "benchmark_column_view.hpp" #include "benchmark_fastunion.hpp" +#include "benchmark_fmod.hpp" #include "benchmark_hll_merge.hpp" #include "benchmark_hybrid_set.hpp" #include "benchmark_string.hpp" diff --git a/be/src/exprs/function/fmod_fast.cpp b/be/src/exprs/function/fmod_fast.cpp new file mode 100644 index 00000000000000..0f3673a90b20e3 --- /dev/null +++ b/be/src/exprs/function/fmod_fast.cpp @@ -0,0 +1,160 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "exprs/function/fmod_fast.h" + +#include + +#include + +#include "common/compiler_util.h" + +namespace doris::fmod_fast { +namespace { + +#if defined(__x86_64__) && (defined(__GNUC__) || defined(__clang__)) +#define DORIS_HAS_X87_FMOD_FAST 1 + +ALWAYS_INLINE inline double fmod_x87_fprem(double a, double b) { + double r; + asm volatile( + "fldl %[b]\n\t" + "fldl %[a]\n\t" + "1:\n\t" + "fprem\n\t" + "fnstsw %%ax\n\t" + "testb $4, %%ah\n\t" + "jne 1b\n\t" + "fstp %%st(1)\n\t" + "fstpl %[r]\n\t" + : [r] "=m"(r) + : [a] "m"(a), [b] "m"(b) + : "ax", "cc", "st"); + return r; +} +#else +#define DORIS_HAS_X87_FMOD_FAST 0 +#endif + +ALWAYS_INLINE inline double fmod_double(double a, double b) { +#if DORIS_HAS_X87_FMOD_FAST + if (b != 0.0 && std::isfinite(a) && std::isfinite(b)) { + double abs_a = std::fabs(a); + double abs_b = std::fabs(b); + if (abs_a < abs_b) { + return a; + } + if (abs_a == abs_b) { + return std::copysign(0.0, a); + } + return fmod_x87_fprem(a, b); + } +#endif + return std::fmod(a, b); +} + +ALWAYS_INLINE inline float fmod_float(float a, float b) { + return static_cast(fmod_double(static_cast(a), static_cast(b))); +} + +ALWAYS_INLINE inline double fmod_value(double a, double b) { + return fmod_double(a, b); +} + +ALWAYS_INLINE inline float fmod_value(float a, float b) { + return fmod_float(a, b); +} + +template +ALWAYS_INLINE inline void vector_vector_impl(const T* lhs, const T* rhs, T* result, + uint8_t* null_map, size_t size) { + for (size_t i = 0; i < size; ++i) { + uint8_t is_null = rhs[i] == T(0); + null_map[i] = is_null; + T adjusted_rhs = rhs[i] + static_cast(is_null); + result[i] = fmod_value(lhs[i], adjusted_rhs); + } +} + +template +ALWAYS_INLINE inline void vector_constant_impl(const T* lhs, T rhs, T* result, uint8_t* null_map, + size_t size) { + uint8_t is_null = rhs == T(0); + memset(null_map, is_null, size); + if (is_null) { + return; + } + + for (size_t i = 0; i < size; ++i) { + result[i] = fmod_value(lhs[i], rhs); + } +} + +template +ALWAYS_INLINE inline void constant_vector_impl(T lhs, const T* rhs, T* result, uint8_t* null_map, + size_t size) { + for (size_t i = 0; i < size; ++i) { + uint8_t is_null = rhs[i] == T(0); + null_map[i] = is_null; + T adjusted_rhs = rhs[i] + static_cast(is_null); + result[i] = fmod_value(lhs, adjusted_rhs); + } +} + +} // namespace + +bool is_x87_fast_path_enabled() { + return DORIS_HAS_X87_FMOD_FAST; +} + +double scalar(double a, double b) { + return fmod_double(a, b); +} + +float scalar(float a, float b) { + return fmod_float(a, b); +} + +void vector_vector(const double* lhs, const double* rhs, double* result, uint8_t* null_map, + size_t size) { + vector_vector_impl(lhs, rhs, result, null_map, size); +} + +void vector_vector(const float* lhs, const float* rhs, float* result, uint8_t* null_map, + size_t size) { + vector_vector_impl(lhs, rhs, result, null_map, size); +} + +void vector_constant(const double* lhs, double rhs, double* result, uint8_t* null_map, + size_t size) { + vector_constant_impl(lhs, rhs, result, null_map, size); +} + +void vector_constant(const float* lhs, float rhs, float* result, uint8_t* null_map, size_t size) { + vector_constant_impl(lhs, rhs, result, null_map, size); +} + +void constant_vector(double lhs, const double* rhs, double* result, uint8_t* null_map, + size_t size) { + constant_vector_impl(lhs, rhs, result, null_map, size); +} + +void constant_vector(float lhs, const float* rhs, float* result, uint8_t* null_map, size_t size) { + constant_vector_impl(lhs, rhs, result, null_map, size); +} + +} // namespace doris::fmod_fast diff --git a/be/src/exprs/function/fmod_fast.h b/be/src/exprs/function/fmod_fast.h new file mode 100644 index 00000000000000..6bcc7481165443 --- /dev/null +++ b/be/src/exprs/function/fmod_fast.h @@ -0,0 +1,41 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include + +namespace doris::fmod_fast { + +bool is_x87_fast_path_enabled(); + +double scalar(double a, double b); +float scalar(float a, float b); + +void vector_vector(const double* lhs, const double* rhs, double* result, uint8_t* null_map, + size_t size); +void vector_vector(const float* lhs, const float* rhs, float* result, uint8_t* null_map, + size_t size); + +void vector_constant(const double* lhs, double rhs, double* result, uint8_t* null_map, size_t size); +void vector_constant(const float* lhs, float rhs, float* result, uint8_t* null_map, size_t size); + +void constant_vector(double lhs, const double* rhs, double* result, uint8_t* null_map, size_t size); +void constant_vector(float lhs, const float* rhs, float* result, uint8_t* null_map, size_t size); + +} // namespace doris::fmod_fast diff --git a/be/src/exprs/function/modulo.cpp b/be/src/exprs/function/modulo.cpp index e6f93659387432..8942475e75b467 100644 --- a/be/src/exprs/function/modulo.cpp +++ b/be/src/exprs/function/modulo.cpp @@ -33,6 +33,7 @@ #include "core/types.h" #include "core/value/decimalv2_value.h" #include "exprs/function/cast_type_to_either.h" +#include "exprs/function/fmod_fast.h" #include "exprs/function/simple_function_factory.h" namespace doris { @@ -397,9 +398,13 @@ struct ModNumericImpl { auto& b = column_right_ptr->get_data(); auto& c = column_result->get_data(); auto& n = null_map->get_data(); - size_t size = b.size(); - for (size_t i = 0; i < size; ++i) { - c[i] = Impl::apply(a, b[i], n[i]); + if constexpr (requires { Impl::apply(a, b, c, n); }) { + Impl::apply(a, b, c, n); + } else { + size_t size = b.size(); + for (size_t i = 0; i < size; ++i) { + c[i] = Impl::apply(a, b[i], n[i]); + } } return ColumnNullable::create(std::move(column_result), std::move(null_map)); } @@ -416,9 +421,13 @@ struct ModNumericImpl { auto& b = column_right_ptr->get_data(); auto& c = column_result->get_data(); auto& n = null_map->get_data(); - size_t size = a.size(); - for (size_t i = 0; i < size; ++i) { - c[i] = Impl::apply(a[i], b[i], n[i]); + if constexpr (requires { Impl::apply(a, b, c, n); }) { + Impl::apply(a, b, c, n); + } else { + size_t size = a.size(); + for (size_t i = 0; i < size; ++i) { + c[i] = Impl::apply(a[i], b[i], n[i]); + } } return ColumnNullable::create(std::move(column_result), std::move(null_map)); } @@ -441,17 +450,42 @@ struct ModuloNumericImpl { static void apply(const typename ColumnType::Container& a, ArgB b, typename ColumnType::Container& c, PaddedPODArray& null_map) { size_t size = c.size(); - UInt8 is_null = b == 0; - memset(null_map.data(), is_null, sizeof(UInt8) * size); - - if (!is_null) { + if constexpr (is_float_or_double(Type)) { + fmod_fast::vector_constant(a.data(), b, c.data(), null_map.data(), size); + } else { + UInt8 is_null = b == 0; + memset(null_map.data(), is_null, sizeof(UInt8) * size); + if (is_null) { + return; + } for (size_t i = 0; i < size; i++) { - if constexpr (is_float_or_double(Type)) { - c[i] = std::fmod((double)a[i], (double)b); - } else { - throw_if_division_leads_to_FPE(a[i], b); - c[i] = a[i] % b; - } + throw_if_division_leads_to_FPE(a[i], b); + c[i] = a[i] % b; + } + } + } + + static void apply(ArgA a, const typename ColumnType::Container& b, + typename ColumnType::Container& c, PaddedPODArray& null_map) { + size_t size = c.size(); + if constexpr (is_float_or_double(Type)) { + fmod_fast::constant_vector(a, b.data(), c.data(), null_map.data(), size); + } else { + for (size_t i = 0; i < size; ++i) { + c[i] = apply(a, b[i], null_map[i]); + } + } + } + + static void apply(const typename ColumnType::Container& a, + const typename ColumnType::Container& b, typename ColumnType::Container& c, + PaddedPODArray& null_map) { + size_t size = c.size(); + if constexpr (is_float_or_double(Type)) { + fmod_fast::vector_vector(a.data(), b.data(), c.data(), null_map.data(), size); + } else { + for (size_t i = 0; i < size; ++i) { + c[i] = apply(a[i], b[i], null_map[i]); } } } @@ -462,7 +496,7 @@ struct ModuloNumericImpl { b += is_null; if constexpr (is_float_or_double(Type)) { - return std::fmod((double)a, (double)b); + return fmod_fast::scalar(a, b); } else { throw_if_division_leads_to_FPE(a, b); return a % b; diff --git a/be/test/exprs/function/function_fmod_fast_test.cpp b/be/test/exprs/function/function_fmod_fast_test.cpp new file mode 100644 index 00000000000000..ac4f7c0dd57f69 --- /dev/null +++ b/be/test/exprs/function/function_fmod_fast_test.cpp @@ -0,0 +1,353 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include +#include +#include +#include +#include + +#include "core/data_type/data_type_number.h" +#include "core/types.h" +#include "exprs/function/fmod_fast.h" +#include "exprs/function/function_test_util.h" +#include "testutil/any_type.h" + +namespace doris { +namespace { + +uint64_t bits(double v) { + return std::bit_cast(v); +} + +uint32_t bits(float v) { + return std::bit_cast(v); +} + +void expect_same_double(double actual, double expected, double lhs, double rhs) { + if (std::isnan(expected)) { + ASSERT_TRUE(std::isnan(actual)) << "lhs=" << lhs << " rhs=" << rhs; + } else { + ASSERT_EQ(bits(expected), bits(actual)) << "lhs=" << lhs << " rhs=" << rhs + << " expected=" << expected << " actual=" << actual; + } +} + +void expect_same_float(float actual, float expected, float lhs, float rhs) { + if (std::isnan(expected)) { + ASSERT_TRUE(std::isnan(actual)) << "lhs=" << lhs << " rhs=" << rhs; + } else { + ASSERT_EQ(bits(expected), bits(actual)) << "lhs=" << lhs << " rhs=" << rhs + << " expected=" << expected << " actual=" << actual; + } +} + +double reference_fmod(double lhs, double rhs) { + return std::fmod(lhs, rhs); +} + +float reference_fmod(float lhs, float rhs) { + return static_cast(std::fmod(static_cast(lhs), static_cast(rhs))); +} + +template +std::vector interesting_values(); + +template <> +std::vector interesting_values() { + const double nan = std::numeric_limits::quiet_NaN(); + const double inf = std::numeric_limits::infinity(); + return {0.0, + -0.0, + 1.0, + -1.0, + 2.0, + -2.0, + 2.5, + -2.5, + 1000.1575, + -1000.1575, + 44'728'676'500.0, + -44'728'676'500.0, + std::numeric_limits::min(), + -std::numeric_limits::min(), + std::numeric_limits::denorm_min(), + -std::numeric_limits::denorm_min(), + std::numeric_limits::max(), + -std::numeric_limits::max(), + inf, + -inf, + nan}; +} + +template <> +std::vector interesting_values() { + const float nan = std::numeric_limits::quiet_NaN(); + const float inf = std::numeric_limits::infinity(); + return {0.0F, + -0.0F, + 1.0F, + -1.0F, + 2.0F, + -2.0F, + 2.5F, + -2.5F, + 1000.1575F, + -1000.1575F, + 1.0e10F, + -1.0e10F, + std::numeric_limits::min(), + -std::numeric_limits::min(), + std::numeric_limits::denorm_min(), + -std::numeric_limits::denorm_min(), + std::numeric_limits::max(), + -std::numeric_limits::max(), + inf, + -inf, + nan}; +} + +template +void check_scalar_pair(T lhs, T rhs); + +template <> +void check_scalar_pair(double lhs, double rhs) { + expect_same_double(fmod_fast::scalar(lhs, rhs), reference_fmod(lhs, rhs), lhs, rhs); +} + +template <> +void check_scalar_pair(float lhs, float rhs) { + expect_same_float(fmod_fast::scalar(lhs, rhs), reference_fmod(lhs, rhs), lhs, rhs); +} + +template +void check_scalar_corner_cases() { + const auto values = interesting_values(); + for (T lhs : values) { + for (T rhs : values) { + check_scalar_pair(lhs, rhs); + } + } +} + +template +void check_actual_load_distribution() { + constexpr double db_scales[] = {1234.4500, 1876.2222, 8945.7353, 5612.6245, 4646.7853, + 6523.5285, 1000.1575, 6555.5678, 2587.8535, 3754.2575}; + for (double scale : db_scales) { + for (int64_t row = 1; row <= 5'000'000; row += 9973) { + T db = static_cast(static_cast(row) * scale); + T in_one = static_cast(static_cast(row) * 2e-7); + T in_ten = static_cast(static_cast(row) * 2e-6); + check_scalar_pair(db, db); + check_scalar_pair(in_one, db); + check_scalar_pair(db, in_one); + check_scalar_pair(db, in_ten); + } + } +} + +template +void check_random_finite_distribution() { + std::mt19937_64 rng(0x9e3779b97f4a7c15ULL); + std::uniform_real_distribution large(-4.5e10, 4.5e10); + std::uniform_real_distribution small(-10.0, 10.0); + std::uniform_real_distribution tiny(-1e-200, 1e-200); + for (int i = 0; i < 20000; ++i) { + T lhs = static_cast(large(rng)); + T rhs = static_cast(small(rng)); + if (rhs == T(0)) { + rhs = static_cast(0.125); + } + check_scalar_pair(lhs, rhs); + check_scalar_pair(static_cast(small(rng)), lhs == T(0) ? static_cast(1) : lhs); + check_scalar_pair(static_cast(tiny(rng)), rhs); + } +} + +template +void fill_batch_inputs(std::vector* lhs, std::vector* rhs) { + const auto values = interesting_values(); + for (size_t i = 0; i < values.size(); ++i) { + for (size_t j = 0; j < values.size(); ++j) { + lhs->push_back(values[i]); + rhs->push_back(values[j]); + } + } + + constexpr double db_scales[] = {1234.4500, 1876.2222, 8945.7353, 5612.6245, 4646.7853, + 6523.5285, 1000.1575, 6555.5678, 2587.8535, 3754.2575}; + for (double scale : db_scales) { + for (int64_t row = 1; row <= 5'000'000; row += 1543) { + T db = static_cast(static_cast(row) * scale); + T in_one = static_cast(static_cast(row) * 2e-7); + T in_ten = static_cast(static_cast(row) * 2e-6); + lhs->push_back(db); + rhs->push_back(db); + lhs->push_back(static_cast(in_one)); + rhs->push_back(db); + lhs->push_back(db); + rhs->push_back(in_one); + lhs->push_back(db); + rhs->push_back(in_ten); + } + } +} + +template +void check_batch_vector_vector(); + +template <> +void check_batch_vector_vector() { + std::vector lhs; + std::vector rhs; + fill_batch_inputs(&lhs, &rhs); + std::vector result(lhs.size()); + std::vector null_map(lhs.size()); + + fmod_fast::vector_vector(lhs.data(), rhs.data(), result.data(), null_map.data(), lhs.size()); + for (size_t i = 0; i < lhs.size(); ++i) { + uint8_t expected_null = rhs[i] == 0.0; + ASSERT_EQ(expected_null, null_map[i]) << i; + double adjusted_rhs = rhs[i] + static_cast(expected_null); + expect_same_double(result[i], reference_fmod(lhs[i], adjusted_rhs), lhs[i], adjusted_rhs); + } +} + +template <> +void check_batch_vector_vector() { + std::vector lhs; + std::vector rhs; + fill_batch_inputs(&lhs, &rhs); + std::vector result(lhs.size()); + std::vector null_map(lhs.size()); + + fmod_fast::vector_vector(lhs.data(), rhs.data(), result.data(), null_map.data(), lhs.size()); + for (size_t i = 0; i < lhs.size(); ++i) { + uint8_t expected_null = rhs[i] == 0.0F; + ASSERT_EQ(expected_null, null_map[i]) << i; + float adjusted_rhs = rhs[i] + static_cast(expected_null); + expect_same_float(result[i], reference_fmod(lhs[i], adjusted_rhs), lhs[i], adjusted_rhs); + } +} + +template +void check_batch_const_shapes(); + +template <> +void check_batch_const_shapes() { + std::vector lhs; + std::vector rhs; + fill_batch_inputs(&lhs, &rhs); + std::vector result(lhs.size(), -777.0); + std::vector null_map(lhs.size()); + + fmod_fast::vector_constant(lhs.data(), 0.0, result.data(), null_map.data(), lhs.size()); + for (size_t i = 0; i < lhs.size(); ++i) { + ASSERT_EQ(1, null_map[i]) << i; + ASSERT_EQ(bits(-777.0), bits(result[i])) << i; + } + + fmod_fast::vector_constant(lhs.data(), 0.125, result.data(), null_map.data(), lhs.size()); + for (size_t i = 0; i < lhs.size(); ++i) { + ASSERT_EQ(0, null_map[i]) << i; + expect_same_double(result[i], reference_fmod(lhs[i], 0.125), lhs[i], 0.125); + } + + fmod_fast::constant_vector(12345.678, rhs.data(), result.data(), null_map.data(), rhs.size()); + for (size_t i = 0; i < rhs.size(); ++i) { + uint8_t expected_null = rhs[i] == 0.0; + ASSERT_EQ(expected_null, null_map[i]) << i; + double adjusted_rhs = rhs[i] + static_cast(expected_null); + expect_same_double(result[i], reference_fmod(12345.678, adjusted_rhs), 12345.678, + adjusted_rhs); + } +} + +template <> +void check_batch_const_shapes() { + std::vector lhs; + std::vector rhs; + fill_batch_inputs(&lhs, &rhs); + std::vector result(lhs.size(), -777.0F); + std::vector null_map(lhs.size()); + + fmod_fast::vector_constant(lhs.data(), 0.0F, result.data(), null_map.data(), lhs.size()); + for (size_t i = 0; i < lhs.size(); ++i) { + ASSERT_EQ(1, null_map[i]) << i; + ASSERT_EQ(bits(-777.0F), bits(result[i])) << i; + } + + fmod_fast::vector_constant(lhs.data(), 0.125F, result.data(), null_map.data(), lhs.size()); + for (size_t i = 0; i < lhs.size(); ++i) { + ASSERT_EQ(0, null_map[i]) << i; + expect_same_float(result[i], reference_fmod(lhs[i], 0.125F), lhs[i], 0.125F); + } + + fmod_fast::constant_vector(12345.678F, rhs.data(), result.data(), null_map.data(), rhs.size()); + for (size_t i = 0; i < rhs.size(); ++i) { + uint8_t expected_null = rhs[i] == 0.0F; + ASSERT_EQ(expected_null, null_map[i]) << i; + float adjusted_rhs = rhs[i] + static_cast(expected_null); + expect_same_float(result[i], reference_fmod(12345.678F, adjusted_rhs), 12345.678F, + adjusted_rhs); + } +} + +} // namespace + +TEST(FunctionFmodFastTest, ScalarCornerCasesMatchStdFmod) { + check_scalar_corner_cases(); + check_scalar_corner_cases(); +} + +TEST(FunctionFmodFastTest, ActualLoadDistributionMatchesStdFmod) { + check_actual_load_distribution(); + check_actual_load_distribution(); +} + +TEST(FunctionFmodFastTest, RandomFiniteDistributionMatchesStdFmod) { + check_random_finite_distribution(); + check_random_finite_distribution(); +} + +TEST(FunctionFmodFastTest, BatchVectorVectorMatchesStdFmod) { + check_batch_vector_vector(); + check_batch_vector_vector(); +} + +TEST(FunctionFmodFastTest, BatchConstShapesMatchStdFmod) { + check_batch_const_shapes(); + check_batch_const_shapes(); +} + +TEST(FunctionFmodFastTest, DorisFunctionNullSemanticsStayUnchanged) { + InputTypeSet input_types = {PrimitiveType::TYPE_DOUBLE, PrimitiveType::TYPE_DOUBLE}; + DataSet data_set = { + {{5.5, 2.0}, reference_fmod(5.5, 2.0)}, + {{-5.5, 2.0}, reference_fmod(-5.5, 2.0)}, + {{5.5, -2.0}, reference_fmod(5.5, -2.0)}, + {{1.0, 0.0}, Null()}, + {{0.0, 0.0}, Null()}, + {{44'728'676'500.0, 0.9999998}, reference_fmod(44'728'676'500.0, 0.9999998)}}; + static_cast(check_function("fmod", input_types, data_set)); + static_cast(check_function("mod", input_types, data_set)); +} + +} // namespace doris