diff --git a/benchmarks/utf8_decode_benchmark.py b/benchmarks/utf8_decode_benchmark.py
new file mode 100644
index 0000000000..c520c5f4c6
--- /dev/null
+++ b/benchmarks/utf8_decode_benchmark.py
@@ -0,0 +1,327 @@
+# Copyright DataStax, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Benchmarks for UTF-8 and ASCII deserialization in the Cython row parser.
+
+This optimization replaces the two-step to_bytes(buf).decode('utf8') with
+a direct PyUnicode_DecodeUTF8(buf.ptr, buf.size, NULL) call, eliminating
+an intermediate bytes object allocation per text cell.
+
+Requires: pip install pytest-benchmark
+
+Run with: pytest benchmarks/utf8_decode_benchmark.py -v --benchmark-sort=name
+Compare before/after by running on master vs this branch.
+
+Correctness tests live in tests/unit/cython/test_deserializers.py.
+"""
+
+import struct
+import pytest
+
+from cassandra.obj_parser import ListParser
+from cassandra.bytesio import BytesIOReader
+from cassandra.parsing import ParseDesc
+from cassandra.deserializers import make_deserializers
+from cassandra.cqltypes import UTF8Type, AsciiType, Int32Type
+from cassandra.policies import ColDesc
+
+
+def _build_text_rows_buffer(num_rows, num_cols, text_data):
+    """Build the wire-format result-set body: num_rows x num_cols of text_data.
+
+    Format: [int32 row_count] [row1] [row2] ...
+    Each row: [cell1] [cell2] ...
+    Each cell: [int32 length] [data bytes]
+    """
+    parts = [struct.pack(">i", num_rows)]
+    cell = struct.pack(">i", len(text_data)) + text_data
+    row = cell * num_cols
+    parts.append(row * num_rows)
+    return b"".join(parts)
+
+
+def _build_mixed_rows_buffer(num_rows, text_data, int_value=42):
+    """Build num_rows rows of mixed cells: 3 text cells then 2 int32 cells."""
+    parts = [struct.pack(">i", num_rows)]
+    text_cell = struct.pack(">i", len(text_data)) + text_data
+    int_cell = struct.pack(">i", 4) + struct.pack(">i", int_value)
+    row = text_cell + text_cell + text_cell + int_cell + int_cell
+    parts.append(row * num_rows)
+    return b"".join(parts)
+
+
+def _make_text_desc(num_cols, protocol_version=4):
+    """Create a ParseDesc for num_cols UTF8Type (text) columns."""
+    coltypes = [UTF8Type] * num_cols
+    colnames = [f"col{i}" for i in range(num_cols)]
+    coldescs = [ColDesc("ks", "tbl", f"col{i}") for i in range(num_cols)]
+    desers = make_deserializers(coltypes)
+    return ParseDesc(colnames, coltypes, None, coldescs, desers, protocol_version)
+
+
+def _make_ascii_desc(num_cols, protocol_version=4):
+    """Create a ParseDesc for num_cols AsciiType columns."""
+    coltypes = [AsciiType] * num_cols
+    colnames = [f"col{i}" for i in range(num_cols)]
+    coldescs = [ColDesc("ks", "tbl", f"col{i}") for i in range(num_cols)]
+    desers = make_deserializers(coltypes)
+    return ParseDesc(colnames, coltypes, None, coldescs, desers, protocol_version)
+
+
+def _make_mixed_desc(protocol_version=4):
+    """Create a ParseDesc for 3 UTF8Type + 2 Int32Type columns."""
+    coltypes = [UTF8Type, UTF8Type, UTF8Type, Int32Type, Int32Type]
+    colnames = ["text0", "text1", "text2", "int0", "int1"]
+    coldescs = [ColDesc("ks", "tbl", n) for n in colnames]
+    desers = make_deserializers(coltypes)
+    return ParseDesc(colnames, coltypes, None, coldescs, desers, protocol_version)
+
+
+# ---------------------------------------------------------------------------
+# Cython pipeline benchmarks — UTF-8
+# 
---------------------------------------------------------------------------


class TestUTF8CythonPipeline:
    """Benchmark the full Cython row parsing pipeline with UTF-8 text columns.

    These benchmarks measure the end-to-end cost of parsing result sets
    through the optimized Cython path. The optimization replaces
    to_bytes(buf).decode('utf8') with PyUnicode_DecodeUTF8(buf.ptr, buf.size, NULL),
    eliminating one intermediate bytes allocation per text cell.
    """

    def test_bench_utf8_1row_1col_short(self, benchmark):
        """1 row x 1 col, short string (11 bytes) — isolates per-call overhead."""
        text = b"hello world"
        buf = _build_text_rows_buffer(1, 1, text)
        desc = _make_text_desc(1)
        parser = ListParser()

        def parse():
            reader = BytesIOReader(buf)
            return parser.parse_rows(reader, desc)

        result = benchmark(parse)
        assert len(result) == 1
        assert result[0][0] == "hello world"

    def test_bench_utf8_1row_10col_short(self, benchmark):
        """1 row x 10 cols, short strings — measures per-column overhead."""
        text = b"hello world"
        buf = _build_text_rows_buffer(1, 10, text)
        desc = _make_text_desc(10)
        parser = ListParser()

        def parse():
            reader = BytesIOReader(buf)
            return parser.parse_rows(reader, desc)

        result = benchmark(parse)
        assert len(result) == 1
        assert len(result[0]) == 10

    def test_bench_utf8_100rows_5col_medium(self, benchmark):
        """100 rows x 5 cols, medium string (46 bytes) — typical workload."""
        text = b"Hello, this is a test string for benchmarking!"
        buf = _build_text_rows_buffer(100, 5, text)
        desc = _make_text_desc(5)
        parser = ListParser()

        def parse():
            reader = BytesIOReader(buf)
            return parser.parse_rows(reader, desc)

        result = benchmark(parse)
        assert len(result) == 100
        assert result[0][0] == text.decode("utf8")

    def test_bench_utf8_1000rows_5col_medium(self, benchmark):
        """1000 rows x 5 cols, medium string — high-throughput scenario."""
        text = b"Hello, this is a test string for benchmarking!"
        buf = _build_text_rows_buffer(1000, 5, text)
        desc = _make_text_desc(5)
        parser = ListParser()

        def parse():
            reader = BytesIOReader(buf)
            return parser.parse_rows(reader, desc)

        result = benchmark(parse)
        assert len(result) == 1000

    def test_bench_utf8_100rows_5col_long(self, benchmark):
        """100 rows x 5 cols, long string (200 bytes) — larger values."""
        text = b"A" * 200
        buf = _build_text_rows_buffer(100, 5, text)
        desc = _make_text_desc(5)
        parser = ListParser()

        def parse():
            reader = BytesIOReader(buf)
            return parser.parse_rows(reader, desc)

        result = benchmark(parse)
        assert len(result) == 100
        assert result[0][0] == "A" * 200

    def test_bench_utf8_100rows_5col_multibyte(self, benchmark):
        """100 rows x 5 cols, multibyte UTF-8 string — tests non-ASCII."""
        text = "Héllo wörld! こんにちは 🌍".encode("utf-8")
        buf = _build_text_rows_buffer(100, 5, text)
        desc = _make_text_desc(5)
        parser = ListParser()

        def parse():
            reader = BytesIOReader(buf)
            return parser.parse_rows(reader, desc)

        result = benchmark(parse)
        assert len(result) == 100
        assert result[0][0] == text.decode("utf-8")


# ---------------------------------------------------------------------------
# Cython pipeline benchmarks — ASCII
# ---------------------------------------------------------------------------


class TestASCIICythonPipeline:
    """Benchmark the Cython row parsing pipeline with ASCII text columns."""

    def test_bench_ascii_100rows_5col_medium(self, benchmark):
        """100 rows x 5 cols, medium ASCII string."""
        text = b"Hello, this is a test ASCII string for benchmarking!"
        buf = _build_text_rows_buffer(100, 5, text)
        desc = _make_ascii_desc(5)
        parser = ListParser()

        def parse():
            reader = BytesIOReader(buf)
            return parser.parse_rows(reader, desc)

        result = benchmark(parse)
        assert len(result) == 100
        assert result[0][0] == text.decode("ascii")

    def test_bench_ascii_1000rows_5col_medium(self, benchmark):
        """1000 rows x 5 cols, medium ASCII string."""
        text = b"Hello, this is a test ASCII string for benchmarking!"
        buf = _build_text_rows_buffer(1000, 5, text)
        desc = _make_ascii_desc(5)
        parser = ListParser()

        def parse():
            reader = BytesIOReader(buf)
            return parser.parse_rows(reader, desc)

        result = benchmark(parse)
        assert len(result) == 1000


# ---------------------------------------------------------------------------
# Mixed columns benchmark
# ---------------------------------------------------------------------------


class TestMixedColumnsPipeline:
    """Benchmark with mixed column types (text + int) for realism."""

    def test_bench_mixed_100rows_3text_2int(self, benchmark):
        """100 rows x (3 text + 2 int) — realistic mixed schema."""
        text = b"Hello, this is a test string for benchmarking!"
        buf = _build_mixed_rows_buffer(100, text)
        desc = _make_mixed_desc()
        parser = ListParser()

        def parse():
            reader = BytesIOReader(buf)
            return parser.parse_rows(reader, desc)

        result = benchmark(parse)
        assert len(result) == 100
        assert result[0][0] == text.decode("utf8")
        assert result[0][3] == 42


# ---------------------------------------------------------------------------
# Python-level reference (bytes.decode) for comparison
# ---------------------------------------------------------------------------


class TestPythonDecodeReference:
    """Python-level microbenchmark showing the overhead of creating
    intermediate bytes objects before decode, which is what the
    original Cython code did (to_bytes(buf).decode('utf8')).

    These benchmarks isolate the bytes-creation overhead that the
    PyUnicode_DecodeUTF8 optimization eliminates.
    """

    def test_bench_python_bytes_decode_short(self, benchmark):
        """Python reference: bytes.decode('utf8') for 500 short strings."""
        data = b"hello world"

        def decode_loop():
            result = None
            for _ in range(500):
                result = data.decode("utf8")
            return result

        result = benchmark(decode_loop)
        assert result == "hello world"

    def test_bench_python_copy_then_decode_short(self, benchmark):
        """Python reference: bytes(data).decode('utf8') for 500 short strings.
        This simulates the old to_bytes(buf).decode() pattern, where
        to_bytes() creates a new bytes object from the C buffer."""
        data = b"hello world"
        mv = memoryview(data)

        def decode_loop():
            result = None
            for _ in range(500):
                copied = bytes(mv)  # simulates to_bytes(buf)
                result = copied.decode("utf8")
            return result

        result = benchmark(decode_loop)
        assert result == "hello world"

    def test_bench_python_bytes_decode_medium(self, benchmark):
        """Python reference: bytes.decode('utf8') for 500 medium strings."""
        data = b"Hello, this is a test string for benchmarking!"

        def decode_loop():
            result = None
            for _ in range(500):
                result = data.decode("utf8")
            return result

        assert benchmark(decode_loop) == data.decode("utf8")

    def test_bench_python_copy_then_decode_medium(self, benchmark):
        """Python reference: bytes(memoryview).decode('utf8') for 500 medium strings."""
        data = b"Hello, this is a test string for benchmarking!"
        mv = memoryview(data)

        def decode_loop():
            result = None
            for _ in range(500):
                copied = bytes(mv)  # simulates to_bytes(buf)
                result = copied.decode("utf8")
            return result

        assert benchmark(decode_loop) == data.decode("utf8")
diff --git a/cassandra/deserializers.pyx b/cassandra/deserializers.pyx
index 98e8676bbc..2ccc4ef093 100644
--- a/cassandra/deserializers.pyx
+++ b/cassandra/deserializers.pyx
@@ -14,6 +14,7 @@
 
 from libc.stdint cimport int32_t, uint16_t
+from cpython.unicode cimport PyUnicode_DecodeASCII, PyUnicode_DecodeUTF8
 
 include 'cython_marshal.pyx'
 from cassandra.buffer cimport Buffer, to_bytes, slice_buffer
 
@@ -88,7 +89,7 @@ cdef class DesAsciiType(Deserializer):
     cdef deserialize(self, Buffer *buf, int protocol_version):
         if buf.size == 0:
             return ""
-        return to_bytes(buf).decode('ascii')
+        return PyUnicode_DecodeASCII(buf.ptr, buf.size, NULL)
 
 
 cdef class DesFloatType(Deserializer):
@@ -173,8 +174,7 @@ cdef class DesUTF8Type(Deserializer):
     cdef deserialize(self, Buffer *buf, int protocol_version):
         if buf.size == 0:
             return ""
-        cdef val = to_bytes(buf)
-        return val.decode('utf8')
+        return PyUnicode_DecodeUTF8(buf.ptr, buf.size, NULL)
 
 
 cdef class DesVarcharType(DesUTF8Type):
diff --git a/tests/unit/cython/test_deserializers.py b/tests/unit/cython/test_deserializers.py
new file mode 100644
index 0000000000..fae6e50336
--- /dev/null
+++ b/tests/unit/cython/test_deserializers.py
@@ -0,0 +1,145 @@
+# Copyright DataStax, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Correctness tests for the Cython UTF-8 and ASCII deserializers.
+
+These verify that the optimized PyUnicode_DecodeUTF8/DecodeASCII code path
+in cassandra/deserializers.pyx produces correct results for edge cases.
+"""
+
+import struct
+import unittest
+
+try:
+    from cassandra.obj_parser import ListParser
+    from cassandra.bytesio import BytesIOReader
+    from cassandra.parsing import ParseDesc
+    from cassandra.deserializers import make_deserializers
+    from cassandra.cqltypes import UTF8Type, AsciiType
+    from cassandra.policies import ColDesc
+
+    HAS_CYTHON = True
+except ImportError:
+    HAS_CYTHON = False
+
+
+def _build_text_rows_buffer(num_rows, num_cols, text_data):
+    """Build the wire-format result-set body: num_rows x num_cols of text_data.
+
+    Format: [int32 row_count] [row1] [row2] ...
+    Each row: [cell1] [cell2] ...
+    Each cell: [int32 length] [data bytes]
+    """
+    parts = [struct.pack(">i", num_rows)]
+    cell = struct.pack(">i", len(text_data)) + text_data
+    row = cell * num_cols
+    parts.append(row * num_rows)
+    return b"".join(parts)
+
+
+def _make_text_desc(num_cols, protocol_version=4):
+    """Create a ParseDesc for num_cols UTF8Type (text) columns."""
+    coltypes = [UTF8Type] * num_cols
+    colnames = [f"col{i}" for i in range(num_cols)]
+    coldescs = [ColDesc("ks", "tbl", f"col{i}") for i in range(num_cols)]
+    desers = make_deserializers(coltypes)
+    return ParseDesc(colnames, coltypes, None, coldescs, desers, protocol_version)
+
+
+def _make_ascii_desc(num_cols, protocol_version=4):
+    """Create a ParseDesc for num_cols AsciiType columns."""
+    coltypes = [AsciiType] * num_cols
+    colnames = [f"col{i}" for i in range(num_cols)]
+    coldescs = [ColDesc("ks", "tbl", f"col{i}") for i in range(num_cols)]
+    desers = make_deserializers(coltypes)
+    return ParseDesc(colnames, coltypes, None, coldescs, desers, protocol_version)
+
+
+@unittest.skipUnless(HAS_CYTHON, "Cython extensions not available")
+class TestCythonDeserializerCorrectness(unittest.TestCase):
+    """Verify that the optimized Cython decode produces correct results."""
+
+    def test_utf8_empty_string(self):
+        """A zero-length (non-NULL) cell decodes to "" rather than None."""
+        buf = _build_text_rows_buffer(1, 1, b"")
+        desc = _make_text_desc(1)
+        parser = ListParser()
+        reader = BytesIOReader(buf)
+        rows = parser.parse_rows(reader, desc)
+        self.assertEqual(rows[0][0], "")
+
+    def test_utf8_ascii_only(self):
+        """Pure ASCII content through the UTF-8 deserializer."""
+        text = b"Hello, World! 12345"
+        buf = _build_text_rows_buffer(1, 1, text)
+        desc = _make_text_desc(1)
+        parser = ListParser()
+        reader = BytesIOReader(buf)
+        rows = parser.parse_rows(reader, desc)
+        self.assertEqual(rows[0][0], "Hello, World! 12345")
+
+    def test_utf8_multibyte(self):
+        """Multibyte UTF-8 characters (2-, 3- and 4-byte sequences)."""
+        text = "Héllo wörld! こんにちは 🌍".encode("utf-8")
+        buf = _build_text_rows_buffer(1, 1, text)
+        desc = _make_text_desc(1)
+        parser = ListParser()
+        reader = BytesIOReader(buf)
+        rows = parser.parse_rows(reader, desc)
+        self.assertEqual(rows[0][0], "Héllo wörld! こんにちは 🌍")
+
+    def test_utf8_long_string(self):
+        """Long string (10KB) exercises the decode path on large buffers."""
+        text = ("x" * 10000).encode("utf-8")
+        buf = _build_text_rows_buffer(1, 1, text)
+        desc = _make_text_desc(1)
+        parser = ListParser()
+        reader = BytesIOReader(buf)
+        rows = parser.parse_rows(reader, desc)
+        self.assertEqual(rows[0][0], "x" * 10000)
+
+    def test_ascii_basic(self):
+        """Basic ASCII decode through the AsciiType deserializer."""
+        text = b"Simple ASCII text 12345 !@#"
+        buf = _build_text_rows_buffer(1, 1, text)
+        desc = _make_ascii_desc(1)
+        parser = ListParser()
+        reader = BytesIOReader(buf)
+        rows = parser.parse_rows(reader, desc)
+        self.assertEqual(rows[0][0], "Simple ASCII text 12345 !@#")
+
+    def test_utf8_null_value(self):
+        """NULL value (negative length) should return None, not ""."""
+        # Build buffer: 1 row, 1 column with length = -1 (NULL)
+        buf = struct.pack(">i", 1) + struct.pack(">i", -1)
+        desc = _make_text_desc(1)
+        parser = ListParser()
+        reader = BytesIOReader(buf)
+        rows = parser.parse_rows(reader, desc)
+        self.assertIsNone(rows[0][0])
+
+    def test_utf8_multiple_rows_columns(self):
+        """Multiple rows (3 x 1 column) decode independently and in order."""
+        texts = [b"alpha", b"beta", b"gamma"]
+        # Build buffer with 3 rows x 1 col, different values
+        parts = [struct.pack(">i", 3)]
+        for t in texts:
+            parts.append(struct.pack(">i", len(t)) + t)
+        buf = b"".join(parts)
+        desc = _make_text_desc(1)
+        parser = ListParser()
+        reader = BytesIOReader(buf)
+        rows = parser.parse_rows(reader, desc)
+        self.assertEqual([r[0] for r in rows], ["alpha", "beta", "gamma"])