21 commits
47730f3
perf: Skip bounds check for initial elements in 2^n hypercube
mkitti Feb 13, 2026
865df2a
lint: Use a list comprehension rather than a for loop
mkitti Feb 13, 2026
ce0099c
perf: Add decode_morton_vectorized
mkitti Feb 13, 2026
1b1076f
perf: Replace math.log2() with bit_length()
mkitti Feb 13, 2026
47a68eb
perf: Use magic numbers for 2D and 3D
mkitti Feb 13, 2026
6fb6d00
perf: Add 4D Morton magic numbers
mkitti Feb 13, 2026
db31842
perf: Add Morton magic numbers for 5D
mkitti Feb 13, 2026
f9952f1
perf: Remove singleton dimensions to reduce ndims
mkitti Feb 13, 2026
aedce5a
Add changes
mkitti Feb 13, 2026
ef18210
fix: Address type annotation and linting issues
mkitti Feb 13, 2026
24dcbd5
perf: Remove magic number functions
mkitti Feb 13, 2026
7b3db07
test: Add power of 2 sharding indexing tests
mkitti Feb 13, 2026
443b5d4
test: Add Morton order benchmarks with cache clearing
mkitti Feb 13, 2026
1cdcbdf
fix: Bound LRU cache of _morton_order to 16
mkitti Feb 13, 2026
536f520
Merge branch 'main' into mkitti-morton-decode-optimization
d-v-b Feb 13, 2026
65205b3
Merge branch 'main' into mkitti-morton-decode-optimization
d-v-b Feb 13, 2026
c872e2b
test: Add a single chunk test for a large shard
mkitti Feb 13, 2026
1cad983
test: Add indexing benchmarks for writing
mkitti Feb 16, 2026
a666211
tests: Add single chunk write test for sharding
mkitti Feb 16, 2026
6129cd3
perf: Vectorize get_chunk_slice for faster sharded writes
mkitti Feb 17, 2026
157d283
Merge branch 'main' into mkitti-get-chunk-slice-vectorization
d-v-b Feb 18, 2026
1 change: 1 addition & 0 deletions changes/3708.misc.md
@@ -0,0 +1 @@
Optimize Morton order computation with a power-of-two hypercube fast path, vectorized decoding, magic-number bit manipulation for 2D-5D, and singleton-dimension removal, providing a 10-45x speedup for typical chunk shapes.
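
For background, the 2D "magic numbers" are the standard Morton de-interleave mask constants. A minimal 32-bit sketch of the 2D case (illustrative only, not the code that landed; a later commit in this PR removed the dedicated magic-number helpers):

def _compact_bits_2d(v: int) -> int:
    # Gather the even-indexed bits of v into the low bits,
    # using the standard 32-bit Morton mask constants.
    v &= 0x55555555
    v = (v | (v >> 1)) & 0x33333333
    v = (v | (v >> 2)) & 0x0F0F0F0F
    v = (v | (v >> 4)) & 0x00FF00FF
    v = (v | (v >> 8)) & 0x0000FFFF
    return v

def decode_morton_2d(z: int) -> tuple[int, int]:
    # Dimension 0 occupies the even bits of z, dimension 1 the odd bits.
    return _compact_bits_2d(z), _compact_bits_2d(z >> 1)

assert [decode_morton_2d(z) for z in range(4)] == [(0, 0), (1, 0), (0, 1), (1, 1)]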
1 change: 1 addition & 0 deletions changes/3713.misc.md
@@ -0,0 +1 @@
Vectorize get_chunk_slice for faster sharded array writes.
74 changes: 73 additions & 1 deletion src/zarr/codecs/sharding.py
@@ -46,6 +46,7 @@
from zarr.core.indexing import (
BasicIndexer,
SelectorTuple,
_morton_order,
c_order_iter,
get_indexer,
morton_order_iter,
@@ -138,6 +139,45 @@ def get_chunk_slice(self, chunk_coords: tuple[int, ...]) -> tuple[int, int] | None:
else:
return (int(chunk_start), int(chunk_start + chunk_len))

def get_chunk_slices_vectorized(
self, chunk_coords_array: npt.NDArray[np.uint64]
) -> tuple[npt.NDArray[np.uint64], npt.NDArray[np.uint64], npt.NDArray[np.bool_]]:
"""Get chunk slices for multiple coordinates at once.

Parameters
----------
chunk_coords_array : ndarray of shape (n_chunks, n_dims)
Array of chunk coordinates to look up.

Returns
-------
starts : ndarray of shape (n_chunks,)
Start byte positions for each chunk.
ends : ndarray of shape (n_chunks,)
End byte positions for each chunk.
valid : ndarray of shape (n_chunks,)
Boolean mask indicating which chunks are non-empty.
"""
# Localize coordinates via modulo (vectorized)
shard_shape = np.array(self.offsets_and_lengths.shape[:-1], dtype=np.uint64)
localized = chunk_coords_array % shard_shape

# Build index tuple for advanced indexing
index_tuple = tuple(localized[:, i] for i in range(localized.shape[1]))

# Fetch all offsets and lengths at once
offsets_and_lengths = self.offsets_and_lengths[index_tuple]
starts = offsets_and_lengths[:, 0]
lengths = offsets_and_lengths[:, 1]

# Check for valid (non-empty) chunks
valid = starts != MAX_UINT_64

# Compute end positions (entries for empty chunks wrap around and are masked by valid)
ends = starts + lengths

return starts, ends, valid
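
As a standalone illustration of the lookup pattern above, a minimal NumPy sketch with a hypothetical 2x2 shard index holding one non-empty chunk (MAX_UINT_64 is the sentinel for empty entries, as in this file):

import numpy as np

MAX_UINT_64 = 2**64 - 1

# offsets_and_lengths has shape (*shard_shape, 2); the last axis is (offset, length).
offsets_and_lengths = np.full((2, 2, 2), MAX_UINT_64, dtype=np.uint64)
offsets_and_lengths[0, 1] = (100, 50)  # one non-empty chunk at (0, 1)

coords = np.array([[0, 0], [0, 1]], dtype=np.uint64)
localized = coords % np.array([2, 2], dtype=np.uint64)
index_tuple = tuple(localized[:, i] for i in range(localized.shape[1]))

pairs = offsets_and_lengths[index_tuple]  # shape (n_chunks, 2)
starts, lengths = pairs[:, 0], pairs[:, 1]
valid = starts != MAX_UINT_64             # array([False,  True])
ends = starts + lengths                   # wraps for empty entries; masked by valid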

def set_chunk_slice(self, chunk_coords: tuple[int, ...], chunk_slice: slice | None) -> None:
localized_chunk = self._localize_chunk(chunk_coords)
if chunk_slice is None:
@@ -219,6 +259,35 @@ def __len__(self) -> int:
def __iter__(self) -> Iterator[tuple[int, ...]]:
return c_order_iter(self.index.offsets_and_lengths.shape[:-1])

def to_dict_vectorized(
self,
chunk_coords_array: npt.NDArray[np.uint64],
chunk_coords_tuples: tuple[tuple[int, ...], ...],
) -> dict[tuple[int, ...], Buffer | None]:
"""Build a dict of chunk coordinates to buffers using vectorized lookup.

Parameters
----------
chunk_coords_array : ndarray of shape (n_chunks, n_dims)
Array of chunk coordinates for vectorized index lookup.
chunk_coords_tuples : tuple of tuples
The same coordinates as tuples, used as dict keys to avoid conversion.

Returns
-------
dict mapping chunk coordinate tuples to Buffer or None
"""
starts, ends, valid = self.index.get_chunk_slices_vectorized(chunk_coords_array)

result: dict[tuple[int, ...], Buffer | None] = {}
for i, coords in enumerate(chunk_coords_tuples):
if valid[i]:
result[coords] = self.buf[int(starts[i]) : int(ends[i])]
else:
result[coords] = None

return result


@dataclass(frozen=True)
class ShardingCodec(
@@ -505,7 +574,10 @@ async def _encode_partial_single(
chunks_per_shard=chunks_per_shard,
)
shard_reader = shard_reader or _ShardReader.create_empty(chunks_per_shard)
shard_dict = {k: shard_reader.get(k) for k in morton_order_iter(chunks_per_shard)}
# Use vectorized lookup for better performance
morton_coords = _morton_order(chunks_per_shard)
chunk_coords_array = np.array(morton_coords, dtype=np.uint64)
shard_dict = shard_reader.to_dict_vectorized(chunk_coords_array, morton_coords)
Contributor comment:
why does to_dict_vectorized take the same values as 2 different types? Can't we pass in morton_coords and have the array version created internally as an implementation detail?
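
One reading of this suggestion, sketched as a hypothetical alternative (not the PR's code): accept only the tuples and build the ndarray internally, reusing the PR's get_chunk_slices_vectorized:

def to_dict_vectorized(
    self, chunk_coords: tuple[tuple[int, ...], ...]
) -> dict[tuple[int, ...], Buffer | None]:
    # The ndarray form becomes an internal implementation detail.
    coords_array = np.array(chunk_coords, dtype=np.uint64)
    starts, ends, valid = self.index.get_chunk_slices_vectorized(coords_array)
    return {
        coords: (self.buf[int(starts[i]) : int(ends[i])] if valid[i] else None)
        for i, coords in enumerate(chunk_coords)
    }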


indexer = list(
get_indexer(
94 changes: 90 additions & 4 deletions src/zarr/core/indexing.py
@@ -1452,7 +1452,7 @@ def make_slice_selection(selection: Any) -> list[slice]:
def decode_morton(z: int, chunk_shape: tuple[int, ...]) -> tuple[int, ...]:
# Inspired by compressed morton code as implemented in Neuroglancer
# https://github.com/google/neuroglancer/blob/master/src/neuroglancer/datasource/precomputed/volume.md#compressed-morton-code
bits = tuple(math.ceil(math.log2(c)) for c in chunk_shape)
bits = tuple((c - 1).bit_length() for c in chunk_shape)
max_coords_bits = max(bits)
input_bit = 0
input_value = z
@@ -1467,16 +1467,102 @@ def decode_morton(z: int, chunk_shape: tuple[int, ...]) -> tuple[int, ...]:
return tuple(out)
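
For positive c, (c - 1).bit_length() equals math.ceil(math.log2(c)) while staying in exact integer arithmetic (no float rounding). A quick illustrative check (assumption: not part of the PR's test suite):

import math

for c in (1, 2, 3, 5, 8, 1000, 4096):
    assert (c - 1).bit_length() == (math.ceil(math.log2(c)) if c > 1 else 0)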


def decode_morton_vectorized(
z: npt.NDArray[np.intp], chunk_shape: tuple[int, ...]
) -> npt.NDArray[np.intp]:
"""Vectorized Morton code decoding for multiple z values.

Parameters
----------
z : ndarray
1D array of Morton codes to decode.
chunk_shape : tuple of int
Shape defining the coordinate space.

Returns
-------
ndarray
2D array of shape (len(z), len(chunk_shape)) containing decoded coordinates.
"""
n_dims = len(chunk_shape)
bits = tuple((c - 1).bit_length() for c in chunk_shape)

max_coords_bits = max(bits) if bits else 0
out = np.zeros((len(z), n_dims), dtype=np.intp)

input_bit = 0
for coord_bit in range(max_coords_bits):
for dim in range(n_dims):
if coord_bit < bits[dim]:
# Extract bit at position input_bit from all z values
bit_values = (z >> input_bit) & 1
# Place bit at coord_bit position in dimension dim
out[:, dim] |= bit_values << coord_bit
input_bit += 1

return out
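
For example, decoding all four codes of a 2x2 space (dimension 0 receives the lowest interleaved bit; numpy is imported as np in this module):

z = np.arange(4, dtype=np.intp)
decode_morton_vectorized(z, (2, 2))
# array([[0, 0],
#        [1, 0],
#        [0, 1],
#        [1, 1]])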


@lru_cache(maxsize=16)
def _morton_order(chunk_shape: tuple[int, ...]) -> tuple[tuple[int, ...], ...]:
n_total = product(chunk_shape)
if n_total == 0:
return ()

# Optimization: Remove singleton dimensions to enable magic number usage
# for shapes like (1,1,32,32,32). Compute Morton on squeezed shape, then expand.
singleton_dims = tuple(i for i, s in enumerate(chunk_shape) if s == 1)
if singleton_dims:
squeezed_shape = tuple(s for s in chunk_shape if s != 1)
if squeezed_shape:
# Compute Morton order on squeezed shape
squeezed_order = _morton_order(squeezed_shape)
# Expand coordinates to include singleton dimensions (always 0)
expanded: list[tuple[int, ...]] = []
for coord in squeezed_order:
full_coord: list[int] = []
squeezed_idx = 0
for i in range(len(chunk_shape)):
if chunk_shape[i] == 1:
full_coord.append(0)
else:
full_coord.append(coord[squeezed_idx])
squeezed_idx += 1
expanded.append(tuple(full_coord))
return tuple(expanded)
else:
# All dimensions are singletons, just return the single point
return ((0,) * len(chunk_shape),)

n_dims = len(chunk_shape)

# Find the largest power-of-2 hypercube that fits within chunk_shape.
# Within this hypercube, Morton codes are guaranteed to be in bounds.
min_dim = min(chunk_shape)
if min_dim >= 1:
power = min_dim.bit_length() - 1 # floor(log2(min_dim))
hypercube_size = 1 << power # 2^power
n_hypercube = hypercube_size**n_dims
else:
n_hypercube = 0

# Within the hypercube, no bounds checking needed - use vectorized decoding
order: list[tuple[int, ...]]
if n_hypercube > 0:
z_values = np.arange(n_hypercube, dtype=np.intp)
hypercube_coords = decode_morton_vectorized(z_values, chunk_shape)
order = [tuple(row) for row in hypercube_coords]
else:
order = []

# For remaining elements, bounds checking is needed
i = n_hypercube
while len(order) < n_total:
m = decode_morton(i, chunk_shape)
if all(x < y for x, y in zip(m, chunk_shape, strict=False)):
order.append(m)
i += 1

return tuple(order)
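
Expected outputs for a small shape and for one with a leading singleton (the singleton axis is squeezed out, decoded, then re-expanded as zeros) — illustrative, not part of the diff:

_morton_order((2, 2))
# ((0, 0), (1, 0), (0, 1), (1, 1))

_morton_order((1, 2, 2))
# ((0, 0, 0), (0, 1, 0), (0, 0, 1), (0, 1, 1))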

