21 commits
47730f3
perf: Skip bounds check for initial elements in 2^n hypercube
mkitti Feb 13, 2026
865df2a
lint: Use a list comprehension rather than a for loop
mkitti Feb 13, 2026
ce0099c
perf: Add decode_morton_vectorized
mkitti Feb 13, 2026
1b1076f
perf: Replace math.log2() with bit_length()
mkitti Feb 13, 2026
47a68eb
perf: Use magic numbers for 2D and 3D
mkitti Feb 13, 2026
6fb6d00
perf: Add 4D Morton magic numbers
mkitti Feb 13, 2026
db31842
perf: Add Morton magic numbers for 5D
mkitti Feb 13, 2026
f9952f1
perf: Remove singleton dimensions to reduce ndims
mkitti Feb 13, 2026
aedce5a
Add changes
mkitti Feb 13, 2026
ef18210
fix: Address type annotation and linting issues
mkitti Feb 13, 2026
24dcbd5
perf: Remove magic number functions
mkitti Feb 13, 2026
7b3db07
test: Add power of 2 sharding indexing tests
mkitti Feb 13, 2026
443b5d4
test: Add Morton order benchmarks with cache clearing
mkitti Feb 13, 2026
1cdcbdf
fix: Bound LRU cache of _morton_order to 16
mkitti Feb 13, 2026
536f520
Merge branch 'main' into mkitti-morton-decode-optimization
d-v-b Feb 13, 2026
65205b3
Merge branch 'main' into mkitti-morton-decode-optimization
d-v-b Feb 13, 2026
c872e2b
test: Add a single chunk test for a large shard
mkitti Feb 13, 2026
1cad983
test: Add indexing benchmarks for writing
mkitti Feb 16, 2026
a666211
tests: Add single chunk write test for sharding
mkitti Feb 16, 2026
6129cd3
perf: Vectorize get_chunk_slice for faster sharded writes
mkitti Feb 17, 2026
157d283
Merge branch 'main' into mkitti-get-chunk-slice-vectorization
d-v-b Feb 18, 2026
1 change: 1 addition & 0 deletions changes/3708.misc.md
@@ -0,0 +1 @@
Optimize Morton order computation with a power-of-two hypercube fast path, vectorized decoding, magic-number bit manipulation for 2D-5D, and singleton-dimension removal, providing a 10-45x speedup for typical chunk shapes.
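
For background, the 2D "magic numbers" are the standard Morton de-interleave mask constants. A minimal 32-bit sketch of the 2D case (illustrative only, not the code that landed; a later commit in this PR removed the dedicated magic-number helpers):

def _compact_bits_2d(v: int) -> int:
    # Gather the even-indexed bits of v into the low bits,
    # using the standard 32-bit Morton mask constants.
    v &= 0x55555555
    v = (v | (v >> 1)) & 0x33333333
    v = (v | (v >> 2)) & 0x0F0F0F0F
    v = (v | (v >> 4)) & 0x00FF00FF
    v = (v | (v >> 8)) & 0x0000FFFF
    return v

def decode_morton_2d(z: int) -> tuple[int, int]:
    # Dimension 0 occupies the even bits of z, dimension 1 the odd bits.
    return _compact_bits_2d(z), _compact_bits_2d(z >> 1)

assert [decode_morton_2d(z) for z in range(4)] == [(0, 0), (1, 0), (0, 1), (1, 1)]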
1 change: 1 addition & 0 deletions changes/3713.misc.md
@@ -0,0 +1 @@
Vectorize get_chunk_slice for faster sharded array writes.
74 changes: 73 additions & 1 deletion src/zarr/codecs/sharding.py
@@ -46,6 +46,7 @@
from zarr.core.indexing import (
BasicIndexer,
SelectorTuple,
_morton_order,
c_order_iter,
get_indexer,
morton_order_iter,
@@ -138,6 +139,45 @@ def get_chunk_slice(self, chunk_coords: tuple[int, ...]) -> tuple[int, int] | None:
else:
return (int(chunk_start), int(chunk_start + chunk_len))

def get_chunk_slices_vectorized(
self, chunk_coords_array: npt.NDArray[np.uint64]
) -> tuple[npt.NDArray[np.uint64], npt.NDArray[np.uint64], npt.NDArray[np.bool_]]:
"""Get chunk slices for multiple coordinates at once.

Parameters
----------
chunk_coords_array : ndarray of shape (n_chunks, n_dims)
Array of chunk coordinates to look up.

Returns
-------
starts : ndarray of shape (n_chunks,)
Start byte positions for each chunk.
ends : ndarray of shape (n_chunks,)
End byte positions for each chunk.
valid : ndarray of shape (n_chunks,)
Boolean mask indicating which chunks are non-empty.
"""
# Localize coordinates via modulo (vectorized)
shard_shape = np.array(self.offsets_and_lengths.shape[:-1], dtype=np.uint64)
localized = chunk_coords_array % shard_shape

# Build index tuple for advanced indexing
index_tuple = tuple(localized[:, i] for i in range(localized.shape[1]))

# Fetch all offsets and lengths at once
offsets_and_lengths = self.offsets_and_lengths[index_tuple]
starts = offsets_and_lengths[:, 0]
lengths = offsets_and_lengths[:, 1]

# Check for valid (non-empty) chunks
valid = starts != MAX_UINT_64

# Compute end positions (entries for empty chunks wrap around and are masked by valid)
ends = starts + lengths

return starts, ends, valid
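
As a standalone illustration of the lookup pattern above, a minimal NumPy sketch with a hypothetical 2x2 shard index holding one non-empty chunk (MAX_UINT_64 is the sentinel for empty entries, as in this file):

import numpy as np

MAX_UINT_64 = 2**64 - 1

# offsets_and_lengths has shape (*shard_shape, 2); the last axis is (offset, length).
offsets_and_lengths = np.full((2, 2, 2), MAX_UINT_64, dtype=np.uint64)
offsets_and_lengths[0, 1] = (100, 50)  # one non-empty chunk at (0, 1)

coords = np.array([[0, 0], [0, 1]], dtype=np.uint64)
localized = coords % np.array([2, 2], dtype=np.uint64)
index_tuple = tuple(localized[:, i] for i in range(localized.shape[1]))

pairs = offsets_and_lengths[index_tuple]  # shape (n_chunks, 2)
starts, lengths = pairs[:, 0], pairs[:, 1]
valid = starts != MAX_UINT_64             # array([False,  True])
ends = starts + lengths                   # wraps for empty entries; masked by valid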

def set_chunk_slice(self, chunk_coords: tuple[int, ...], chunk_slice: slice | None) -> None:
localized_chunk = self._localize_chunk(chunk_coords)
if chunk_slice is None:
@@ -219,6 +259,35 @@ def __len__(self) -> int:
def __iter__(self) -> Iterator[tuple[int, ...]]:
return c_order_iter(self.index.offsets_and_lengths.shape[:-1])

def to_dict_vectorized(
self,
chunk_coords_array: npt.NDArray[np.uint64],
chunk_coords_tuples: tuple[tuple[int, ...], ...],
) -> dict[tuple[int, ...], Buffer | None]:
"""Build a dict of chunk coordinates to buffers using vectorized lookup.

Parameters
----------
chunk_coords_array : ndarray of shape (n_chunks, n_dims)
Array of chunk coordinates for vectorized index lookup.
chunk_coords_tuples : tuple of tuples
The same coordinates as tuples, used as dict keys to avoid conversion.

Returns
-------
dict mapping chunk coordinate tuples to Buffer or None
"""
starts, ends, valid = self.index.get_chunk_slices_vectorized(chunk_coords_array)

result: dict[tuple[int, ...], Buffer | None] = {}
for i, coords in enumerate(chunk_coords_tuples):
if valid[i]:
result[coords] = self.buf[int(starts[i]) : int(ends[i])]
else:
result[coords] = None

return result


@dataclass(frozen=True)
class ShardingCodec(
@@ -505,7 +574,10 @@ async def _encode_partial_single(
chunks_per_shard=chunks_per_shard,
)
shard_reader = shard_reader or _ShardReader.create_empty(chunks_per_shard)
shard_dict = {k: shard_reader.get(k) for k in morton_order_iter(chunks_per_shard)}
# Use vectorized lookup for better performance
morton_coords = _morton_order(chunks_per_shard)
chunk_coords_array = np.array(morton_coords, dtype=np.uint64)
shard_dict = shard_reader.to_dict_vectorized(chunk_coords_array, morton_coords)
Contributor comment:
why does to_dict_vectorized take the same values as 2 different types? Can't we pass in morton_coords and have the array version created internally as an implementation detail?
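
One reading of this suggestion, sketched as a hypothetical alternative (not the PR's code): accept only the tuples and build the ndarray internally, reusing the PR's get_chunk_slices_vectorized:

def to_dict_vectorized(
    self, chunk_coords: tuple[tuple[int, ...], ...]
) -> dict[tuple[int, ...], Buffer | None]:
    # The ndarray form becomes an internal implementation detail.
    coords_array = np.array(chunk_coords, dtype=np.uint64)
    starts, ends, valid = self.index.get_chunk_slices_vectorized(coords_array)
    return {
        coords: (self.buf[int(starts[i]) : int(ends[i])] if valid[i] else None)
        for i, coords in enumerate(chunk_coords)
    }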


indexer = list(
get_indexer(
94 changes: 90 additions & 4 deletions src/zarr/core/indexing.py
@@ -1452,7 +1452,7 @@ def make_slice_selection(selection: Any) -> list[slice]:
def decode_morton(z: int, chunk_shape: tuple[int, ...]) -> tuple[int, ...]:
# Inspired by compressed morton code as implemented in Neuroglancer
# https://github.com/google/neuroglancer/blob/master/src/neuroglancer/datasource/precomputed/volume.md#compressed-morton-code
bits = tuple(math.ceil(math.log2(c)) for c in chunk_shape)
bits = tuple((c - 1).bit_length() for c in chunk_shape)
max_coords_bits = max(bits)
input_bit = 0
input_value = z
@@ -1467,16 +1467,102 @@ def decode_morton(z: int, chunk_shape: tuple[int, ...]) -> tuple[int, ...]:
return tuple(out)
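
For positive c, (c - 1).bit_length() equals math.ceil(math.log2(c)) while staying in exact integer arithmetic (no float rounding). A quick illustrative check (assumption: not part of the PR's test suite):

import math

for c in (1, 2, 3, 5, 8, 1000, 4096):
    assert (c - 1).bit_length() == (math.ceil(math.log2(c)) if c > 1 else 0)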


def decode_morton_vectorized(
z: npt.NDArray[np.intp], chunk_shape: tuple[int, ...]
) -> npt.NDArray[np.intp]:
"""Vectorized Morton code decoding for multiple z values.

Parameters
----------
z : ndarray
1D array of Morton codes to decode.
chunk_shape : tuple of int
Shape defining the coordinate space.

Returns
-------
ndarray
2D array of shape (len(z), len(chunk_shape)) containing decoded coordinates.
"""
n_dims = len(chunk_shape)
bits = tuple((c - 1).bit_length() for c in chunk_shape)

max_coords_bits = max(bits) if bits else 0
out = np.zeros((len(z), n_dims), dtype=np.intp)

input_bit = 0
for coord_bit in range(max_coords_bits):
for dim in range(n_dims):
if coord_bit < bits[dim]:
# Extract bit at position input_bit from all z values
bit_values = (z >> input_bit) & 1
# Place bit at coord_bit position in dimension dim
out[:, dim] |= bit_values << coord_bit
input_bit += 1

return out
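
For example, decoding all four codes of a 2x2 space (dimension 0 receives the lowest interleaved bit; numpy is imported as np in this module):

z = np.arange(4, dtype=np.intp)
decode_morton_vectorized(z, (2, 2))
# array([[0, 0],
#        [1, 0],
#        [0, 1],
#        [1, 1]])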


@lru_cache(maxsize=16)
def _morton_order(chunk_shape: tuple[int, ...]) -> tuple[tuple[int, ...], ...]:
n_total = product(chunk_shape)
if n_total == 0:
return ()

# Optimization: Remove singleton dimensions to enable magic number usage
# for shapes like (1,1,32,32,32). Compute Morton on squeezed shape, then expand.
singleton_dims = tuple(i for i, s in enumerate(chunk_shape) if s == 1)
if singleton_dims:
squeezed_shape = tuple(s for s in chunk_shape if s != 1)
if squeezed_shape:
# Compute Morton order on squeezed shape
squeezed_order = _morton_order(squeezed_shape)
# Expand coordinates to include singleton dimensions (always 0)
expanded: list[tuple[int, ...]] = []
for coord in squeezed_order:
full_coord: list[int] = []
squeezed_idx = 0
for i in range(len(chunk_shape)):
if chunk_shape[i] == 1:
full_coord.append(0)
else:
full_coord.append(coord[squeezed_idx])
squeezed_idx += 1
expanded.append(tuple(full_coord))
return tuple(expanded)
else:
# All dimensions are singletons, just return the single point
return ((0,) * len(chunk_shape),)

n_dims = len(chunk_shape)

# Find the largest power-of-2 hypercube that fits within chunk_shape.
# Within this hypercube, Morton codes are guaranteed to be in bounds.
min_dim = min(chunk_shape)
if min_dim >= 1:
power = min_dim.bit_length() - 1 # floor(log2(min_dim))
hypercube_size = 1 << power # 2^power
n_hypercube = hypercube_size**n_dims
else:
n_hypercube = 0

# Within the hypercube, no bounds checking needed - use vectorized decoding
order: list[tuple[int, ...]]
if n_hypercube > 0:
z_values = np.arange(n_hypercube, dtype=np.intp)
hypercube_coords = decode_morton_vectorized(z_values, chunk_shape)
order = [tuple(row) for row in hypercube_coords]
else:
order = []

# For remaining elements, bounds checking is needed
i = n_hypercube
while len(order) < n_total:
m = decode_morton(i, chunk_shape)
if all(x < y for x, y in zip(m, chunk_shape, strict=False)):
order.append(m)
i += 1

return tuple(order)
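
Expected outputs for a small shape and for one with a leading singleton (the singleton axis is squeezed out, decoded, then re-expanded as zeros) — illustrative, not part of the diff:

_morton_order((2, 2))
# ((0, 0), (1, 0), (0, 1), (1, 1))

_morton_order((1, 2, 2))
# ((0, 0, 0), (0, 1, 0), (0, 0, 1), (0, 1, 1))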

