diff --git a/changes/3580.bugfix.md b/changes/3580.bugfix.md new file mode 100644 index 0000000000..48a492bd16 --- /dev/null +++ b/changes/3580.bugfix.md @@ -0,0 +1 @@ +Fix `ZipStore` leaving duplicate entries in the zip central directory when array metadata is written more than once (e.g. after `resize()`). The central directory is now deduplicated on `close()`, keeping only the most recent entry for each filename. diff --git a/src/zarr/storage/_zip.py b/src/zarr/storage/_zip.py index 72bf9e335a..cc6754a790 100644 --- a/src/zarr/storage/_zip.py +++ b/src/zarr/storage/_zip.py @@ -4,6 +4,7 @@ import shutil import threading import time +import warnings import zipfile from pathlib import Path from typing import TYPE_CHECKING, Any, Literal @@ -122,8 +123,36 @@ def close(self) -> None: # docstring inherited super().close() with self._lock: + self._dedup_central_directory() self._zf.close() + def _dedup_central_directory(self) -> None: + """Remove duplicate entries from the zip central directory. + + When an array is resized or metadata is updated multiple times, ZipStore + writes a new entry for the same key each time (ZIP files are append-only). + This leaves duplicate filenames in the central directory, which confuses + many zip readers and wastes space. Before closing, we keep only the + *last* (most recent) entry for every filename so that the on-disk central + directory is clean. + + ``NameToInfo`` is already a ``{filename: latest_ZipInfo}`` mapping because + each ``writestr``/``write`` call does ``NameToInfo[name] = zinfo``, so + later writes overwrite earlier ones. We therefore only need to rebuild + ``filelist`` from the values of ``NameToInfo``; ``NameToInfo`` itself + requires no update. + + The new list is computed before being assigned so that the ZipFile object + is never left in an inconsistent state if an exception is raised. + """ + if self._zf.mode not in ("w", "a", "x"): + return + # namelist() and getinfo() are public API. + # dict.fromkeys preserves first-seen order while deduplicating names; + # getinfo() returns the latest ZipInfo for each name (NameToInfo[name]). + unique_names = dict.fromkeys(self._zf.namelist()) + self._zf.filelist = [self._zf.getinfo(name) for name in unique_names] + async def clear(self) -> None: # docstring inherited with self._lock: @@ -205,7 +234,12 @@ def _set(self, key: str, value: Buffer) -> None: keyinfo.external_attr |= 0x10 # MS-DOS directory flag else: keyinfo.external_attr = 0o644 << 16 # ?rw-r--r-- - self._zf.writestr(keyinfo, value.to_bytes()) + # ZIP files are append-only; writing an existing key creates a second entry. + # We intentionally allow this and remove duplicates in _dedup_central_directory() + # when the store is closed. + with warnings.catch_warnings(): + warnings.simplefilter("ignore", UserWarning) + self._zf.writestr(keyinfo, value.to_bytes()) async def set(self, key: str, value: Buffer) -> None: # docstring inherited diff --git a/tests/test_store/test_zip.py b/tests/test_store/test_zip.py index 744ee82945..08560ed685 100644 --- a/tests/test_store/test_zip.py +++ b/tests/test_store/test_zip.py @@ -152,3 +152,28 @@ async def test_move(self, tmp_path: Path) -> None: assert destination.exists() assert not origin.exists() assert np.array_equal(array[...], np.arange(10)) + + def test_no_duplicate_entries_after_resize(self, tmp_path: Path) -> None: + # Regression test for https://github.com/zarr-developers/zarr-python/issues/3580 + # Resizing an array (which rewrites zarr.json) used to leave duplicate + # filenames in the zip central directory. + zip_path = tmp_path / "data.zip" + store = ZipStore(zip_path, mode="w") + arr = zarr.create_array(store, shape=(5,), chunks=(5,), dtype="i4") + arr[:] = np.arange(5) + + # Resize triggers metadata rewrite, producing a second zarr.json entry + arr.resize((10,)) + arr[5:] = np.arange(5, 10) + store.close() + + # Verify no duplicate filenames in the central directory + with zipfile.ZipFile(zip_path, "r") as zf: + names = [info.filename for info in zf.infolist()] + assert len(names) == len(set(names)), f"Duplicate entries found: {names}" + + # Verify data integrity + store2 = ZipStore(zip_path, mode="r") + arr2 = zarr.open_array(store2) + assert arr2.shape == (10,) + assert np.array_equal(arr2[:], np.arange(10))