Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion pyiceberg/table/snapshots.py
Original file line number Diff line number Diff line change
Expand Up @@ -344,7 +344,7 @@ def _partition_summary(self, update_metrics: UpdateMetrics) -> str:


def update_snapshot_summaries(summary: Summary, previous_summary: Mapping[str, str] | None = None) -> Summary:
if summary.operation not in {Operation.APPEND, Operation.OVERWRITE, Operation.DELETE}:
if summary.operation not in {Operation.APPEND, Operation.OVERWRITE, Operation.DELETE, Operation.REPLACE}:
raise ValueError(f"Operation not implemented: {summary.operation}")

if not previous_summary:
Expand Down
98 changes: 98 additions & 0 deletions pyiceberg/table/update/snapshot.py
Original file line number Diff line number Diff line change
Expand Up @@ -667,6 +667,96 @@ def _get_entries(manifest: ManifestFile) -> list[ManifestEntry]:
return []


class _RewriteFiles(_SnapshotProducer["_RewriteFiles"]):
    """Snapshot producer that swaps existing data files for rewritten ones.

    A commit is rejected when a file marked for deletion is not found in the
    parent snapshot's manifests, or when the added files hold more records
    than the files being removed.
    """

    def __init__(
        self,
        operation: Operation,
        transaction: Transaction,
        io: FileIO,
        snapshot_properties: dict[str, str],
    ):
        super().__init__(operation, transaction, io, snapshot_properties=snapshot_properties)

    def _commit(self) -> UpdatesAndRequirements:
        """Validate the rewrite, then delegate to the parent class' ``_commit``.

        Returns:
            A pair of empty tuples when no files were added or deleted,
            otherwise the result of the parent class' ``_commit``.

        Raises:
            ValueError: If a file to delete was not found in the table, or if
                the added files contain more records than the removed ones.
        """
        # Guard clause: nothing was added or deleted, so there is no commit to make.
        if not (self._deleted_data_files or self._added_data_files):
            return (), ()

        # Only entries that were actually located in the manifests come back here.
        removed_entries = self._deleted_entries()
        removed_files = {removed.data_file for removed in removed_entries}

        # A size mismatch means at least one requested file was not found.
        if len(removed_files) != len(self._deleted_data_files):
            raise ValueError("Cannot delete files that are not present in the table")

        added_records = sum(data_file.record_count for data_file in self._added_data_files)
        deleted_records = sum(removed.data_file.record_count for removed in removed_entries)

        if added_records > deleted_records:
            raise ValueError(f"Invalid replace: records added ({added_records}) exceeds records removed ({deleted_records})")

        return super()._commit()

    def _deleted_entries(self) -> list[ManifestEntry]:
        """Build DELETED manifest entries for the files this rewrite removes.

        Returns:
            One DELETED entry per data-content entry of the parent snapshot
            whose data file is in ``self._deleted_data_files``; an empty list
            when there is no parent snapshot id.

        Raises:
            ValueError: If the parent snapshot id cannot be resolved.
        """
        if self._parent_snapshot_id is None:
            return []

        parent_snapshot = self._transaction.table_metadata.snapshot_by_id(self._parent_snapshot_id)
        if parent_snapshot is None:
            raise ValueError(f"Could not find the previous snapshot: {self._parent_snapshot_id}")

        def _mark_deleted(manifest: ManifestFile) -> list[ManifestEntry]:
            # Re-emit each matching entry with DELETED status under this snapshot's id,
            # carrying over the original sequence numbers.
            marked = []
            for entry in manifest.fetch_manifest_entry(self._io, discard_deleted=True):
                if entry.data_file.content == DataFileContent.DATA and entry.data_file in self._deleted_data_files:
                    marked.append(
                        ManifestEntry.from_args(
                            status=ManifestEntryStatus.DELETED,
                            snapshot_id=self.snapshot_id,
                            sequence_number=entry.sequence_number,
                            file_sequence_number=entry.file_sequence_number,
                            data_file=entry.data_file,
                        )
                    )
            return marked

        # Each manifest is processed through the shared executor; results are flattened.
        per_manifest = ExecutorFactory.get_or_create().map(_mark_deleted, parent_snapshot.manifests(self._io))
        return list(itertools.chain.from_iterable(per_manifest))

    def _existing_manifests(self) -> list[ManifestFile]:
        """Return the manifests of the target branch that survive this rewrite.

        A manifest containing none of the deleted files is reused unchanged.
        A manifest containing some of them is rewritten with the remaining
        entries marked EXISTING. A manifest whose fetched entries are all
        deleted is left out.
        """
        carried_over = []

        branch_snapshot = self._transaction.table_metadata.snapshot_by_name(name=self._target_branch)
        if not branch_snapshot:
            return carried_over

        for manifest_file in branch_snapshot.manifests(io=self._io):
            kept: set[ManifestEntry] = set()
            dropped: set[ManifestEntry] = set()

            # Partition this manifest's entries into kept vs. dropped.
            for entry in manifest_file.fetch_manifest_entry(io=self._io, discard_deleted=True):
                bucket = dropped if entry.data_file in self._deleted_data_files else kept
                bucket.add(entry)

            if not dropped:
                # Untouched by this rewrite: keep the manifest file itself.
                carried_over.append(manifest_file)
            elif kept:
                # Partially affected: write a new manifest holding only the survivors.
                with self.new_manifest_writer(self.spec(manifest_file.partition_spec_id)) as writer:
                    for survivor in kept:
                        writer.add_entry(
                            ManifestEntry.from_args(
                                status=ManifestEntryStatus.EXISTING,
                                snapshot_id=survivor.snapshot_id,
                                sequence_number=survivor.sequence_number,
                                file_sequence_number=survivor.file_sequence_number,
                                data_file=survivor.data_file,
                            )
                        )
                carried_over.append(writer.to_manifest_file())
            # Otherwise every fetched entry was dropped, so the manifest is omitted.

        return carried_over


class UpdateSnapshot:
_transaction: Transaction
_io: FileIO
Expand Down Expand Up @@ -724,6 +814,14 @@ def delete(self) -> _DeleteFiles:
snapshot_properties=self._snapshot_properties,
)

def replace(self) -> _RewriteFiles:
    """Create a ``_RewriteFiles`` producer for an ``Operation.REPLACE`` snapshot.

    The producer is given this object's transaction, file IO and snapshot
    properties.
    """
    rewrite_producer = _RewriteFiles(
        operation=Operation.REPLACE,
        transaction=self._transaction,
        io=self._io,
        snapshot_properties=self._snapshot_properties,
    )
    return rewrite_producer


class _ManifestMergeManager(Generic[U]):
_target_size_bytes: int
Expand Down
Loading
Loading