Skip to content

Commit 1fee461

Browse files
tubclaude authored and committed
[python] Address PR apache#7347 review feedback: use @cachedmethod, trim docs, remove ChangelogProducer
- Upgrade cachetools to >=7,<8 for cachedmethod(info=True) support
- Remove ChangelogProducer enum (belongs in apache#7348 scanners branch)
- Replace manual cache hit/miss counters with @cachedmethod(info=True) decorator on ManifestFileManager, ManifestListManager, SnapshotManager
- Trim verbose docstrings across identifier, file_io, pyarrow_file_io, manifest_list_manager, and snapshot_manager
- Update cache tests to use cache_info() instead of manual counters

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent ffb9834 commit 1fee461

9 files changed

Lines changed: 45 additions & 247 deletions

File tree

paimon-python/dev/requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
################################################################################
1818
# Core dependencies for pypaimon
1919
cachetools>=4.2,<6; python_version=="3.6"
20-
cachetools>=5,<6; python_version>"3.6"
20+
cachetools>=7,<8; python_version>"3.6"
2121
dataclasses>=0.8; python_version < "3.7"
2222
fastavro>=1.4,<2
2323
fsspec>=2021.10,<2026; python_version<"3.8"

paimon-python/pypaimon/common/file_io.py

Lines changed: 1 addition & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -53,18 +53,7 @@ def exists(self, path: str) -> bool:
5353
pass
5454

5555
def exists_batch(self, paths: List[str]) -> Dict[str, bool]:
56-
"""
57-
Check existence of multiple paths.
58-
59-
Default implementation calls exists() for each path sequentially.
60-
Subclasses may override to provide more efficient batch operations.
61-
62-
Args:
63-
paths: List of file paths to check
64-
65-
Returns:
66-
Dictionary mapping each path to its existence status
67-
"""
56+
"""Check existence of multiple paths, returning {path: bool}."""
6857
return {path: self.exists(path) for path in paths}
6958

7059
@abstractmethod

paimon-python/pypaimon/common/identifier.py

Lines changed: 1 addition & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -37,29 +37,7 @@ def create(cls, database: str, object: str) -> "Identifier":
3737

3838
@classmethod
3939
def from_string(cls, full_name: str) -> "Identifier":
40-
"""
41-
Parse an identifier from a string.
42-
43-
Supports two modes:
44-
1. Backtick-quoted (for names with periods):
45-
- `database.name`.table → database="database.name", object="table"
46-
- `db`.`table` → database="db", object="table"
47-
48-
2. Simple split on first period (Java-compatible fallback):
49-
- database.table → database="database", object="table"
50-
51-
For database names containing periods, use backticks or
52-
Identifier.create("database.name", "table") directly.
53-
54-
Args:
55-
full_name: The full identifier string
56-
57-
Returns:
58-
Identifier instance
59-
60-
Raises:
61-
ValueError: If the format is invalid
62-
"""
40+
"""Parse a 'database.object' identifier, with optional backtick quoting."""
6341
if not full_name or not full_name.strip():
6442
raise ValueError("fullName cannot be null or empty")
6543

@@ -79,13 +57,6 @@ def from_string(cls, full_name: str) -> "Identifier":
7957

8058
@classmethod
8159
def _parse_with_backticks(cls, full_name: str) -> "Identifier":
82-
"""
83-
Parse identifier with backtick-quoted segments.
84-
85-
Examples:
86-
- `db.name`.table → database="db.name", object="table"
87-
- `db`.`table` → database="db", object="table"
88-
"""
8960
parts = []
9061
current = ""
9162
in_backticks = False

paimon-python/pypaimon/common/options/core_options.py

Lines changed: 0 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -45,21 +45,6 @@ class MergeEngine(str, Enum):
4545
FIRST_ROW = "first-row"
4646

4747

48-
class ChangelogProducer(str, Enum):
49-
"""
50-
Specifies the changelog producer for tables with primary key.
51-
52-
- NONE: No changelog file (default for append-only tables)
53-
- INPUT: Double write changelog from input
54-
- FULL_COMPACTION: Generate changelog with each full compaction
55-
- LOOKUP: Generate changelog through lookup compaction
56-
"""
57-
NONE = "none"
58-
INPUT = "input"
59-
FULL_COMPACTION = "full-compaction"
60-
LOOKUP = "lookup"
61-
62-
6348
class CoreOptions:
6449
"""Core options for Paimon tables."""
6550
# File format constants
@@ -276,15 +261,6 @@ class CoreOptions:
276261
"Options: deduplicate, partial-update, aggregation, first-row.")
277262
)
278263

279-
CHANGELOG_PRODUCER: ConfigOption[ChangelogProducer] = (
280-
ConfigOptions.key("changelog-producer")
281-
.enum_type(ChangelogProducer)
282-
.default_value(ChangelogProducer.NONE)
283-
.with_description(
284-
"Whether to double write to a changelog file. "
285-
"Options: none, input, full-compaction, lookup."
286-
)
287-
)
288264
# Commit options
289265
COMMIT_USER_PREFIX: ConfigOption[str] = (
290266
ConfigOptions.key("commit.user-prefix")
@@ -502,9 +478,6 @@ def deletion_vectors_enabled(self, default=None):
502478
def merge_engine(self, default=None):
503479
return self.options.get(CoreOptions.MERGE_ENGINE, default)
504480

505-
def changelog_producer(self, default=None):
506-
return self.options.get(CoreOptions.CHANGELOG_PRODUCER, default)
507-
508481
def data_file_external_paths(self, default=None):
509482
external_paths_str = self.options.get(CoreOptions.DATA_FILE_EXTERNAL_PATHS, default)
510483
if not external_paths_str:

paimon-python/pypaimon/filesystem/pyarrow_file_io.py

Lines changed: 1 addition & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -235,18 +235,7 @@ def exists(self, path: str) -> bool:
235235
return self._get_file_info(path_str).type != pafs.FileType.NotFound
236236

237237
def exists_batch(self, paths: List[str]) -> Dict[str, bool]:
238-
"""
239-
Check existence of multiple paths in one S3 API call.
240-
241-
This is more efficient than calling exists() for each path individually
242-
as it batches the file info requests.
243-
244-
Args:
245-
paths: List of file paths to check
246-
247-
Returns:
248-
Dictionary mapping each path to its existence status
249-
"""
238+
"""Check existence of multiple paths in a single batched API call."""
250239
if not paths:
251240
return {}
252241

paimon-python/pypaimon/manifest/manifest_file_manager.py

Lines changed: 4 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -18,10 +18,10 @@
1818
from concurrent.futures import ThreadPoolExecutor
1919
from datetime import datetime
2020
from io import BytesIO
21-
from typing import List, Optional
21+
from typing import List
2222

2323
import fastavro
24-
from cachetools import LRUCache
24+
from cachetools import LRUCache, cachedmethod
2525

2626
from pypaimon.manifest.schema.data_file_meta import DataFileMeta
2727
from pypaimon.manifest.schema.manifest_entry import (MANIFEST_ENTRY_SCHEMA,
@@ -47,12 +47,7 @@ def __init__(self, table, cache_max_size: int = 100):
4747
self.primary_keys_fields = self.table.primary_keys_fields
4848
self.trimmed_primary_keys_fields = self.table.trimmed_primary_keys_fields
4949

50-
# LRU cache for manifest file contents
51-
self._cache: Optional[LRUCache] = (
52-
LRUCache(maxsize=cache_max_size) if cache_max_size > 0 else None
53-
)
54-
self._cache_hits = 0
55-
self._cache_misses = 0
50+
self._cache: LRUCache = LRUCache(maxsize=max(cache_max_size, 0))
5651

5752
def read_entries_parallel(self, manifest_files: List[ManifestFileMeta], manifest_entry_filter=None,
5853
drop_stats=True, max_workers=8) -> List[ManifestEntry]:
@@ -89,21 +84,7 @@ def _entry_identifier(e: ManifestEntry) -> tuple:
8984
return final_entries
9085

9186
def read(self, manifest_file_name: str, manifest_entry_filter=None, drop_stats=True) -> List[ManifestEntry]:
92-
# Check cache first (cache stores unfiltered entries with stats)
93-
if self._cache is not None and manifest_file_name in self._cache:
94-
self._cache_hits += 1
95-
entries = self._cache[manifest_file_name]
96-
return self._apply_post_read(entries, manifest_entry_filter, drop_stats)
97-
98-
self._cache_misses += 1
99-
100-
# Read from storage
10187
entries = self._read_from_storage(manifest_file_name)
102-
103-
# Cache the unfiltered entries
104-
if self._cache is not None:
105-
self._cache[manifest_file_name] = entries
106-
10788
return self._apply_post_read(entries, manifest_entry_filter, drop_stats)
10889

10990
def _apply_post_read(self, entries: List[ManifestEntry], manifest_entry_filter,
@@ -119,6 +100,7 @@ def _apply_post_read(self, entries: List[ManifestEntry], manifest_entry_filter,
119100
result.append(entry)
120101
return result
121102

103+
@cachedmethod(lambda self: self._cache, key=lambda self, name: name, info=True)
122104
def _read_from_storage(self, manifest_file_name: str) -> List[ManifestEntry]:
123105
"""Read manifest entries from storage (no filtering, with stats)."""
124106
manifest_file_path = f"{self.manifest_path}/{manifest_file_name}"

paimon-python/pypaimon/manifest/manifest_list_manager.py

Lines changed: 6 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
from typing import List, Optional
2121

2222
import fastavro
23-
from cachetools import LRUCache
23+
from cachetools import LRUCache, cachedmethod
2424

2525
from pypaimon.manifest.schema.manifest_file_meta import (
2626
MANIFEST_FILE_META_SCHEMA, ManifestFileMeta)
@@ -41,12 +41,7 @@ def __init__(self, table, cache_max_size: int = 50):
4141
self.manifest_path = f"{manifest_path}/manifest"
4242
self.file_io = self.table.file_io
4343

44-
# LRU cache for manifest list contents
45-
self._cache: Optional[LRUCache] = (
46-
LRUCache(maxsize=cache_max_size) if cache_max_size > 0 else None
47-
)
48-
self._cache_hits = 0
49-
self._cache_misses = 0
44+
self._cache: LRUCache = LRUCache(maxsize=max(cache_max_size, 0))
5045

5146
def read_all(self, snapshot: Optional[Snapshot]) -> List[ManifestFileMeta]:
5247
if snapshot is None:
@@ -59,57 +54,22 @@ def read_all(self, snapshot: Optional[Snapshot]) -> List[ManifestFileMeta]:
5954
return manifest_files
6055

6156
def read_base(self, snapshot: Snapshot) -> List[ManifestFileMeta]:
62-
"""
63-
Read only base manifest list (all files at this snapshot point).
64-
65-
This is useful for computing diffs between two snapshots, where
66-
we only need the base state and not the delta changes.
67-
68-
Args:
69-
snapshot: The snapshot to read base manifest from
70-
71-
Returns:
72-
List of ManifestFileMeta representing all files at this snapshot
73-
"""
57+
"""Read only the base manifest list for the given snapshot."""
7458
return self.read(snapshot.base_manifest_list)
7559

7660
def read_delta(self, snapshot: Snapshot) -> List[ManifestFileMeta]:
7761
return self.read(snapshot.delta_manifest_list)
7862

7963
def read_changelog(self, snapshot: Snapshot) -> List[ManifestFileMeta]:
80-
"""
81-
Read changelog manifest files from snapshot.
82-
83-
For primary key tables with changelog-producer=input/full-compaction/lookup,
84-
changelog files are stored in a separate manifest list.
85-
86-
Args:
87-
snapshot: The snapshot to read changelog from
88-
89-
Returns:
90-
List of ManifestFileMeta for changelog files, or empty list if no changelog
91-
"""
64+
"""Read changelog manifest files from snapshot, or empty list if none."""
9265
if snapshot.changelog_manifest_list is None:
9366
return []
9467
return self.read(snapshot.changelog_manifest_list)
9568

9669
def read(self, manifest_list_name: str) -> List[ManifestFileMeta]:
97-
# Check cache first
98-
if self._cache is not None and manifest_list_name in self._cache:
99-
self._cache_hits += 1
100-
return self._cache[manifest_list_name]
101-
102-
self._cache_misses += 1
103-
104-
# Read from storage
105-
result = self._read_from_storage(manifest_list_name)
106-
107-
# Cache the result
108-
if self._cache is not None:
109-
self._cache[manifest_list_name] = result
110-
111-
return result
70+
return self._read_from_storage(manifest_list_name)
11271

72+
@cachedmethod(lambda self: self._cache, key=lambda self, name: name, info=True)
11373
def _read_from_storage(self, manifest_list_name: str) -> List[ManifestFileMeta]:
11474
"""Read manifest list from storage."""
11575
manifest_files = []

0 commit comments

Comments (0)