diff --git a/src/diffusers/models/model_loading_utils.py b/src/diffusers/models/model_loading_utils.py
index 04642ad5d401..92394d4a93dc 100644
--- a/src/diffusers/models/model_loading_utils.py
+++ b/src/diffusers/models/model_loading_utils.py
@@ -38,6 +38,7 @@
     WEIGHTS_INDEX_NAME,
     _add_variant,
     _get_model_file,
+    _is_lfs_pointer,
     deprecate,
     is_accelerate_available,
     is_accelerate_version,
@@ -164,6 +165,22 @@ def load_state_dict(
     # TODO: maybe refactor a bit this part where we pass a dict here
     if isinstance(checkpoint_file, dict):
         return checkpoint_file
+
+    # Detect Git LFS pointer stubs before attempting to load. Without this, safetensors /
+    # torch.load fails far away from the real cause with a confusing deserialization error.
+    # The check covers both `git clone` without `git lfs pull` and bucket mirrors created
+    # with `gsutil rsync` / `aws s3 sync` that copied the LFS pointer text rather than
+    # the underlying weights.
+    if dduf_entries is None and _is_lfs_pointer(checkpoint_file):
+        raise OSError(
+            f"`{checkpoint_file}` is a Git LFS pointer file, not the actual weights. This typically "
+            f"happens when a Hugging Face repository was mirrored without LFS-aware copying — for "
+            f"example, `git clone` without a subsequent `git lfs pull`, or `gsutil rsync` / "
+            f"`aws s3 sync` from a bucket that holds the original git checkout. Re-mirror with "
+            f"`git lfs pull` (or with an LFS-aware tool such as "
+            f"`huggingface-cli download <repo_id> --local-dir <dir>`) and try again."
+        )
+
     try:
         file_extension = os.path.basename(checkpoint_file).split(".")[-1]
         if file_extension == SAFETENSORS_FILE_EXTENSION:
diff --git a/src/diffusers/utils/__init__.py b/src/diffusers/utils/__init__.py
index 10ad75d92f17..eacdd5ec695e 100644
--- a/src/diffusers/utils/__init__.py
+++ b/src/diffusers/utils/__init__.py
@@ -50,6 +50,7 @@
     _add_variant,
     _get_checkpoint_shard_files,
     _get_model_file,
+    _is_lfs_pointer,
     extract_commit_hash,
     http_user_agent,
 )
diff --git a/src/diffusers/utils/hub_utils.py b/src/diffusers/utils/hub_utils.py
index b5eb9ab2e17f..0908eaffd1b2 100644
--- a/src/diffusers/utils/hub_utils.py
+++ b/src/diffusers/utils/hub_utils.py
@@ -224,6 +224,38 @@ def _add_variant(weights_name: str, variant: str | None = None) -> str:
     return weights_name
 
 
+# Real `.safetensors` / `.bin` weight files are MB-to-GB. Git LFS pointer stubs are
+# tiny text files (~130 bytes). Anything under this size that starts with the LFS
+# pointer marker is, with extremely high confidence, a pointer rather than weights.
+_LFS_POINTER_MAX_SIZE = 1024
+_LFS_POINTER_PREFIX = b"version https://git-lfs.github.com/spec/v1"
+
+
+def _is_lfs_pointer(path: str | os.PathLike) -> bool:
+    """
+    Return ``True`` if ``path`` is a Git LFS pointer stub rather than the actual file content.
+
+    LFS pointer files are small text files (~130 bytes) of the form::
+
+        version https://git-lfs.github.com/spec/v1
+        oid sha256:<hex digest of the real file>
+        size <size of the real file in bytes>
+
+    They are produced by ``git clone`` of an LFS-backed repository without ``git lfs pull``,
+    or by tools such as ``gsutil rsync`` / ``aws s3 sync`` that mirror an HF repository
+    without LFS-aware copying. Loading a pointer file as if it were the actual weights
+    leads to a confusing safetensors / pickle deserialization error far away from the
+    real cause.
+    """
+    try:
+        if os.path.getsize(path) > _LFS_POINTER_MAX_SIZE:
+            return False
+        with open(path, "rb") as f:
+            return f.read(len(_LFS_POINTER_PREFIX)) == _LFS_POINTER_PREFIX
+    except OSError:
+        return False
+
+
 @validate_hf_hub_args
 def _get_model_file(
     pretrained_model_name_or_path: str | Path,
diff --git a/tests/others/test_hub_utils.py b/tests/others/test_hub_utils.py
index 0a6b8ef2bd9f..e98bec265753 100644
--- a/tests/others/test_hub_utils.py
+++ b/tests/others/test_hub_utils.py
@@ -16,7 +16,7 @@
 from pathlib import Path
 from tempfile import TemporaryDirectory
 
-from diffusers.utils.hub_utils import load_or_create_model_card, populate_model_card
+from diffusers.utils.hub_utils import _is_lfs_pointer, load_or_create_model_card, populate_model_card
 
 
 class CreateModelCardTest(unittest.TestCase):
@@ -27,3 +27,42 @@ def test_generate_model_card_with_library_name(self):
             model_card = load_or_create_model_card(file_path)
             populate_model_card(model_card)
             assert model_card.data.library_name == "foo"
+
+
+class IsLFSPointerTest(unittest.TestCase):
+    LFS_POINTER_TEXT = (
+        "version https://git-lfs.github.com/spec/v1\n"
+        "oid sha256:0000000000000000000000000000000000000000000000000000000000000000\n"
+        "size 17000000000\n"
+    )
+
+    def test_detects_lfs_pointer(self):
+        with TemporaryDirectory() as tmpdir:
+            file_path = Path(tmpdir) / "diffusion_pytorch_model.safetensors"
+            file_path.write_text(self.LFS_POINTER_TEXT)
+            assert _is_lfs_pointer(file_path) is True
+
+    def test_real_safetensors_not_flagged(self):
+        # safetensors files start with an 8-byte little-endian header length and JSON metadata,
+        # never with the LFS pointer marker. Synthesise a small payload to confirm.
+        with TemporaryDirectory() as tmpdir:
+            file_path = Path(tmpdir) / "diffusion_pytorch_model.safetensors"
+            file_path.write_bytes(b"\x08\x00\x00\x00\x00\x00\x00\x00{} ")
+            assert _is_lfs_pointer(file_path) is False
+
+    def test_large_file_not_flagged(self):
+        # A file larger than the pointer-size threshold is short-circuited to False without
+        # being read, even if its contents would otherwise match.
+        with TemporaryDirectory() as tmpdir:
+            file_path = Path(tmpdir) / "diffusion_pytorch_model.safetensors"
+            file_path.write_bytes(self.LFS_POINTER_TEXT.encode() + b"\x00" * 4096)
+            assert _is_lfs_pointer(file_path) is False
+
+    def test_missing_file_returns_false(self):
+        assert _is_lfs_pointer("/nonexistent/path/foo.safetensors") is False
+
+    def test_unrelated_short_file_not_flagged(self):
+        with TemporaryDirectory() as tmpdir:
+            file_path = Path(tmpdir) / "config.json"
+            file_path.write_text('{"version": 2}\n')
+            assert _is_lfs_pointer(file_path) is False