Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions src/diffusers/models/model_loading_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
WEIGHTS_INDEX_NAME,
_add_variant,
_get_model_file,
_is_lfs_pointer,
deprecate,
is_accelerate_available,
is_accelerate_version,
Expand Down Expand Up @@ -164,6 +165,22 @@ def load_state_dict(
# TODO: maybe refactor a bit this part where we pass a dict here
if isinstance(checkpoint_file, dict):
return checkpoint_file

# Detect Git LFS pointer stubs before attempting to load. Without this, safetensors /
# torch.load fails far away from the real cause with a confusing deserialization error.
# The check covers both `git clone` without `git lfs pull` and bucket mirrors created
# with `gsutil rsync` / `aws s3 sync` that copied the LFS pointer text rather than
# the underlying weights.
if dduf_entries is None and _is_lfs_pointer(checkpoint_file):
raise OSError(
f"`{checkpoint_file}` is a Git LFS pointer file, not the actual weights. This typically "
f"happens when a Hugging Face repository was mirrored without LFS-aware copying — for "
f"example, `git clone` without a subsequent `git lfs pull`, or `gsutil rsync` / "
f"`aws s3 sync` from a bucket that holds the original git checkout. Re-mirror with "
f"`git lfs pull` (or with an LFS-aware tool such as "
f"`huggingface-cli download <repo_id> --local-dir <dir>`) and try again."
)

try:
file_extension = os.path.basename(checkpoint_file).split(".")[-1]
if file_extension == SAFETENSORS_FILE_EXTENSION:
Expand Down
1 change: 1 addition & 0 deletions src/diffusers/utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@
_add_variant,
_get_checkpoint_shard_files,
_get_model_file,
_is_lfs_pointer,
extract_commit_hash,
http_user_agent,
)
Expand Down
32 changes: 32 additions & 0 deletions src/diffusers/utils/hub_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -224,6 +224,38 @@ def _add_variant(weights_name: str, variant: str | None = None) -> str:
return weights_name


# Real `.safetensors` / `.bin` weight files are MB-to-GB. Git LFS pointer stubs are
# tiny text files (~130 bytes). Anything under this size that starts with the LFS
# pointer marker is, with extremely high confidence, a pointer rather than weights.
_LFS_POINTER_MAX_SIZE = 1024
_LFS_POINTER_PREFIX = b"version https://git-lfs.github.com/spec/v1"


def _is_lfs_pointer(path: str | os.PathLike) -> bool:
"""
Return ``True`` if ``path`` is a Git LFS pointer stub rather than the actual file content.

LFS pointer files are small text files (~130 bytes) of the form::

version https://git-lfs.github.com/spec/v1
oid sha256:<hash>
size <bytes>

They are produced by ``git clone`` of an LFS-backed repository without ``git lfs pull``,
or by tools such as ``gsutil rsync`` / ``aws s3 sync`` that mirror an HF repository
without LFS-aware copying. Loading a pointer file as if it were the actual weights
leads to a confusing safetensors / pickle deserialization error far away from the
real cause.
"""
try:
if os.path.getsize(path) > _LFS_POINTER_MAX_SIZE:
return False
with open(path, "rb") as f:
return f.read(len(_LFS_POINTER_PREFIX)) == _LFS_POINTER_PREFIX
except OSError:
return False


@validate_hf_hub_args
def _get_model_file(
pretrained_model_name_or_path: str | Path,
Expand Down
41 changes: 40 additions & 1 deletion tests/others/test_hub_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
from pathlib import Path
from tempfile import TemporaryDirectory

from diffusers.utils.hub_utils import load_or_create_model_card, populate_model_card
from diffusers.utils.hub_utils import _is_lfs_pointer, load_or_create_model_card, populate_model_card


class CreateModelCardTest(unittest.TestCase):
Expand All @@ -27,3 +27,42 @@ def test_generate_model_card_with_library_name(self):
model_card = load_or_create_model_card(file_path)
populate_model_card(model_card)
assert model_card.data.library_name == "foo"


class IsLFSPointerTest(unittest.TestCase):
    """Exercise ``_is_lfs_pointer`` against pointer stubs and files that must not be flagged."""

    # A syntactically valid Git LFS pointer: version line, oid line, size line.
    LFS_POINTER_TEXT = (
        "version https://git-lfs.github.com/spec/v1\n"
        "oid sha256:0000000000000000000000000000000000000000000000000000000000000000\n"
        "size 17000000000\n"
    )

    def test_detects_lfs_pointer(self):
        # A weights-named file whose whole content is the pointer text must be detected.
        with TemporaryDirectory() as workdir:
            stub = Path(workdir, "diffusion_pytorch_model.safetensors")
            stub.write_text(self.LFS_POINTER_TEXT)
            detected = _is_lfs_pointer(stub)
            assert detected is True

    def test_real_safetensors_not_flagged(self):
        # Minimal safetensors-shaped payload: 8-byte little-endian header length followed
        # by JSON metadata. It does not begin with the LFS marker, so it must pass through.
        payload = b"\x08\x00\x00\x00\x00\x00\x00\x00{} "
        with TemporaryDirectory() as workdir:
            weights = Path(workdir, "diffusion_pytorch_model.safetensors")
            weights.write_bytes(payload)
            detected = _is_lfs_pointer(weights)
            assert detected is False

    def test_large_file_not_flagged(self):
        # Pointer text followed by 4 KiB of zero bytes exceeds the size threshold, so the
        # helper must answer False even though the leading bytes match the marker.
        padding = bytes(4096)
        with TemporaryDirectory() as workdir:
            oversized = Path(workdir, "diffusion_pytorch_model.safetensors")
            oversized.write_bytes(self.LFS_POINTER_TEXT.encode() + padding)
            detected = _is_lfs_pointer(oversized)
            assert detected is False

    def test_missing_file_returns_false(self):
        # A path that does not exist is reported as "not a pointer" rather than raising.
        detected = _is_lfs_pointer("/nonexistent/path/foo.safetensors")
        assert detected is False

    def test_unrelated_short_file_not_flagged(self):
        # Small file, but with content unrelated to the LFS marker.
        with TemporaryDirectory() as workdir:
            config = Path(workdir, "config.json")
            config.write_text('{"version": 2}\n')
            detected = _is_lfs_pointer(config)
            assert detected is False
Loading