huggingface · NazarKozak · Apr 25, 2026
diff --git a/src/diffusers/models/model_loading_utils.py b/src/diffusers/models/model_loading_utils.py
@@ -38,6 +38,7 @@
     WEIGHTS_INDEX_NAME,
     _add_variant,
     _get_model_file,
+    _is_lfs_pointer,
     deprecate,
     is_accelerate_available,
     is_accelerate_version,
@@ -164,6 +165,22 @@ def load_state_dict(
     # TODO: maybe refactor a bit this part where we pass a dict here
     if isinstance(checkpoint_file, dict):
         return checkpoint_file
+
+    # Detect Git LFS pointer stubs before attempting to load. Without this, safetensors /
+    # torch.load fails far away from the real cause with a confusing deserialization error.
+    # The check covers both `git clone` without `git lfs pull` and bucket mirrors created
+    # with `gsutil rsync` / `aws s3 sync` that copied the LFS pointer text rather than
+    # the underlying weights.
+    if dduf_entries is None and _is_lfs_pointer(checkpoint_file):
+        raise OSError(
+            f"`{checkpoint_file}` is a Git LFS pointer file, not the actual weights. This typically "
+            f"happens when a Hugging Face repository was mirrored without LFS-aware copying — for "
+            f"example, `git clone` without a subsequent `git lfs pull`, or `gsutil rsync` / "
+            f"`aws s3 sync` from a bucket that holds the original git checkout. Re-mirror with "
+            f"`git lfs pull` (or with an LFS-aware tool such as "
+            f"`huggingface-cli download <repo_id> --local-dir <dir>`) and try again."
+        )
+
     try:
         file_extension = os.path.basename(checkpoint_file).split(".")[-1]
         if file_extension == SAFETENSORS_FILE_EXTENSION:

diff --git a/src/diffusers/utils/__init__.py b/src/diffusers/utils/__init__.py
@@ -50,6 +50,7 @@
     _add_variant,
     _get_checkpoint_shard_files,
     _get_model_file,
+    _is_lfs_pointer,
     extract_commit_hash,
     http_user_agent,
 )

diff --git a/src/diffusers/utils/hub_utils.py b/src/diffusers/utils/hub_utils.py
@@ -224,6 +224,38 @@ def _add_variant(weights_name: str, variant: str | None = None) -> str:
     return weights_name
 
 
+# Real `.safetensors` / `.bin` weight files are MB-to-GB. Git LFS pointer stubs are
+# tiny text files (~130 bytes). Anything at or below this size that starts with the LFS
+# pointer marker is, with extremely high confidence, a pointer rather than weights.
+_LFS_POINTER_MAX_SIZE = 1024
+_LFS_POINTER_PREFIX = b"version https://git-lfs.github.com/spec/v1"
+
+
+def _is_lfs_pointer(path: str | os.PathLike) -> bool:
+    """
+    Return ``True`` if ``path`` is a Git LFS pointer stub rather than the actual file content.
+
+    LFS pointer files are small text files (~130 bytes) of the form::
+
+        version https://git-lfs.github.com/spec/v1
+        oid sha256:<hash>
+        size <bytes>
+
+    They are produced by ``git clone`` of an LFS-backed repository without ``git lfs pull``,
+    or by tools such as ``gsutil rsync`` / ``aws s3 sync`` that mirror an HF repository
+    without LFS-aware copying; loading one as weights fails with a confusing deserialization
+    error. Only files of at most ``_LFS_POINTER_MAX_SIZE`` bytes are opened and compared with
+    ``_LFS_POINTER_PREFIX``; larger files and any ``OSError`` (e.g. missing path) give ``False``.
+    """
+    try:
+        if os.path.getsize(path) > _LFS_POINTER_MAX_SIZE:
+            return False
+        with open(path, "rb") as f:
+            return f.read(len(_LFS_POINTER_PREFIX)) == _LFS_POINTER_PREFIX
+    except OSError:
+        return False
+
+
 @validate_hf_hub_args
 def _get_model_file(
     pretrained_model_name_or_path: str | Path,

diff --git a/tests/others/test_hub_utils.py b/tests/others/test_hub_utils.py
@@ -16,7 +16,7 @@
 from pathlib import Path
 from tempfile import TemporaryDirectory
 
-from diffusers.utils.hub_utils import load_or_create_model_card, populate_model_card
+from diffusers.utils.hub_utils import _is_lfs_pointer, load_or_create_model_card, populate_model_card
 
 
 class CreateModelCardTest(unittest.TestCase):
@@ -27,3 +27,42 @@ def test_generate_model_card_with_library_name(self):
             model_card = load_or_create_model_card(file_path)
             populate_model_card(model_card)
             assert model_card.data.library_name == "foo"
+
+
+class IsLFSPointerTest(unittest.TestCase):
+    # Covers `_is_lfs_pointer`: pointer stub, safetensors-like payload, oversized file, missing path, JSON file.
+    LFS_POINTER_TEXT = (
+        "version https://git-lfs.github.com/spec/v1\n"
+        "oid sha256:0000000000000000000000000000000000000000000000000000000000000000\n"
+        "size 17000000000\n"
+    )
+
+    def test_detects_lfs_pointer(self):
+        with TemporaryDirectory() as tmpdir:
+            file_path = Path(tmpdir) / "diffusion_pytorch_model.safetensors"
+            file_path.write_text(self.LFS_POINTER_TEXT)
+            assert _is_lfs_pointer(file_path) is True
+
+    def test_real_safetensors_not_flagged(self):
+        # safetensors files start with an 8-byte little-endian header length and JSON metadata,
+        # never with the LFS pointer marker. Synthesise a small payload to confirm.
+        with TemporaryDirectory() as tmpdir:
+            file_path = Path(tmpdir) / "diffusion_pytorch_model.safetensors"
+            file_path.write_bytes(b"\x08\x00\x00\x00\x00\x00\x00\x00{}      ")
+            assert _is_lfs_pointer(file_path) is False
+
+    def test_large_file_not_flagged(self):
+        # Files over the size threshold return False without being read, even if the content starts like a pointer.
+        with TemporaryDirectory() as tmpdir:
+            file_path = Path(tmpdir) / "diffusion_pytorch_model.safetensors"
+            file_path.write_bytes(self.LFS_POINTER_TEXT.encode() + b"\x00" * 4096)
+            assert _is_lfs_pointer(file_path) is False
+
+    def test_missing_file_returns_false(self):
+        assert _is_lfs_pointer("/nonexistent/path/foo.safetensors") is False
+
+    def test_unrelated_short_file_not_flagged(self):
+        with TemporaryDirectory() as tmpdir:
+            file_path = Path(tmpdir) / "config.json"
+            file_path.write_text('{"version": 2}\n')
+            assert _is_lfs_pointer(file_path) is False