vietanhdev
diff --git a/.gitignore b/.gitignore
Lines changed: 2 additions & 1 deletion
diff --git a/pyproject.toml b/pyproject.toml
Lines changed: 9 additions & 7 deletions
diff --git a/samexporter/export_sam2.py b/samexporter/export_sam2.py
Lines changed: 6 additions & 3 deletions
diff --git a/samexporter/export_sam3.py b/samexporter/export_sam3.py
Lines changed: 2 additions & 1 deletion
diff --git a/samexporter/sam2_configs/__init__.py b/samexporter/sam2_configs/__init__.py
Lines changed: 5 additions & 0 deletions
diff --git a/samexporter/sam2_configs/sam2.1/sam2.1_hiera_b+.yaml b/samexporter/sam2_configs/sam2.1/sam2.1_hiera_b+.yaml
Lines changed: 116 additions & 0 deletions
diff --git a/samexporter/sam2_configs/sam2.1/sam2.1_hiera_l.yaml b/samexporter/sam2_configs/sam2.1/sam2.1_hiera_l.yaml
Lines changed: 120 additions & 0 deletions
@@ -6,4 +6,5 @@
 /output_images
 *.egg-info
 *.pyc
-/dist
+/dist
+/build
@@ -4,19 +4,19 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "samexporter"
-version = "0.4.4"
+version = "0.4.5"
 description = "Exporting Segment Anything models ONNX format"
 authors = [
     {name = "Viet Anh Nguyen", email = "vietanh.dev@gmail.com"},
 ]
 readme = "README.md"
 requires-python = ">=3.11"
-license = {file = "LICENSE"}
+license = "MIT"
+license-files = ["LICENSE"]
 classifiers = [
     "Programming Language :: Python :: 3.11",
     "Programming Language :: Python :: 3.12",
     "Programming Language :: Python :: 3.13",
-    "License :: OSI Approved :: MIT License",
     "Operating System :: OS Independent",
 ]
 
@@ -28,7 +28,6 @@ dependencies = [
     "torch==2.10.0",
     "torchvision==0.25.0",
     "timm==0.9.2",
-    "onnxsim==0.5.0",
     "numpy==1.26.4",
     "onnxscript==0.6.2",
     "osam",  # CLIP tokeniser for SAM3 text prompts
@@ -39,13 +38,16 @@ dependencies = [
 "Bug Tracker" = "https://github.com/vietanhdev/samexporter/issues"
 
 [project.optional-dependencies]
-dev = ["ruff", "pre-commit"]
+# onnxsim ships no Windows wheel, and its source tarball exceeds the Windows
+# 260-character path limit.  Install with: pip install "samexporter[export]"
+export = ["onnxsim==0.5.0"]
+dev = ["ruff", "pre-commit", "pytest"]
 
 [tool.setuptools]
-packages = ["samexporter"]
+packages = ["samexporter", "samexporter.mobile_encoder", "samexporter.sam2_configs"]
 
 [tool.setuptools.package-data]
-"samexporter.sam2_configs" = ["*.py", "*.yaml"]
+"samexporter.sam2_configs" = ["*.yaml", "sam2.1/*.yaml"]
 
 [tool.pytest.ini_options]
 testpaths = ["tests"]
 
@@ -4,7 +4,6 @@
 
 import onnx
 import torch
-from onnxsim import simplify
 from sam2.build_sam import build_sam2
 from sam2.modeling.sam2_base import SAM2Base
 from torch import nn
@@ -217,9 +216,9 @@ def _embed_masks(
     # Clear any existing Hydra instance
     GlobalHydra.instance().clear()
 
-    # Get absolute path to sam2_configs
+    # Get absolute path to sam2_configs (bundled inside the samexporter package)
     config_dir = os.path.abspath(
-        os.path.join(os.path.dirname(__file__), "..", "sam2_configs")
+        os.path.join(os.path.dirname(__file__), "sam2_configs")
     )
 
     with initialize_config_dir(config_dir=config_dir, version_base="1.2"):
@@ -242,6 +241,8 @@ def _embed_masks(
     print("Saved encoder to", args.output_encoder)
     if args.simplify:
         print("Simplifying encoder...")
+        from onnxsim import simplify
+
         onnx_model = onnx.load(args.output_encoder)
         model_simp, check = simplify(onnx_model)
         assert check, "Simplified ONNX model could not be validated"
@@ -312,6 +313,8 @@ def _embed_masks(
     print("Saved decoder to", args.output_decoder)
     if args.simplify:
         print("Simplifying decoder...")
+        from onnxsim import simplify
+
         onnx_model = onnx.load(args.output_decoder)
         model_simp, check = simplify(onnx_model)
         assert check, "Simplified ONNX model could not be validated"
 
@@ -6,7 +6,6 @@
 
 import onnx
 import torch
-from onnxsim import simplify
 from torchvision.transforms import v2
 
 # Mock triton for Windows – must happen before any sam3 imports.
@@ -285,6 +284,8 @@ def export_sam3(output_dir: str, opset: int = 18, simplify_model: bool = False):
 
     # ── Simplify models conditionally ─────────────────────────────────────────
     if simplify_model:
+        from onnxsim import simplify
+
         for path in [encoder_path, language_path, decoder_path]:
             print(f"Simplifying {path}...")
             try:
 
@@ -0,0 +1,5 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
@@ -0,0 +1,116 @@
+# @package _global_
+
+# Model
+model:
+  _target_: sam2.modeling.sam2_base.SAM2Base
+  image_encoder:
+    _target_: sam2.modeling.backbones.image_encoder.ImageEncoder
+    scalp: 1
+    trunk:
+      _target_: sam2.modeling.backbones.hieradet.Hiera
+      embed_dim: 112
+      num_heads: 2
+    neck:
+      _target_: sam2.modeling.backbones.image_encoder.FpnNeck
+      position_encoding:
+        _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
+        num_pos_feats: 256
+        normalize: true
+        scale: null
+        temperature: 10000
+      d_model: 256
+      backbone_channel_list: [896, 448, 224, 112]
+      fpn_top_down_levels: [2, 3]  # output level 0 and 1 directly use the backbone features
+      fpn_interp_model: nearest
+
+  memory_attention:
+    _target_: sam2.modeling.memory_attention.MemoryAttention
+    d_model: 256
+    pos_enc_at_input: true
+    layer:
+      _target_: sam2.modeling.memory_attention.MemoryAttentionLayer
+      activation: relu
+      dim_feedforward: 2048
+      dropout: 0.1
+      pos_enc_at_attn: false
+      self_attention:
+        _target_: sam2.modeling.sam.transformer.RoPEAttention
+        rope_theta: 10000.0
+        feat_sizes: [64, 64]
+        embedding_dim: 256
+        num_heads: 1
+        downsample_rate: 1
+        dropout: 0.1
+      d_model: 256
+      pos_enc_at_cross_attn_keys: true
+      pos_enc_at_cross_attn_queries: false
+      cross_attention:
+        _target_: sam2.modeling.sam.transformer.RoPEAttention
+        rope_theta: 10000.0
+        feat_sizes: [64, 64]
+        rope_k_repeat: True
+        embedding_dim: 256
+        num_heads: 1
+        downsample_rate: 1
+        dropout: 0.1
+        kv_in_dim: 64
+    num_layers: 4
+
+  memory_encoder:
+      _target_: sam2.modeling.memory_encoder.MemoryEncoder
+      out_dim: 64
+      position_encoding:
+        _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
+        num_pos_feats: 64
+        normalize: true
+        scale: null
+        temperature: 10000
+      mask_downsampler:
+        _target_: sam2.modeling.memory_encoder.MaskDownSampler
+        kernel_size: 3
+        stride: 2
+        padding: 1
+      fuser:
+        _target_: sam2.modeling.memory_encoder.Fuser
+        layer:
+          _target_: sam2.modeling.memory_encoder.CXBlock
+          dim: 256
+          kernel_size: 7
+          padding: 3
+          layer_scale_init_value: 1e-6
+          use_dwconv: True  # depth-wise convs
+        num_layers: 2
+
+  num_maskmem: 7
+  image_size: 1024
+  # apply scaled sigmoid on mask logits for memory encoder, and directly feed input mask as output mask
+  sigmoid_scale_for_mem_enc: 20.0
+  sigmoid_bias_for_mem_enc: -10.0
+  use_mask_input_as_output_without_sam: true
+  # Memory
+  directly_add_no_mem_embed: true
+  no_obj_embed_spatial: true
+  # use high-resolution feature map in the SAM mask decoder
+  use_high_res_features_in_sam: true
+  # output 3 masks on the first click on initial conditioning frames
+  multimask_output_in_sam: true
+  # SAM heads
+  iou_prediction_use_sigmoid: True
+  # cross-attend to object pointers from other frames (based on SAM output tokens) in the encoder
+  use_obj_ptrs_in_encoder: true
+  add_tpos_enc_to_obj_ptrs: true
+  proj_tpos_enc_in_obj_ptrs: true
+  use_signed_tpos_enc_to_obj_ptrs: true
+  only_obj_ptrs_in_the_past_for_eval: true
+  # object occlusion prediction
+  pred_obj_scores: true
+  pred_obj_scores_mlp: true
+  fixed_no_obj_ptr: true
+  # multimask tracking settings
+  multimask_output_for_tracking: true
+  use_multimask_token_for_obj_ptr: true
+  multimask_min_pt_num: 0
+  multimask_max_pt_num: 1
+  use_mlp_for_obj_ptr_proj: true
+  # Compilation flag
+  compile_image_encoder: False
@@ -0,0 +1,120 @@
+# @package _global_
+
+# Model
+model:
+  _target_: sam2.modeling.sam2_base.SAM2Base
+  image_encoder:
+    _target_: sam2.modeling.backbones.image_encoder.ImageEncoder
+    scalp: 1
+    trunk:
+      _target_: sam2.modeling.backbones.hieradet.Hiera
+      embed_dim: 144
+      num_heads: 2
+      stages: [2, 6, 36, 4]
+      global_att_blocks: [23, 33, 43]
+      window_pos_embed_bkg_spatial_size: [7, 7]
+      window_spec: [8, 4, 16, 8]
+    neck:
+      _target_: sam2.modeling.backbones.image_encoder.FpnNeck
+      position_encoding:
+        _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
+        num_pos_feats: 256
+        normalize: true
+        scale: null
+        temperature: 10000
+      d_model: 256
+      backbone_channel_list: [1152, 576, 288, 144]
+      fpn_top_down_levels: [2, 3]  # output level 0 and 1 directly use the backbone features
+      fpn_interp_model: nearest
+
+  memory_attention:
+    _target_: sam2.modeling.memory_attention.MemoryAttention
+    d_model: 256
+    pos_enc_at_input: true
+    layer:
+      _target_: sam2.modeling.memory_attention.MemoryAttentionLayer
+      activation: relu
+      dim_feedforward: 2048
+      dropout: 0.1
+      pos_enc_at_attn: false
+      self_attention:
+        _target_: sam2.modeling.sam.transformer.RoPEAttention
+        rope_theta: 10000.0
+        feat_sizes: [64, 64]
+        embedding_dim: 256
+        num_heads: 1
+        downsample_rate: 1
+        dropout: 0.1
+      d_model: 256
+      pos_enc_at_cross_attn_keys: true
+      pos_enc_at_cross_attn_queries: false
+      cross_attention:
+        _target_: sam2.modeling.sam.transformer.RoPEAttention
+        rope_theta: 10000.0
+        feat_sizes: [64, 64]
+        rope_k_repeat: True
+        embedding_dim: 256
+        num_heads: 1
+        downsample_rate: 1
+        dropout: 0.1
+        kv_in_dim: 64
+    num_layers: 4
+
+  memory_encoder:
+      _target_: sam2.modeling.memory_encoder.MemoryEncoder
+      out_dim: 64
+      position_encoding:
+        _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
+        num_pos_feats: 64
+        normalize: true
+        scale: null
+        temperature: 10000
+      mask_downsampler:
+        _target_: sam2.modeling.memory_encoder.MaskDownSampler
+        kernel_size: 3
+        stride: 2
+        padding: 1
+      fuser:
+        _target_: sam2.modeling.memory_encoder.Fuser
+        layer:
+          _target_: sam2.modeling.memory_encoder.CXBlock
+          dim: 256
+          kernel_size: 7
+          padding: 3
+          layer_scale_init_value: 1e-6
+          use_dwconv: True  # depth-wise convs
+        num_layers: 2
+
+  num_maskmem: 7
+  image_size: 1024
+  # apply scaled sigmoid on mask logits for memory encoder, and directly feed input mask as output mask
+  sigmoid_scale_for_mem_enc: 20.0
+  sigmoid_bias_for_mem_enc: -10.0
+  use_mask_input_as_output_without_sam: true
+  # Memory
+  directly_add_no_mem_embed: true
+  no_obj_embed_spatial: true
+  # use high-resolution feature map in the SAM mask decoder
+  use_high_res_features_in_sam: true
+  # output 3 masks on the first click on initial conditioning frames
+  multimask_output_in_sam: true
+  # SAM heads
+  iou_prediction_use_sigmoid: True
+  # cross-attend to object pointers from other frames (based on SAM output tokens) in the encoder
+  use_obj_ptrs_in_encoder: true
+  add_tpos_enc_to_obj_ptrs: true
+  proj_tpos_enc_in_obj_ptrs: true
+  use_signed_tpos_enc_to_obj_ptrs: true
+  only_obj_ptrs_in_the_past_for_eval: true
+  # object occlusion prediction
+  pred_obj_scores: true
+  pred_obj_scores_mlp: true
+  fixed_no_obj_ptr: true
+  # multimask tracking settings
+  multimask_output_for_tracking: true
+  use_multimask_token_for_obj_ptr: true
+  multimask_min_pt_num: 0
+  multimask_max_pt_num: 1
+  use_mlp_for_obj_ptr_proj: true
+  # Compilation flag
+  compile_image_encoder: False