diff --git a/graph_net/sample_pass/resumable_sample_pass_mixin.py b/graph_net/sample_pass/resumable_sample_pass_mixin.py
index 804005988..7ffb4b2d6 100644
--- a/graph_net/sample_pass/resumable_sample_pass_mixin.py
+++ b/graph_net/sample_pass/resumable_sample_pass_mixin.py
@@ -45,7 +45,7 @@ def resumable_handle_sample(self, rel_model_path: str):
         self._inc_num_handled_models_or_exit()
 
     def _inc_num_handled_models_or_exit(self):
-        if self.config["limits_handled_models"] is None:
+        if self.config.get("limits_handled_models", None) is None:
             return
         self.num_handled_models += 1
         if self.num_handled_models >= self.config["limits_handled_models"]:
diff --git a/graph_net/test/dtype_gen_test.sh b/graph_net/test/dtype_gen_test.sh
index 6a9cd8d82..c3e203781 100755
--- a/graph_net/test/dtype_gen_test.sh
+++ b/graph_net/test/dtype_gen_test.sh
@@ -1,42 +1,45 @@
 #!/bin/bash
 
-GRAPH_NET_ROOT=$(python3 -c "import graph_net; import os; print(
-os.path.dirname(graph_net.__file__))")
-GRAPHNET_ROOT="$GRAPH_NET_ROOT/../"
+GRAPH_NET_ROOT=$(python3 -c "import graph_net; import os; print(os.path.dirname(os.path.dirname(graph_net.__file__)))")
 OUTPUT_DIR="/tmp/dtype_gen_samples"
-mkdir -p "$OUTPUT_DIR"
+
+mkdir -p "$OUTPUT_DIR"
+# Model list to process and the prefix its relative entries are joined onto.
+model_list="${GRAPH_NET_ROOT}/graph_net/config/small10_torch_samples_list.txt"
+model_path_prefix="${GRAPH_NET_ROOT}"
 
 # Step 1: Initialize dtype generalization passes (samples of torchvision)
 python3 -m graph_net.apply_sample_pass \
-    --model-path-list "graph_net/config/small100_torch_samples_list.txt" \
-    --sample-pass-file-path "$GRAPH_NET_ROOT/torch/sample_pass/dtype_generalizer.py" \
+    --use-subprocess \
+    --model-path-list "$model_list" \
+    --sample-pass-file-path "$GRAPH_NET_ROOT/graph_net/torch/sample_pass/dtype_generalizer.py" \
     --sample-pass-class-name InitDataTypeGeneralizationPasses \
     --sample-pass-config $(base64 -w 0 <<EOF
 {
     "dtype_list": ["float16", "bfloat16"],
-    "model_path_prefix": "$GRAPHNET_ROOT",
+    "model_path_prefix": "$model_path_prefix",
     "output_dir": "$OUTPUT_DIR",
     "resume": true,
-    "limits_handled_models": null
+    "limits_handled_models": 10
 }
 EOF
-) 
+)
 
 # Step 2: Apply passes to generate samples
 python3 -m graph_net.apply_sample_pass \
-    --model-path-list "graph_net/config/small100_torch_samples_list.txt" \
-    --sample-pass-file-path "$GRAPH_NET_ROOT/torch/sample_pass/dtype_generalizer.py" \
+    --use-subprocess \
+    --model-path-list "$model_list" \
+    --sample-pass-file-path "$GRAPH_NET_ROOT/graph_net/torch/sample_pass/dtype_generalizer.py" \
     --sample-pass-class-name ApplyDataTypeGeneralizationPasses \
     --sample-pass-config $(base64 -w 0 <<EOF
 {
     "output_dir": "$OUTPUT_DIR",
-    "model_path_prefix": "$GRAPHNET_ROOT",
-    "model_runnable_predicator_filepath": "$GRAPH_NET_ROOT/torch/constraint_util.py",
-    "resume": true,
-    "limits_handled_models": null,
+    "model_path_prefix": "$model_path_prefix",
+    "model_runnable_predicator_filepath": "$GRAPH_NET_ROOT/graph_net/torch/constraint_util.py",
+    "device": "cuda",
+    "resume": false,
+    "limits_handled_models": 10,
     "try_run": true
 }
 EOF
-)
-
-
+)
diff --git a/graph_net/tools/generate_subgraph_dataset.sh b/graph_net/tools/generate_subgraph_dataset.sh
index 6ee1adc40..3ccc381f6 100755
--- a/graph_net/tools/generate_subgraph_dataset.sh
+++ b/graph_net/tools/generate_subgraph_dataset.sh
@@ -26,18 +26,26 @@ GROUPED_FUSIBLE_SUBGRAPH_RANGES_DIR=$DECOMPOSE_WORKSPACE/10_grouped_fusible_subg
 SUBGRAPH_DIMENSION_GENERALIZED_OUTPUT_DIR=$DECOMPOSE_WORKSPACE/11_dimension_generalized_fusible_subgraphs
 RENAMED_DIMENSION_GENERALIZED_FUSIBLE_SUBGRAPH_DIR=$DECOMPOSE_WORKSPACE/12_renamed_dimension_generalized_fusible_subgraphs
 DEDUPLICATED_DIMENSION_GENERALIZED_FUSIBLE_SUBGRAPH_DIR=$DECOMPOSE_WORKSPACE/13_deduplicated_dimension_generalized_fusible_subgraphs
-UNITTESTS_OUTPUT_DIR=$DECOMPOSE_WORKSPACE/14_kernelbench_unittests
-
-DB_PATH=$DECOMPOSE_WORKSPACE/small100_torch_samples.db
+DTYPE_GENERALIZED_OUTPUT_DIR=$DECOMPOSE_WORKSPACE/14_dtype_generalized_fusible_subgraphs
+UNITTESTS_OUTPUT_DIR=$DECOMPOSE_WORKSPACE/15_kernelbench_unittests
 
 mkdir -p "$DECOMPOSE_WORKSPACE"
 
-model_list="$GRAPH_NET_ROOT/graph_net/config/small100_torch_samples_list.txt" 
+model_list="$GRAPH_NET_ROOT/graph_net/config/small100_torch_samples_list.txt"
+DB_PATH=$DECOMPOSE_WORKSPACE/small100_torch_samples.db
+
 device_rewrited_sample_list=${DECOMPOSE_WORKSPACE}/device_rewrited_sample_list.txt
 range_decomposed_subgraph_list=${DECOMPOSE_WORKSPACE}/range_decomposed_subgraph_sample_list.txt
 deduplicated_subgraph_list=${DECOMPOSE_WORKSPACE}/deduplicated_subgraph_sample_list.txt
 dimension_generalized_subgraph_list=${DECOMPOSE_WORKSPACE}/dimension_generalized_subgraph_sample_list.txt
 deduplicated_fusible_subgraphs_list=${DECOMPOSE_WORKSPACE}/deduplicated_dimension_generalized_subgraph_sample_list.txt
+dtype_generalized_subgraphs_list=${DECOMPOSE_WORKSPACE}/dtype_generalized_subgraphs_sample_list.txt
+
+if [[ "$model_list" == *"/torch_samples_list.txt" ]]; then
+    USE_SUBPROCESS_ARGS="--use-subprocess"
+else
+    USE_SUBPROCESS_ARGS=""
+fi
 
 function generate_generalized_subgraph_list() {
     local target_dir="$1"
@@ -63,7 +71,7 @@ function generate_subgraph_list() {
         | tee $sample_list
 }
 
-function grpahsample_insert(){
+function insert_graph_sample(){
     local target_dir="$1"
     local repo_uid="$2"
     local sample_type="$3"
@@ -115,7 +123,7 @@ EOF
 function dimension_generalizer(){
     echo ">>> [2] Apply dimension generalization for samples under ${device_rewrited_sample_list}."
     echo ">>>"
-    python3 -m graph_net.apply_sample_pass \
+    python3 -m graph_net.apply_sample_pass ${USE_SUBPROCESS_ARGS} \
         --model-path-list $device_rewrited_sample_list \
         --sample-pass-file-path "$GRAPH_NET_ROOT/graph_net/dimension_generalizer.py" \
         --sample-pass-class-name "ApplyDimGenPasses" \
@@ -135,7 +143,7 @@ EOF
 function generate_op_names() {
     echo ">>> [3] Generate op_names.txt for samples in ${model_list}."
     echo ">>>"
-    python3 -m graph_net.model_path_handler \
+    python3 -m graph_net.model_path_handler ${USE_SUBPROCESS_ARGS} \
         --model-path-list $model_list \
         --handler-config=$(base64 -w 0 <<EOF
 {
@@ -181,7 +189,7 @@ EOF
 function range_decompose() {
     echo ">>> [5] Decompose according to subgraph_ranges.json for samples in ${device_rewrited_sample_list}."
     echo ">>>"
-    python3 -m graph_net.model_path_handler \
+    python3 -m graph_net.model_path_handler ${USE_SUBPROCESS_ARGS} \
         --model-path-list "$device_rewrited_sample_list" \
         --handler-config=$(base64 -w 0 <<EOF
 {
@@ -295,7 +303,7 @@ function subgraph_dimension_generalizer(){
         echo ">>> Generating dimension generalized subgraph variant index: ${index}"
         dimension_generalized_sample_list="${DIMENSION_GENERALIZED_OUTPUT_DIR}/${index}/dimension_generalized_sample_list.txt"
         generate_subgraph_list ${DIMENSION_GENERALIZED_OUTPUT_DIR}/${index} ${dimension_generalized_sample_list}
-        python3 -m graph_net.model_path_handler \
+        python3 -m graph_net.model_path_handler ${USE_SUBPROCESS_ARGS} \
             --model-path-list "${dimension_generalized_sample_list}" \
             --handler-config $(base64 -w 0 <<EOF
 {
@@ -350,18 +358,39 @@ function remove_duplicate_dimension_generalized_fusible_graphs() {
     done
 }
 
+function dtype_generalizer() {  # Step 12: run ApplyDataTypeGeneralizationPasses over the deduplicated fusible subgraph list, writing under DTYPE_GENERALIZED_OUTPUT_DIR
+    echo ">>> [12] Data type generalizer for samples under ${DEDUPLICATED_DIMENSION_GENERALIZED_FUSIBLE_SUBGRAPH_DIR}."
+    echo ">>>"
+    python3 -m graph_net.apply_sample_pass \
+        --use-subprocess \
+        --model-path-list $deduplicated_fusible_subgraphs_list \
+        --sample-pass-file-path "$GRAPH_NET_ROOT/graph_net/torch/sample_pass/dtype_generalizer.py" \
+        --sample-pass-class-name ApplyDataTypeGeneralizationPasses \
+        --sample-pass-config $(base64 -w 0 <<EOF
+{
+    "output_dir": "$DTYPE_GENERALIZED_OUTPUT_DIR",
+    "model_path_prefix": "$DEDUPLICATED_DIMENSION_GENERALIZED_FUSIBLE_SUBGRAPH_DIR",
+    "model_runnable_predicator_filepath": "$GRAPH_NET_ROOT/graph_net/torch/constraint_util.py",
+    "try_run": false,
+    "device": "cuda",
+    "resume": ${RESUME}
+}
+EOF
+)
+}  # NOTE(review): ${RESUME} is expanded into the JSON above but is not set in the visible part of this script - confirm it is defined (true/false) before this runs
+
 function generate_unittests() {
-    echo ">>> [12] Generate unittests for subgraph samples under ${DEDUPLICATED_DIMENSION_GENERALIZED_FUSIBLE_SUBGRAPH_DIR}. "
+    echo ">>> [13] Generate unittests for subgraph samples under ${DTYPE_GENERALIZED_OUTPUT_DIR}. "
     echo ">>>"
     python3 -m graph_net.model_path_handler \
-        --model-path-list ${deduplicated_fusible_subgraphs_list} \
+        --model-path-list ${dtype_generalized_subgraphs_list} \
         --handler-config=$(base64 -w 0 <<EOF
 {
     "handler_path": "${GRAPH_NET_ROOT}/graph_net/sample_pass/agent_unittest_generator.py",
     "handler_class_name": "AgentUnittestGeneratorPass",
     "handler_config": {
         "framework": "torch",
-        "model_path_prefix": "${DEDUPLICATED_DIMENSION_GENERALIZED_FUSIBLE_SUBGRAPH_DIR}",
+        "model_path_prefix": "${DTYPE_GENERALIZED_OUTPUT_DIR}",
         "output_dir": "${UNITTESTS_OUTPUT_DIR}",
         "device": "cuda",
         "generate_main": false,
@@ -380,8 +409,8 @@ main() {
     suffix="${OP_RANGE}ops_${timestamp}"
 
     # init database
-    python ./sqlite/init_db.py --db_path ${DB_PATH} 2>&1 | tee sqlite/logs/init_db_$(date +"%Y%m%d_%H%M%S").log
-    grpahsample_insert ${GRAPH_NET_ROOT} "github_torch_samples" "full_graph" ${model_list}
+    python ${GRAPH_NET_ROOT}/sqlite/init_db.py --db_path ${DB_PATH} 2>&1 | tee sqlite/logs/init_db_${timestamp}.log
+    insert_graph_sample ${GRAPH_NET_ROOT} "github_torch_samples" "full_graph" ${model_list}
 
     # rewrite the device in model to cuda
     rewrite_device 2>&1 | tee ${DECOMPOSE_WORKSPACE}/log_rewrite_device_${suffix}.txt
@@ -395,11 +424,11 @@ main() {
     generate_split_point 2>&1 | tee ${DECOMPOSE_WORKSPACE}/log_split_point_${suffix}.txt
     range_decompose 2>&1 | tee ${DECOMPOSE_WORKSPACE}/log_range_decompose_${suffix}.txt
     generate_subgraph_list ${RANGE_DECOMPOSE_OUTPUT_DIR} ${range_decomposed_subgraph_list}
-
+    
     rename_decomposed_subgraph 2>&1 | tee ${DECOMPOSE_WORKSPACE}/log_rename_decomposed_subgraph_${suffix}.txt
     remove_duplicate_renamed_graphs 2>&1 | tee ${DECOMPOSE_WORKSPACE}/log_remove_duplicate_renamed_graphs_${suffix}.txt
     generate_subgraph_list ${DEDUPLICATED_OUTPUT_DIR} ${deduplicated_subgraph_list}
-    grpahsample_insert ${DEDUPLICATED_OUTPUT_DIR} "github_torch_samples" "typical_graph" ${deduplicated_subgraph_list}
+    insert_graph_sample ${DEDUPLICATED_OUTPUT_DIR} "github_torch_samples" "typical_graph" ${deduplicated_subgraph_list}
 
     # generate fusible subgraph ranges
     gen_fusible_subgraph_ranges 2>&1 | tee ${DECOMPOSE_WORKSPACE}/log_fusible_subgraphs_${suffix}.txt
@@ -407,14 +436,109 @@ main() {
     # subgraph dimension generalization
     subgraph_dimension_generalizer 2>&1 | tee ${DECOMPOSE_WORKSPACE}/log_subgraph_dimension_generalizer_${suffix}.txt
     generate_generalized_subgraph_list ${SUBGRAPH_DIMENSION_GENERALIZED_OUTPUT_DIR} ${dimension_generalized_subgraph_list}
-
+    
     rename_dimension_generalized_fusible_subgraph 2>&1 | tee ${DECOMPOSE_WORKSPACE}/log_rename_dimension_generalized_subgraph_${suffix}.txt
     remove_duplicate_dimension_generalized_fusible_graphs 2>&1 | tee ${DECOMPOSE_WORKSPACE}/log_remove_duplicate_dimension_generalized_subgraphs_${suffix}.txt
     generate_generalized_subgraph_list ${DEDUPLICATED_DIMENSION_GENERALIZED_FUSIBLE_SUBGRAPH_DIR} ${deduplicated_fusible_subgraphs_list}
-    grpahsample_insert ${DEDUPLICATED_DIMENSION_GENERALIZED_FUSIBLE_SUBGRAPH_DIR} "github_torch_samples" "fusible_graph" ${deduplicated_fusible_subgraphs_list}
+
+    # dtype generalization; the sample list is built from DTYPE_GENERALIZED_OUTPUT_DIR, so the insert uses that same root
+    dtype_generalizer 2>&1 | tee ${DECOMPOSE_WORKSPACE}/log_dtype_generalizer_${suffix}.txt
+    generate_generalized_subgraph_list ${DTYPE_GENERALIZED_OUTPUT_DIR} ${dtype_generalized_subgraphs_list}
+    insert_graph_sample ${DTYPE_GENERALIZED_OUTPUT_DIR} "github_torch_samples" "fusible_graph" ${dtype_generalized_subgraphs_list}
 
     # generate kernelbench format unittest
     generate_unittests 2>&1 | tee ${DECOMPOSE_WORKSPACE}/log_unittests_${suffix}.txt
 }
 
+summary() {  # Print per-step success counts (and percentages where a baseline count exists) for the pipeline outputs.
+    num_original_samples=`grep "^samples/" "$model_list" | wc -l`
+    echo "Number of original graphnet samples: $num_original_samples"
+    # Every percentage guards its divisor: an empty stage reports 0% instead of a bash "division by 0" error.
+    num_device_rewrited_samples=`find ${DEVICE_REWRITED_OUTPUT_DIR} -name "model.py" | wc -l`
+    device_rewrited_successed_percent=$(( num_original_samples > 0 ? num_device_rewrited_samples * 100 / num_original_samples : 0 ))
+    echo "- [Step  1] device rewrite: successed=${num_device_rewrited_samples}, percent=$device_rewrited_successed_percent%"
+    # 9 is the number of dimension-generalization variants (sub-directories 0..8, listed one by one below).
+    num_successed_dimension_generalized_samples=`find ${DIMENSION_GENERALIZED_OUTPUT_DIR} -name "model.py" | wc -l`
+    dimension_generalized_samples_successed_percent=$(( num_original_samples > 0 ? num_successed_dimension_generalized_samples * 100 / (num_original_samples * 9) : 0 ))
+    echo "- [Step  2] dimension generalization: successed=${num_successed_dimension_generalized_samples}, percent=${dimension_generalized_samples_successed_percent}%"
+    for index in {0..8}; do
+        num_successed_dimension_generalized_samples=`find ${DIMENSION_GENERALIZED_OUTPUT_DIR}/${index} -name "model.py" | wc -l`
+        dimension_generalized_samples_successed_percent=$(( num_original_samples > 0 ? num_successed_dimension_generalized_samples * 100 / num_original_samples : 0 ))
+        echo "    ${index}, successed=${num_successed_dimension_generalized_samples}, percent=${dimension_generalized_samples_successed_percent}%"
+    done
+    echo ""
+
+    num_successed_op_names=`find ${OP_NAMES_OUTPUT_DIR} -name op_names.txt | wc -l`
+    op_names_successed_percent=$(( num_original_samples > 0 ? num_successed_op_names * 100 / num_original_samples : 0 ))
+    echo "- [Step  3] generate op names: successed=${num_successed_op_names}, percent=${op_names_successed_percent}%"
+
+    num_typical_subgraph_ranges=`find ${SUBGRAPH_RANGES_JSON_ROOT} -name typical_subgraph_ranges.json | wc -l`
+    typical_subgraph_ranges_successed_percent=$(( num_original_samples > 0 ? num_typical_subgraph_ranges * 100 / num_original_samples : 0 ))
+    echo "- [Step  4] generate typical subgraph ranges: successed=${num_typical_subgraph_ranges}, percent=${typical_subgraph_ranges_successed_percent}%"
+
+    num_successed_range_decomposed_subgraphs=`find ${RANGE_DECOMPOSE_OUTPUT_DIR} -name "model.py" | wc -l`
+    echo "- [Step  5] range decompose: successed=${num_successed_range_decomposed_subgraphs}"
+
+    num_renamed_subgraphs=`find ${GRAPH_VAR_RENAME_OUTPUT_DIR} -name "model.py" | wc -l`
+    echo "- [Step  6] rename: successed=${num_renamed_subgraphs}"
+
+    num_deduplicated_subgraphs=`find ${DEDUPLICATED_OUTPUT_DIR} -name "model.py" | wc -l`
+    echo "- [Step  7] remove duplicated: successed=${num_deduplicated_subgraphs}"
+
+    num_successed_cumsum_kernels_subgraphs=`find ${CUMSUM_NUM_KERNELS_DIR} -name "cumsum_num_kernels.json" | wc -l`
+    cumsum_kernels_successed_percent=$(( num_deduplicated_subgraphs > 0 ? num_successed_cumsum_kernels_subgraphs * 100 / num_deduplicated_subgraphs : 0 ))
+    echo "- [Step  8] cumsum kernels: successed=${num_successed_cumsum_kernels_subgraphs}, percent=${cumsum_kernels_successed_percent}%"
+
+    num_fusible_subgraph_ranges=`find ${FUSIBLE_SUBGRAPH_RANGES_DIR} -name "fusible_subgraph_ranges.json" | wc -l`
+    num_grouped_fusible_subgraph_ranges=`find ${GROUPED_FUSIBLE_SUBGRAPH_RANGES_DIR} -name "grouped_fusible_subgraph_ranges.json" | wc -l`
+    echo "    fusible subgraph ranges: successed=${num_fusible_subgraph_ranges}"
+    echo "    grouped fusible subgraph ranges: successed=${num_grouped_fusible_subgraph_ranges}"
+    echo ""
+
+    num_successed_dimension_generalized_subgraphs=`find ${SUBGRAPH_DIMENSION_GENERALIZED_OUTPUT_DIR} -name "model.py" | wc -l`
+    echo "- [Step  9] subgraph dimension generalization: successed=${num_successed_dimension_generalized_subgraphs}"
+    for index in {0..8}; do
+        num_successed_dimension_generalized_subgraphs=`find ${SUBGRAPH_DIMENSION_GENERALIZED_OUTPUT_DIR}/${index} -name "model.py" | wc -l`
+        echo "    ${index}, successed=${num_successed_dimension_generalized_subgraphs}"
+    done
+    echo ""
+
+    num_renamed_fusible_subgraphs=`find ${RENAMED_DIMENSION_GENERALIZED_FUSIBLE_SUBGRAPH_DIR} -name "model.py" | wc -l`
+    echo "- [Step 10] rename: successed=${num_renamed_fusible_subgraphs}"
+    for index in {0..8}; do
+        num_renamed_fusible_subgraphs_index=`find ${RENAMED_DIMENSION_GENERALIZED_FUSIBLE_SUBGRAPH_DIR}/${index} -name "model.py" | wc -l`
+        echo "    ${index}, successed=${num_renamed_fusible_subgraphs_index}"
+    done
+    echo ""
+
+    num_deduplicated_fusible_subgraphs=`find ${DEDUPLICATED_DIMENSION_GENERALIZED_FUSIBLE_SUBGRAPH_DIR} -name "model.py" | wc -l`
+    echo "- [Step 11] remove duplicated: successed=${num_deduplicated_fusible_subgraphs}"
+    for index in {0..8}; do
+        num_deduplicated_fusible_subgraphs_index=`find ${DEDUPLICATED_DIMENSION_GENERALIZED_FUSIBLE_SUBGRAPH_DIR}/${index} -name "model.py" | wc -l`
+        echo "    ${index}, successed=${num_deduplicated_fusible_subgraphs_index}"
+    done
+    echo ""
+
+    num_dtype_generalized_subgraphs=`find ${DTYPE_GENERALIZED_OUTPUT_DIR} -name "model.py" | wc -l`
+    echo "- [Step 12] dtype generalization: successed=${num_dtype_generalized_subgraphs}"
+    for dtype in float32 float16 bfloat16
+    do
+        num_dtype_generalized_subgraphs_index=`find ${DTYPE_GENERALIZED_OUTPUT_DIR}/${dtype} -name "model.py" | wc -l`
+        echo "    ${dtype}, successed=${num_dtype_generalized_subgraphs_index}"
+    done
+    echo ""
+
+    num_successed_unittests=`find ${UNITTESTS_OUTPUT_DIR} -name "*_test.py" | wc -l`
+    unittest_successed_percent=$(( num_dtype_generalized_subgraphs > 0 ? num_successed_unittests * 100 / num_dtype_generalized_subgraphs : 0 ))
+    echo "- [Step 13] generate unittest: successed=${num_successed_unittests}, percent=${unittest_successed_percent}%"
+    for dtype in float32 float16 bfloat16
+    do
+        num_successed_unittests=`find ${UNITTESTS_OUTPUT_DIR}/${dtype} -name "*_test.py" | wc -l`
+        echo "    ${dtype}, successed=${num_successed_unittests}"
+    done
+}
+
 main
+
+set +x  # disable xtrace (if it was enabled earlier in this script) before printing the summary
+summary 2>&1 | tee ${DECOMPOSE_WORKSPACE}/summary.txt  # show the summary and keep a copy in the workspace
diff --git a/graph_net/torch/sample_pass/dtype_generalizer.py b/graph_net/torch/sample_pass/dtype_generalizer.py
index 48c06803a..9ce86ab2f 100644
--- a/graph_net/torch/sample_pass/dtype_generalizer.py
+++ b/graph_net/torch/sample_pass/dtype_generalizer.py
@@ -17,6 +17,7 @@
 from pathlib import Path
 from typing import Any, Dict, List
 
+import torch
 import torch.fx as fx
 
 from graph_net.graph_net_json_file_util import (
@@ -27,9 +28,6 @@
     update_json,
 )
 from graph_net.torch.constraint_util import RunModelPredicator
-from graph_net.torch.fx_graph_cache_util import (
-    parse_immutable_model_path_into_sole_graph_module,
-)
 from graph_net.torch.fx_graph_serialize_util import serialize_graph_module_to_str
 from graph_net.torch.dtype_gen_passes.pass_mgr import get_dtype_generalization_pass
 from graph_net.torch import utils
@@ -107,7 +105,7 @@ def sample_handled(self, rel_model_path: str) -> bool:
     def __call__(self, model_path: str) -> None:
         self.resumable_handle_sample(model_path)
 
-    def resume(self, model_path: str) -> None:
+    def resume(self, rel_model_path: str) -> None:
         """
         Initialize dtype passes for the given model.
 
@@ -115,8 +113,7 @@ def resume(self, model_path: str) -> None:
-            model_path: Path to the model directory (may be relative to model_path_prefix)
+            rel_model_path: Path to the model directory, relative to model_path_prefix
         """
-        # Apply model_path_prefix if provided
+        # Resolve the sample directory under model_path_prefix (the prefix is always applied)
-        if self.model_path_prefix:
-            model_path = str(Path(self.model_path_prefix) / model_path)
+        model_path = str(Path(self.model_path_prefix) / rel_model_path)
 
         # Parse the computation graph
         module, inputs = get_torch_module_and_inputs(model_path)
@@ -236,9 +233,9 @@ class ApplyDataTypeGeneralizationPasses(SamplePass, ResumableSamplePassMixin):
             "output_dir": "/path/to/output",
             "model_path_prefix": "",
             "model_runnable_predicator_filepath": "...",
-            "resume": ,
-            "limits_handled_models": ,
-            "try_run": ,
+            "resume": true,
+            "limits_handled_models": null,
+            "try_run": true,
         }
     """
 
@@ -268,6 +265,7 @@ def declare_config(
         output_dir: str,
         model_path_prefix: str,
         model_runnable_predicator_filepath: str,
+        device: str = "auto",
         resume: bool = False,
         limits_handled_models: int = None,
         try_run: bool = True,
@@ -281,6 +279,13 @@ def _make_model_runnable_predicator(self, config: Dict[str, Any]):
         predicator_config = self.model_runnable_predicator_config
         return cls(predicator_config)
 
+    def _choose_device(self, device) -> "str | None":  # map the configured device to a concrete one
+        if device is None:  # None is returned unchanged
+            return None
+        if device in ["cpu", "cuda"]:  # an explicit device is used as given
+            return device
+        return "cuda" if torch.cuda.is_available() else "cpu"  # any other value, e.g. the "auto" default
+
     def sample_handled(self, rel_model_path: str) -> bool:
         model_path = Path(self.config["model_path_prefix"]) / rel_model_path
         dtype_pass_names = self._read_dtype_pass_names(model_path)
@@ -309,30 +314,27 @@ def resume(self, rel_model_path: str) -> List[str]:
             List of generated sample directories
         """
         # Apply model_path_prefix if provided
-        if self.model_path_prefix:
-            abs_model_path = str(Path(self.model_path_prefix) / rel_model_path)
+        model_path = str(Path(self.model_path_prefix) / rel_model_path)
 
         # Read pass names from graph_net.json
-        dtype_pass_names = self._read_dtype_pass_names(abs_model_path)
+        dtype_pass_names = self._read_dtype_pass_names(model_path)
+        logging.info(f"Apply {dtype_pass_names=} for {rel_model_path=}")
+
+        # Copy the original float32 sample
+        fp32_output_dir = self._get_output_dir(rel_model_path, "float32")
+        self._copy_sample(rel_model_path, fp32_output_dir)
 
         if not dtype_pass_names:
-            logging.warning(f"No dtype passes found in {abs_model_path}/graph_net.json")
+            logging.warning(f"No dtype passes found in {model_path}/graph_net.json")
             return []
 
         # Parse the computation graph
-        traced_model = parse_immutable_model_path_into_sole_graph_module(abs_model_path)
-
-        # Copy the originl sample
-        files_copied = [
-            "model.py",
-            "graph_hash.txt",
-            "graph_net.json",
-            "weight_meta.py",
-            "input_meta.py",
-            "input_tensor_constraints.py",
-            "subgraph_sources.json",
-        ]
-        self._copy_sample_files(rel_model_path, "float32", files_copied)
+        module, inputs = get_torch_module_and_inputs(
+            model_path, device=self._choose_device(self.config["device"])
+        )
+        traced_model = parse_sole_graph_module(module, inputs)
+
+        ShapeProp(traced_model).propagate(*inputs)
 
         # Generate samples for each pass
         generated_samples = []
@@ -370,13 +372,13 @@ def _read_dtype_pass_names(self, model_path: str) -> List[str]:
         return metadata.get(kDataTypeGeneralizationPasses, [])
 
     def _apply_pass_and_generate(
-        self, model_path: str, traced_model: fx.GraphModule, pass_name: str
+        self, rel_model_path: str, traced_model: fx.GraphModule, pass_name: str
     ) -> str:
         """
         Apply a specific pass and generate a new sample.
 
         Args:
-            model_path: Original model path
+            rel_model_path: Original model path
             traced_model: Original traced model
             pass_name: Name of the pass file (without .py extension),
                        e.g., "dtype_generalization_pass_float16"
@@ -402,42 +404,33 @@ def _apply_pass_and_generate(
         gm_modified = dtype_pass.rewrite(gm_copy)
 
         # Generate output directory
-        output_sample_dir = Path(self.output_dir) / dtype / model_path
-        output_sample_dir.mkdir(parents=True, exist_ok=True)
+        output_dir = self._get_output_dir(rel_model_path, dtype)
+
+        # Copy metadata files of original sample
+        self._copy_sample(rel_model_path, output_dir)
 
-        # Write modified model.py
+        # Update model.py
         model_code = serialize_graph_module_to_str(gm_modified)
-        write_code = utils.apply_templates(model_code)
-        with open(output_sample_dir / "model.py", "w") as f:
-            f.write(write_code)
+        templated_model_code = utils.apply_templates(model_code)
+        (output_dir / "model.py").write_text(templated_model_code)
 
-        # Write modified graph_hash.txt
+        # Update graph_hash.txt
         model_hash = get_sha256_hash(model_code)
-        with open(output_sample_dir / "graph_hash.txt", "w") as f:
-            f.write(model_hash)
-
-        # Copy metadata files
-        files_copied = [
-            "graph_net.json",
-            "weight_meta.py",
-            "input_meta.py",
-            "input_tensor_constraints.py",
-            "subgraph_sources.json",
-        ]
-        self._copy_sample_files(model_path, dtype, files_copied)
+        (output_dir / "graph_hash.txt").write_text(model_hash)
 
         # Update graph_net.json with dtype information
-        self._update_sample_metadata(output_sample_dir, dtype)
+        self._update_sample_metadata(output_dir, dtype)
 
         # Validate generated sample (required - generated code must be runnable)
         if self.try_run:
-            if not self.model_runnable_predicator(str(output_sample_dir)):
-                raise RuntimeError(
-                    f"Generated sample failed validation: {output_sample_dir}"
-                )
-            logging.info(f"Generated sample validated: {output_sample_dir}")
+            if not self.model_runnable_predicator(str(output_dir)):
+                raise RuntimeError(f"Generated sample failed validation: {output_dir}")
+            logging.info(f"Generated sample validated: {output_dir}")
 
-        return str(output_sample_dir)
+        return str(output_dir)
+
+    def _get_output_dir(self, rel_model_path: str, dtype: str) -> Path:  # <output_dir>/<dtype>/<rel_model_path>
+        return Path(self.output_dir) / dtype / rel_model_path
 
     def _update_sample_metadata(self, sample_dir: Path, dtype: str) -> None:
         """
@@ -452,24 +445,17 @@ def _update_sample_metadata(self, sample_dir: Path, dtype: str) -> None:
         update_json(graph_net_json_path, kDtypeGeneralizationPrecision, dtype)
         update_json(graph_net_json_path, kDtypeGeneralizationGenerated, True)
 
-    def _copy_sample_files(
-        self, rel_model_path: str, dtype: str, files_copied: list
-    ) -> None:
+    def _copy_sample(self, rel_model_path: str, output_dir: Path) -> None:
-        """
-        Copy files of sample.
+        """Copy the whole sample directory to output_dir unless it already exists.
 
         Args:
-            rel_model_path: relative model path
+            rel_model_path: Sample path relative to model_path_prefix.
+            output_dir: Destination directory; a str is accepted and wrapped in Path.
         """
-        # Generate output directory
-        output_sample_dir = Path(self.output_dir) / dtype / rel_model_path
-        output_sample_dir.mkdir(parents=True, exist_ok=True)
-
-        # Copy files of original sample
-        for fname in files_copied:
-            src = Path(rel_model_path) / fname
-            if src.exists():
-                shutil.copy(src, output_sample_dir / fname)
+        model_path = str(Path(self.model_path_prefix) / rel_model_path)
+        if not Path(output_dir).exists():
+            logging.info(f"Copy {model_path} -> {output_dir}")
+            shutil.copytree(model_path, output_dir)
 
 
 class MultiDtypeFilter: