From 7da1db123bdac16e189a7d7e231d0670ebd16954 Mon Sep 17 00:00:00 2001
From: wangyuwen1999 <3151160463@qq.com>
Date: Thu, 5 Feb 2026 23:10:57 +0800
Subject: [PATCH] add md5sum print to model runner

---
 fastdeploy/envs.py                    |  3 +++
 fastdeploy/worker/gpu_model_runner.py | 29 +++++++++++++++++++++++++++++
 2 files changed, 32 insertions(+)

diff --git a/fastdeploy/envs.py b/fastdeploy/envs.py
index 47c35787810..835327ab696 100644
--- a/fastdeploy/envs.py
+++ b/fastdeploy/envs.py
@@ -200,6 +200,9 @@
     "FD_WORKER_ALIVE_TIMEOUT": lambda: int(os.getenv("FD_WORKER_ALIVE_TIMEOUT", "30")),
     # File path for file storage backend
     "FILE_BACKEND_STORAGE_DIR": lambda: str(os.getenv("FILE_BACKEND_STORAGE_DIR", "/tmp/fastdeploy")),
+
+    # Whether to log each model parameter's MD5 checksum and dump them to a JSON file after loading
+    "MD5SUM_PRINT": lambda: bool(int(os.getenv("MD5SUM_PRINT", "0"))),
 }
 
 
diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py
index 94813313a3f..263376ebf4d 100644
--- a/fastdeploy/worker/gpu_model_runner.py
+++ b/fastdeploy/worker/gpu_model_runner.py
@@ -1399,6 +1399,35 @@ def load_model(self) -> None:
                 )
             else:
                 self.dynamic_weight_manager = DynamicWeightManager(self.fd_config, self.model, self.local_rank)
+
+        # Optionally log and dump the MD5 checksum of every loaded parameter.
+        if envs.MD5SUM_PRINT:
+            logger.info("MD5SUM_PRINT is set to True, saving model md5sum...")
+            self._save_model_md5sum()
+
+    def _save_model_md5sum(self) -> None:
+        """Log the MD5 checksum of every model parameter and dump them to JSON.
+
+        One checksum per ``self.model.state_dict()`` entry is computed with
+        ``get_tensor_md5`` and logged. The full mapping is then written to
+        ``md5sum_rank_<rank>.json`` under ``envs.FD_LOG_DIR``, where ``<rank>``
+        is ``paddle.distributed.get_rank()``, so each rank writes its own file.
+        """
+        md5sum_dict = {}
+        for key, param in self.model.state_dict().items():
+            md5sum_dict[key] = get_tensor_md5(param)
+            logger.info(f">>>>>>>>>> {key} : {md5sum_dict[key]}")
+
+        file_name = f"md5sum_rank_{paddle.distributed.get_rank()}.json"
+        output_dir = envs.FD_LOG_DIR
+        # exist_ok=True already tolerates an existing directory, so no
+        # separate (race-prone) os.path.exists() check is needed.
+        os.makedirs(output_dir, exist_ok=True)
+        # Mode "w" overwrites any dump left by a previous run.
+        with open(os.path.join(output_dir, file_name), "w", encoding="utf-8") as f:
+            # indent=4 keeps the JSON human-readable; ensure_ascii=False writes
+            # non-ASCII characters in parameter names verbatim instead of escaping.
+            json.dump(md5sum_dict, f, indent=4, ensure_ascii=False)
 
     def get_model(self) -> nn.Layer:
         """Get current model"""