Skip to content

Commit 9383c21

Browse files
shihaobai and sufubao authored
Model support: Qwen3-next and Qwen3.5 (#1233)
Co-authored-by: sufubao <sufubao@sensetime.com>
1 parent 56d9a29 commit 9383c21

153 files changed

Lines changed: 9618 additions & 97 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

lightllm/common/basemodel/basemodel.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111

1212
from lightllm.common.basemodel.layer_weights.hf_load_utils import load_hf_weights
1313
from lightllm.common.basemodel.infer_struct import InferStateInfo
14+
from lightllm.server.router.dynamic_prompt.radix_cache import RadixCache
1415
from lightllm.common.kv_cache_mem_manager import MemoryManager
1516
from lightllm.common.kv_cache_mem_manager.mem_utils import select_mem_manager_class
1617
from lightllm.common.req_manager import ReqManager
@@ -53,6 +54,9 @@ class TpPartBaseModel:
5354
# infer state class
5455
infer_state_class = InferStateInfo
5556

57+
# radix cache class
58+
radix_cache_class = RadixCache
59+
5660
def __init__(self, kvargs):
5761
self.args = get_env_start_args()
5862
self.run_mode = kvargs["run_mode"]

lightllm/common/basemodel/layer_infer/template/transformer_layer_infer_template.py

Lines changed: 28 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -62,20 +62,21 @@ def _ffn(self, input, infer_state: InferStateInfo, layer_weight) -> torch.Tensor
6262
def _tpsp_ffn(self, input, infer_state: InferStateInfo, layer_weight) -> torch.Tensor:
6363
raise Exception("need to impl")
6464

65-
def context_forward(self, input_embdings, infer_state: InferStateInfo, layer_weight):
66-
input1 = self._att_norm(input_embdings, infer_state, layer_weight)
67-
q, cache_kv = self._get_qkv(input1, infer_state, layer_weight)
68-
input1 = None
65+
def context_attention_forward(self, input_embdings, infer_state: InferStateInfo, layer_weight):
66+
q, cache_kv = self._get_qkv(input_embdings, infer_state, layer_weight)
6967
self._post_cache_kv(cache_kv, infer_state, layer_weight)
70-
7168
o = self._context_attention_wrapper_run(
7269
q=q, cache_kv=cache_kv, infer_state=infer_state, layer_weight=layer_weight
7370
)
74-
7571
q = None
7672
o = self._get_o(o, infer_state, layer_weight)
7773
if self.tp_world_size_ > 1:
7874
all_reduce(o, op=dist.ReduceOp.SUM, group=infer_state.dist_group, async_op=False)
75+
return o
76+
77+
def context_forward(self, input_embdings, infer_state: InferStateInfo, layer_weight):
78+
input1 = self._att_norm(input_embdings, infer_state, layer_weight)
79+
o = self.context_attention_forward(input1, infer_state, layer_weight)
7980
input_embdings.add_(o.view(-1, self.embed_dim_))
8081
o = None
8182

@@ -87,39 +88,42 @@ def context_forward(self, input_embdings, infer_state: InferStateInfo, layer_wei
8788
input_embdings.add_(ffn_out.view(-1, self.embed_dim_))
8889
return input_embdings
8990

90-
def token_forward(self, input_embdings, infer_state: InferStateInfo, layer_weight):
91-
input1 = self._att_norm(input_embdings, infer_state, layer_weight)
92-
q, cache_kv = self._get_qkv(input1, infer_state, layer_weight)
93-
input1 = None
91+
def token_attention_forward(self, input_embdings, infer_state: InferStateInfo, layer_weight):
92+
q, cache_kv = self._get_qkv(input_embdings, infer_state, layer_weight)
9493
self._post_cache_kv(cache_kv, infer_state, layer_weight)
9594
o = self._token_attention_kernel(q, infer_state, layer_weight)
9695
q = None
9796
o = self._get_o(o, infer_state, layer_weight)
9897
if self.tp_world_size_ > 1:
9998
all_reduce(o, op=dist.ReduceOp.SUM, group=infer_state.dist_group, async_op=False)
99+
return o
100+
101+
def token_forward(self, input_embdings, infer_state: InferStateInfo, layer_weight):
102+
input1 = self._att_norm(input_embdings, infer_state, layer_weight)
103+
o = self.token_attention_forward(input1, infer_state, layer_weight)
100104
input_embdings.add_(o.view(-1, self.embed_dim_))
101105
o = None
102106

103107
input1 = self._ffn_norm(input_embdings, infer_state, layer_weight)
104108
ffn_out = self._ffn(input1, infer_state, layer_weight)
105-
input1 = None
106109
if self.tp_world_size_ > 1:
107110
all_reduce(ffn_out, op=dist.ReduceOp.SUM, group=infer_state.dist_group, async_op=False)
108111
input_embdings.add_(ffn_out.view(-1, self.embed_dim_))
109112
return input_embdings
110113

111-
def tpsp_context_forward(self, input_embdings: torch.Tensor, infer_state: InferStateInfo, layer_weight):
112-
input1 = self._att_norm(input_embdings, infer_state, layer_weight)
113-
q, cache_kv = self._tpsp_get_qkv(input1, infer_state, layer_weight)
114-
input1 = None
114+
def tpsp_context_attention_forward(self, input_embdings: torch.Tensor, infer_state: InferStateInfo, layer_weight):
115+
q, cache_kv = self._tpsp_get_qkv(input_embdings, infer_state, layer_weight)
115116
self._post_cache_kv(cache_kv, infer_state, layer_weight)
116-
117117
o = self._context_attention_wrapper_run(
118118
q=q, cache_kv=cache_kv, infer_state=infer_state, layer_weight=layer_weight
119119
)
120-
121120
q = None
122121
o = self._tpsp_get_o(o, infer_state, layer_weight)
122+
return o
123+
124+
def tpsp_context_forward(self, input_embdings: torch.Tensor, infer_state: InferStateInfo, layer_weight):
125+
input1 = self._att_norm(input_embdings, infer_state, layer_weight)
126+
o = self.tpsp_context_attention_forward(input1, infer_state, layer_weight)
123127
input_embdings.add_(o.view(-1, self.embed_dim_))
124128
o = None
125129

@@ -129,14 +133,17 @@ def tpsp_context_forward(self, input_embdings: torch.Tensor, infer_state: InferS
129133
input_embdings.add_(ffn_out.view(-1, self.embed_dim_))
130134
return input_embdings
131135

132-
def tpsp_token_forward(self, input_embdings: torch.Tensor, infer_state: InferStateInfo, layer_weight):
133-
input1 = self._att_norm(input_embdings, infer_state, layer_weight)
134-
q, cache_kv = self._tpsp_get_qkv(input1, infer_state, layer_weight)
135-
input1 = None
136+
def tpsp_token_attention_forward(self, input_embdings: torch.Tensor, infer_state: InferStateInfo, layer_weight):
137+
q, cache_kv = self._tpsp_get_qkv(input_embdings, infer_state, layer_weight)
136138
self._post_cache_kv(cache_kv, infer_state, layer_weight)
137139
o = self._token_attention_kernel(q, infer_state, layer_weight)
138140
q = None
139141
o = self._tpsp_get_o(o, infer_state, layer_weight)
142+
return o
143+
144+
def tpsp_token_forward(self, input_embdings: torch.Tensor, infer_state: InferStateInfo, layer_weight):
145+
input1 = self._att_norm(input_embdings, infer_state, layer_weight)
146+
o = self.tpsp_token_attention_forward(input1, infer_state, layer_weight)
140147
input_embdings.add_(o.view(-1, self.embed_dim_))
141148
o = None
142149

lightllm/common/basemodel/layer_weights/meta_weights/__init__.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,16 @@
77
QKVROWNMMWeight,
88
COLMMWeight,
99
)
10-
from .norm_weight import TpRMSNormWeight, RMSNormWeight, LayerNormWeight, NoTpGEMMANormWeight, QKRMSNORMWeight
10+
from .norm_weight import (
11+
TpRMSNormWeight,
12+
RMSNormWeight,
13+
GatedRMSNormWeight,
14+
LayerNormWeight,
15+
NoTpGEMMANormWeight,
16+
QKRMSNORMWeight,
17+
QKGEMMANormWeight,
18+
)
1119
from .embedding_weight import EmbeddingWeight, LMHeadWeight, NoTpPosEmbeddingWeight
1220
from .att_sink_weight import TpAttSinkWeight
1321
from .fused_moe.fused_moe_weight import FusedMoeWeight
22+
from .parameter_weight import ParameterWeight, TpParameterWeight

lightllm/common/basemodel/layer_weights/meta_weights/norm_weight.py

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
from lightllm.common.basemodel.triton_kernel.norm.rmsnorm import rmsnorm_forward
66
from lightllm.common.basemodel.triton_kernel.norm.layernorm import layernorm_forward
77
from lightllm.common.basemodel.triton_kernel.norm.qk_norm import qk_rmsnorm_fused_forward
8+
from lightllm.common.basemodel.triton_kernel.norm.gated_rmsnorm import gated_rmsnorm_forward
89
from .platform_op import PlatformAwareOp
910

1011

@@ -71,6 +72,55 @@ def __call__(
7172
return self._forward(input=input, eps=eps, out=out, alloc_func=alloc_func)
7273

7374

75+
class GatedRMSNormWeight(RMSNormWeight):
    """RMSNorm weight whose normalized output is modulated by a gate tensor.

    Only a Triton kernel exists for this op, so the CUDA and MUSA platform
    hooks both delegate to :meth:`_triton_forward`.
    """

    def _triton_forward(
        self,
        input: torch.Tensor,
        gate_value: torch.Tensor,
        eps: float,
        out: Optional[torch.Tensor] = None,
        alloc_func=torch.empty,
    ) -> torch.Tensor:
        assert (
            input.ndim in [2, 3] and self.weight.ndim == 1
        ), f"input.ndim: {input.ndim} != 2 or weight.ndim: {self.weight.ndim} != 1"
        # Allocate the destination lazily when the caller did not provide one.
        dst = alloc_func(input.shape, dtype=input.dtype, device=input.device) if out is None else out
        return gated_rmsnorm_forward(x=input, weight=self.weight, bias=None, eps=eps, z=gate_value, out=dst)

    def _cuda_forward(
        self,
        input: torch.Tensor,
        gate_value: torch.Tensor,
        eps: float,
        out: Optional[torch.Tensor] = None,
        alloc_func=torch.empty,
    ) -> torch.Tensor:
        # Only the Triton implementation is available on the CUDA platform.
        return self._triton_forward(input=input, gate_value=gate_value, eps=eps, out=out, alloc_func=alloc_func)

    def _musa_forward(
        self,
        input: torch.Tensor,
        gate_value: torch.Tensor,
        eps: float,
        out: Optional[torch.Tensor] = None,
        alloc_func=torch.empty,
    ) -> torch.Tensor:
        # MUSA runs the same Triton kernel.
        return self._triton_forward(input=input, gate_value=gate_value, eps=eps, out=out, alloc_func=alloc_func)

    def __call__(
        self,
        input: torch.Tensor,
        gate_value: torch.Tensor,
        eps: float,
        out: Optional[torch.Tensor] = None,
        alloc_func=torch.empty,
    ) -> torch.Tensor:
        # Dispatch to the platform-specific implementation (PlatformAwareOp).
        return self._forward(input=input, gate_value=gate_value, eps=eps, out=out, alloc_func=alloc_func)
122+
123+
74124
class LayerNormWeight(BaseWeightTpl, PlatformAwareOp):
75125
def __init__(self, dim: int, weight_name: str, data_type: torch.dtype, bias_name: str = None):
76126
super().__init__(tp_rank=0, tp_world_size=1)
@@ -193,6 +243,7 @@ def load_hf_weights(self, weights: Dict[str, torch.Tensor]):
193243
if self.weight_name in weights:
194244
self.weight.copy_(weights[self.weight_name])
195245
self.weight += 1
246+
self.weight.load_ok = True
196247

197248

198249
class QKRMSNORMWeight(BaseWeightTpl, PlatformAwareOp):
@@ -276,3 +327,23 @@ def __call__(
276327
eps: float,
277328
) -> None:
278329
return self._forward(q=q, k=k, eps=eps)
330+
331+
332+
class QKGEMMANormWeight(QKRMSNORMWeight):
    """Fused Q/K RMSNorm using the Gemma weight convention.

    Gemma checkpoints store ``weight - 1``, so ``+1`` is applied at load
    time, and the multiply is done in fp32 (``fp32_multiply=True``).
    """

    def load_hf_weights(self, weights: Dict[str, torch.Tensor]):
        for name, param in ((self.q_weight_name, self.q_weight), (self.k_weight_name, self.k_weight)):
            if name in weights:
                param.copy_(weights[name])
                param += 1  # Gemma stores (weight - 1) in the checkpoint
                param.load_ok = True

    def _triton_forward(self, q: torch.Tensor, k: torch.Tensor, eps: float) -> tuple:
        assert q.ndim == 2 and self.q_weight.ndim == 1
        assert k.ndim == 2 and self.k_weight.ndim == 1
        # Llama computes x.to(float16) * w whereas Gemma computes
        # (x * w).to(float16) — see https://github.com/huggingface/transformers/pull/29402 —
        # so fp32_multiply must be True here.
        return qk_rmsnorm_fused_forward(q=q, k=k, w_q=self.q_weight, w_k=self.k_weight, eps=eps, fp32_multiply=True)
Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,93 @@
1+
import torch
2+
from typing import Dict, Optional, Tuple
3+
from .base_weight import BaseWeightTpl
4+
from lightllm.utils.dist_utils import get_dp_world_size
5+
6+
7+
class ParameterWeight(BaseWeightTpl):
    """A plain (non tensor-parallel) parameter loaded verbatim from a HF checkpoint.

    Destination tensors are pre-allocated from ``weight_shape`` / ``bias_shape``
    and each carries a ``load_ok`` flag that :meth:`verify_load` checks after
    all checkpoint shards have been consumed.
    """

    def __init__(
        self,
        weight_name: str,
        data_type: torch.dtype,
        weight_shape: Optional[Tuple[int, ...]],
        bias_name: Optional[str] = None,
        bias_shape: Optional[Tuple[int, ...]] = None,
    ):
        super().__init__()
        self.weight_name = weight_name
        self.bias_name = bias_name
        self.data_type_ = data_type
        self.weight_shape = weight_shape
        self.bias_shape = bias_shape
        self.weight: Optional[torch.Tensor] = None
        self.bias: Optional[torch.Tensor] = None
        # With weight_shape=None allocation is deferred (e.g. to a subclass).
        if weight_shape is not None:
            self._create_weight()

    def _create_weight(self):
        # Allocate the destination tensors on this rank's device; they are
        # marked not-loaded until load_hf_weights fills them.
        if self.weight_shape is not None:
            self.weight = torch.empty(*self.weight_shape, dtype=self.data_type_, device=self.device_id_)
            self.weight.load_ok = False
        if self.bias_name is not None and self.bias_shape is not None:
            self.bias = torch.empty(*self.bias_shape, dtype=self.data_type_, device=self.device_id_)
            self.bias.load_ok = False

    def load_hf_weights(self, weights: Dict[str, torch.Tensor]) -> None:
        """Copy matching tensors from a checkpoint shard into place.

        Raises an informative AssertionError (instead of an opaque
        AttributeError on None) when a matching key arrives before the
        destination tensor was allocated.
        """
        if self.weight_name in weights:
            assert self.weight is not None, f"destination for {self.weight_name} was never allocated"
            self.weight.copy_(weights[self.weight_name].to(self.data_type_))
            self.weight.load_ok = True
        if self.bias_name is not None and self.bias_name in weights:
            assert self.bias is not None, f"destination for {self.bias_name} was never allocated"
            self.bias.copy_(weights[self.bias_name].to(self.data_type_))
            self.bias.load_ok = True

    def verify_load(self) -> bool:
        """Return True when every allocated tensor has been filled from the checkpoint."""
        for tensor in (self.weight, self.bias):
            if tensor is not None and not getattr(tensor, "load_ok", False):
                return False
        return True
return True
51+
52+
53+
class TpParameterWeight(ParameterWeight):
    """A parameter split along ``dim`` across the ranks of one DP group.

    Each rank allocates only its slice (``n_embed // world_size`` along
    ``dim``) and copies just that slice out of the checkpoint tensor.
    """

    def __init__(
        self,
        weight_name: str,
        data_type: torch.dtype,
        bias_name: Optional[str] = None,
        weight_shape: Optional[Tuple[int, ...]] = None,
        bias_shape: Optional[Tuple[int, ...]] = None,
        dim: int = 0,  # the default split dimension is 0
    ):
        # The full shape must be known up front to compute the TP split;
        # check explicitly so a missing shape fails with a clear message
        # instead of `TypeError: len() of NoneType` from the assert below.
        assert weight_shape is not None, "TpParameterWeight requires an explicit weight_shape"
        assert (
            0 <= dim < len(weight_shape)
        ), f"split dimension: {dim} must be less than the length of weight_shape: {weight_shape}"
        n_embed = weight_shape[dim]
        tp_world_size = get_dp_world_size()
        assert (
            n_embed % tp_world_size == 0
        ), f"weight_shape[{dim}]={weight_shape[dim]} must be divisible by tp_world_size_: {tp_world_size}"
        self.dim = dim
        self.split_n_embed = n_embed // tp_world_size
        tp_weight_shape = weight_shape[:dim] + (self.split_n_embed,) + weight_shape[dim + 1 :]
        tp_bias_shape = None
        if bias_shape is not None:
            tp_bias_shape = bias_shape[:dim] + (self.split_n_embed,) + bias_shape[dim + 1 :]
        super().__init__(weight_name, data_type, tp_weight_shape, bias_name, tp_bias_shape)

    def load_hf_weights(self, weights: Dict[str, torch.Tensor]) -> None:
        """Copy this rank's contiguous slice along the split dimension."""
        start = self.split_n_embed * self.tp_rank_
        if self.weight_name in weights:
            sliced = weights[self.weight_name].narrow(self.dim, start, self.split_n_embed)
            self.weight.copy_(sliced.to(self.data_type_))
            self.weight.load_ok = True
        if self.bias_name is not None and self.bias_name in weights:
            sliced = weights[self.bias_name].narrow(self.dim, start, self.split_n_embed)
            self.bias.copy_(sliced.to(self.data_type_))
            self.bias.load_ok = True

0 commit comments

Comments
 (0)