 from torch import nn
 from transformers.configuration_utils import PretrainedConfig

-from lmdeploy.pytorch.distributed import get_tp_world_rank
+from lmdeploy.pytorch.distributed import get_dist_manager, get_ep_world_rank, get_tp_world_rank
 from lmdeploy.pytorch.model_inputs import StepContext, StepContextManager
 from lmdeploy.pytorch.nn import ApplyRotaryEmb, Attention, RMSNorm, RopeType, SiluAndMul, build_rotary_embedding
 from lmdeploy.pytorch.nn.linear import build_merged_colwise_linear, build_qkv_proj, build_rowwise_linear
 from lmdeploy.pytorch.nn.moe import SoftmaxTopK, build_fused_moe
 from lmdeploy.pytorch.weight_loader.model_weight_loader import load_weight
+from lmdeploy.utils import get_logger

 from .utils.cudagraph import CudaGraphMixin

+logger = get_logger('lmdeploy')
+
+try:
+    from dlblas.layers.moe import eplb
+    use_dlblas = True
+except Exception:
+    use_dlblas = False
+    logger.warning('For higher performance, please install dlBLAS https://github.com/DeepLink-org/dlBLAS')
+

 class Qwen3MoeAttention(nn.Module):
     """Rewrite module of Qwen3MoeAttention."""
@@ -199,7 +209,13 @@ def __init__(self,

         world_size, _ = get_tp_world_rank()
         _all_reduce = world_size > 1
-
+        if get_dist_manager().current_context().dist_config.enable_eplb:
+            dist_ctx = get_dist_manager().current_context()
+            self.eplb_dispatch_info = eplb.EPLBDispatchInfo.init_new(
+                ep_rank=dist_ctx.ep_rank,
+                layer_idx=layer_idx,
+            )
+            self.num_experts = eplb.get_global_eplb_metadata().num_physical_experts()
         self.experts = build_fused_moe(
             self.hidden_dim,
             self.ffn_dim,
@@ -210,16 +226,17 @@ def __init__(self,
             device=device,
             quant_config=quantization_config,
             all_reduce=_all_reduce,
+            layer_idx=layer_idx,
         )

     def forward(self, hidden_states: torch.Tensor):
         """forward."""
         batch_size, sequence_length, hidden_dim = hidden_states.shape
         hidden_states = hidden_states.view(-1, hidden_dim)
         router_logits = self.gate(hidden_states)
-
         topk_weights, topk_ids = self.softmax_topk(router_logits)
-
+        if get_dist_manager().current_context().dist_config.enable_eplb:
+            topk_ids = eplb.topk_ids_logical_to_physical(topk_ids, self.eplb_dispatch_info)
         out_states = self.experts(
             hidden_states,
             topk_weights,
@@ -307,6 +324,15 @@ def __init__(self, config: PretrainedConfig, dtype: torch.dtype = None, device:
                                          dtype=dtype,
                                          device=device)

+        if get_dist_manager().current_context().dist_config.enable_eplb:
+            if not use_dlblas:
+                raise ImportError('To enable eplb, please install dlBLAS https://github.com/DeepLink-org/dlBLAS')
+            ep_size, _ = get_ep_world_rank()
+            eplb.init_global_eplb_metadata(
+                ep_size=ep_size,
+                num_routed_experts=config.num_experts,
+                num_hidden_layers=config.num_hidden_layers,
+            )
         # build all decode layers
         self.layers = nn.ModuleList([
             Qwen3MoeDecoderLayer(config, layer_idx, dtype=dtype, device=device)
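
Note on the new `eplb.topk_ids_logical_to_physical` call: with EPLB enabled, each logical routed expert may be replicated into several physical expert slots spread across the EP ranks, so the router's logical top-k ids have to be remapped to physical slot ids before tokens are dispatched to `self.experts`. The snippet below is only a minimal sketch of that remapping under an assumed `log2phy` table and `replica_count` vector; it is not the dlBLAS implementation, and all names in it are illustrative.

```python
import torch


def topk_ids_logical_to_physical_sketch(topk_ids: torch.Tensor, log2phy: torch.Tensor,
                                        replica_count: torch.Tensor) -> torch.Tensor:
    """Illustrative logical->physical expert-id remapping (not the dlBLAS API).

    topk_ids:      [num_tokens, top_k] logical expert ids from the router
    log2phy:       [num_logical_experts, max_replicas] physical slot id of each replica
    replica_count: [num_logical_experts] number of valid replicas per logical expert
    """
    # Pick one replica per (token, expert) so tokens routed to a hot expert are
    # spread over its copies instead of all hitting the same physical slot.
    replica_idx = torch.randint_like(topk_ids, 2**31) % replica_count[topk_ids]
    return log2phy[topk_ids, replica_idx]
```

In the diff above, the equivalent bookkeeping lives in `eplb.EPLBDispatchInfo` and `eplb.get_global_eplb_metadata()`, which is also why `self.num_experts` is overwritten with the number of physical experts before `build_fused_moe` is called.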