Add FP8 MoE for turbomind #3601

Merged
merged 167 commits into from Jun 13, 2025
Changes from 1 commit
167 commits
404263d
low level abstraction
lzhangzz Mar 27, 2025
81bfa75
refactor
lzhangzz Apr 2, 2025
770b85d
eliminate template
lzhangzz Apr 7, 2025
e3a9619
remove unused
lzhangzz Apr 7, 2025
6b9a433
refactor bindings
lzhangzz Apr 7, 2025
613aeec
simplify lm head
lzhangzz Apr 7, 2025
e3fe34c
refactor weight
lzhangzz Apr 8, 2025
1e057d1
fix tp
lzhangzz Apr 8, 2025
40e9097
cublas
lzhangzz Apr 8, 2025
6fc9cc9
Merge remote-tracking branch 'origin/main' into core
lzhangzz Apr 9, 2025
ff3b5f7
refactor sampling
lzhangzz Apr 10, 2025
06ff641
remove unused
lzhangzz Apr 10, 2025
14a7f45
simplify
lzhangzz Apr 11, 2025
096155c
fix AWQ support
lzhangzz Apr 11, 2025
5fd35ae
fix moe
lzhangzz Apr 11, 2025
0c5ef46
fix nccl lm_head
lzhangzz Apr 11, 2025
c2020b2
fix
lzhangzz Apr 11, 2025
510675c
refactor data types
lzhangzz Apr 15, 2025
00b121e
skip legacy ut
lzhangzz Apr 15, 2025
88d17d4
simplify
lzhangzz Apr 15, 2025
699c24f
rename data types
lzhangzz Apr 15, 2025
3ffd070
refactor
lzhangzz Apr 15, 2025
eed6bfb
refactor runtime states
lzhangzz Apr 16, 2025
d2ec3af
fix msvc build
lzhangzz Apr 16, 2025
2529631
fix msvc build
lzhangzz Apr 16, 2025
1d77856
fix msvc build
lzhangzz Apr 16, 2025
6e728cf
fix msvc build
lzhangzz Apr 16, 2025
1b6a80d
fix msvc build
lzhangzz Apr 16, 2025
0d976d3
fix msvc build
lzhangzz Apr 16, 2025
18e7602
fix msvc build
lzhangzz Apr 16, 2025
7fec496
fix msvc build
lzhangzz Apr 16, 2025
69b1841
fix msvc build
lzhangzz Apr 16, 2025
c8bc36d
format
lzhangzz Apr 16, 2025
8161c0d
remove unused
lzhangzz Apr 16, 2025
7459992
fix msvc build
lzhangzz Apr 16, 2025
d38421f
fix msvc build
lzhangzz Apr 16, 2025
7d6ab03
fix msvc build
lzhangzz Apr 16, 2025
b214a0e
fix msvc build
lzhangzz Apr 16, 2025
3ab38ca
fix msvc build
lzhangzz Apr 16, 2025
f394ef0
fix msvc build
lzhangzz Apr 16, 2025
105f1cc
fix msvc build
lzhangzz Apr 16, 2025
5ccf30c
fix msvc build
lzhangzz Apr 16, 2025
b59620c
fix msvc build
lzhangzz Apr 16, 2025
a243da0
fix msvc build
lzhangzz Apr 16, 2025
42172d3
fix msvc build
lzhangzz Apr 16, 2025
4d9910a
fix msvc build
lzhangzz Apr 16, 2025
bf7c213
fix msvc build
lzhangzz Apr 16, 2025
8651edd
fix msvc build
lzhangzz Apr 16, 2025
4788b80
fix msvc build
lzhangzz Apr 16, 2025
529225d
fix msvc build
lzhangzz Apr 16, 2025
6fd5f72
fix msvc build
lzhangzz Apr 17, 2025
86d4e86
fix msvc build
lzhangzz Apr 17, 2025
8ea7e20
fix ut & msvc build
lzhangzz Apr 17, 2025
dcf9669
fix ut & msvc build
lzhangzz Apr 17, 2025
7f8974b
fix gcc build
lzhangzz Apr 17, 2025
646813b
fix lint & ut
lzhangzz Apr 17, 2025
98f4840
fix lint
lzhangzz Apr 17, 2025
ea07957
fetch Catch2 when building tests
lzhangzz Apr 17, 2025
5d69923
rewind msvc build
lzhangzz Apr 17, 2025
d0079b5
fix sampling
lzhangzz Apr 18, 2025
15b4007
fp8 round trip test
lzhangzz Apr 21, 2025
72fa3ee
pseudo quant test
lzhangzz Apr 22, 2025
c4de357
initial sm90 gemm kernel
lzhangzz Apr 24, 2025
26b270a
optimize smem desc
lzhangzz Apr 24, 2025
68ac168
multiple warp groups
lzhangzz Apr 24, 2025
4e573c5
optimize smem desc
lzhangzz Apr 24, 2025
4b801eb
flush TMA ops
lzhangzz Apr 24, 2025
97407ad
TMA epilogue
lzhangzz Apr 24, 2025
a97a3bb
clean-up
lzhangzz Apr 24, 2025
b26d7c8
launch config & fence operand
lzhangzz Apr 24, 2025
24520ee
tuning
lzhangzz Apr 24, 2025
eb7a50e
minor
lzhangzz Apr 24, 2025
c477918
TMA multicast
lzhangzz Apr 25, 2025
04257a6
pipeline
lzhangzz Apr 25, 2025
da5b22d
warp specialization
lzhangzz Apr 25, 2025
c728db2
persistent kernel
lzhangzz Apr 27, 2025
7e03761
better TMA multicast
lzhangzz Apr 28, 2025
4fb3799
initial fp8 gemm
lzhangzz Apr 29, 2025
27f1d29
optimize
lzhangzz Apr 29, 2025
876fbe7
optimize
lzhangzz Apr 30, 2025
316f2f8
cluster boundary check
lzhangzz Apr 30, 2025
fb9dd9a
slow
lzhangzz Apr 30, 2025
dc6f660
revert
lzhangzz Apr 30, 2025
fd515e1
v2
lzhangzz Apr 30, 2025
f0e4e4e
optimize
lzhangzz May 1, 2025
4bb1033
fix scaling
lzhangzz May 6, 2025
87ab9df
fix scaling
lzhangzz May 6, 2025
c1ec1de
optimize
lzhangzz May 7, 2025
45b8218
TMA store swizzle
lzhangzz May 7, 2025
26b6e21
better TMA store swizzle
lzhangzz May 8, 2025
49e6ac9
clean up
lzhangzz May 8, 2025
c2e3564
prefetch U
lzhangzz May 8, 2025
42571c2
prefetch V
lzhangzz May 8, 2025
108e2aa
fix V for multi wg
lzhangzz May 9, 2025
89fbbb7
fix U stride for TMA
lzhangzz May 9, 2025
464a2bc
qwen3 dense fp8
lzhangzz May 13, 2025
0653c65
fix tma multicast
lzhangzz May 13, 2025
c14fcc4
fix producer register count
lzhangzz May 13, 2025
dfbae20
decouple epilogue
lzhangzz May 13, 2025
731f2c5
warpspecialized pingpong
lzhangzz May 15, 2025
77ea397
fix multicast
lzhangzz May 15, 2025
2d991a1
cluster layout
lzhangzz May 15, 2025
86344c7
update
lzhangzz May 16, 2025
56e181b
optimize
lzhangzz May 16, 2025
0a422d3
larger tiles
lzhangzz May 16, 2025
a004cd7
multicast U
lzhangzz May 19, 2025
07c181d
rename
lzhangzz May 19, 2025
b18f336
v3
lzhangzz May 19, 2025
88018a6
optimize v3
lzhangzz May 19, 2025
5959297
tune
lzhangzz May 20, 2025
a3022a3
`tensormap.replace`
lzhangzz May 21, 2025
7eda414
refactor
lzhangzz May 22, 2025
2dc68c2
v4
lzhangzz May 23, 2025
5ccf1d8
init moe support
lzhangzz May 26, 2025
d98be0f
fix
lzhangzz May 27, 2025
2de6d70
fix v stride
lzhangzz May 27, 2025
5ef84ee
fix empty tile & unaligned U
lzhangzz May 27, 2025
7eb5f69
multicast schedule
lzhangzz May 27, 2025
bac2803
scheduling
lzhangzz May 28, 2025
062872d
fix scheduling
lzhangzz May 28, 2025
493ab50
tune
lzhangzz May 29, 2025
e3f3818
fix non-grouped gemm
lzhangzz May 29, 2025
e6119a3
tune group gemm
lzhangzz May 29, 2025
cfacea2
load weight by pointers
lzhangzz May 29, 2025
90ed985
fp8 moe
lzhangzz May 30, 2025
4a58024
Merge remote-tracking branch 'origin/main' into gemm3
lzhangzz May 30, 2025
036d67a
switch to https git
lzhangzz Jun 2, 2025
8474114
fix cutlass tag
lzhangzz Jun 2, 2025
e4b4c49
90 -> 90a
lzhangzz Jun 2, 2025
75eb1ed
fix missing headers
lzhangzz Jun 2, 2025
3493b2d
fix cuda-11
lzhangzz Jun 2, 2025
ce5ac45
guard sm90
lzhangzz Jun 2, 2025
863be32
update
lzhangzz Jun 2, 2025
11289e4
v5
lzhangzz Jun 2, 2025
d31e65d
update
lzhangzz Jun 2, 2025
2fa2ad5
update
lzhangzz Jun 2, 2025
19b93d2
v5
lzhangzz Jun 3, 2025
e1dd6e5
schedule
lzhangzz Jun 5, 2025
887f0db
update
lzhangzz Jun 5, 2025
fb551fe
optimize
lzhangzz Jun 5, 2025
dae5ec9
optimize
lzhangzz Jun 5, 2025
d2aeba0
fix multicast
lzhangzz Jun 5, 2025
a84da18
refactor
lzhangzz Jun 6, 2025
6ebfc5d
refactor
lzhangzz Jun 6, 2025
091abd4
fix performance regression
lzhangzz Jun 9, 2025
4ae99d6
v5
lzhangzz Jun 9, 2025
f376d47
dispatch cluster shape
lzhangzz Jun 9, 2025
e6782a0
optimize
lzhangzz Jun 9, 2025
8f5f9d7
fix sm count
lzhangzz Jun 9, 2025
a573447
guard CUDA version
lzhangzz Jun 9, 2025
b6ea7cd
guard CUDA version
lzhangzz Jun 9, 2025
56e6076
guard CUDA version
lzhangzz Jun 10, 2025
dbc9013
guard CUDA version
lzhangzz Jun 10, 2025
2dc4494
fix
lzhangzz Jun 10, 2025
4168387
fix CUDA version guard
lzhangzz Jun 10, 2025
28972e0
fix
lzhangzz Jun 10, 2025
9dbaa2f
fix MSVC build
lzhangzz Jun 10, 2025
0a0a7a7
fix
lzhangzz Jun 10, 2025
c9a742b
register all kernels
lzhangzz Jun 10, 2025
307202c
refactor
lzhangzz Jun 10, 2025
48b8035
optimize
lzhangzz Jun 10, 2025
67f0615
build with cuda-12.4
lzhangzz Jun 11, 2025
73b2dec
Merge remote-tracking branch 'origin/main' into gemm3
lzhangzz Jun 12, 2025
89ec814
fix lint
lzhangzz Jun 12, 2025
fdb7cfb
fix lint
lzhangzz Jun 12, 2025
3d3f03e
fix lint
lzhangzz Jun 12, 2025
d9586ea
disable debug log
lzhangzz Jun 12, 2025
refactor weight
lzhangzz committed Apr 8, 2025
commit e3fe34c97b4de16332773fb93c9bcbc58691471a
1 change: 1 addition & 0 deletions src/turbomind/core/CMakeLists.txt
@@ -13,6 +13,7 @@ add_library(core STATIC
layout.cc
tensor.cc
tensor.cu
module.cc
typecvt.cc)
target_link_libraries(core PRIVATE CUDA::cudart CUDA::cuda_driver)
set_property(TARGET core PROPERTY POSITION_INDEPENDENT_CODE ON)
78 changes: 78 additions & 0 deletions src/turbomind/core/module.cc
@@ -0,0 +1,78 @@

#include "src/turbomind/core/module.h"
#include "src/turbomind/core/check.h"
#include <optional>

namespace turbomind::core {

Module::Module(): parent_{} {}

Module::~Module()
{
if (parent_) {
parent_->remove_module(*this);
parent_ = {};
}
}

void Module::register_module(std::string name, Module& module, std::optional<int> index)
{
module.parent_ = this;
if (index) {
name += ".";
name += std::to_string(*index);
}
// std::cout << "register Module " << name << " " << &module << ", parent " << this << "\n";
modules_.emplace_back(std::move(name), &module);
}

void Module::register_parameter(std::string name, Tensor& param)
{
// std::cout << "register Parameter " << name << " " << &param << " " << param.layout() << "\n";
params_.emplace_back(std::move(name), &param);
}

void Module::remove_module(Module& module)
{
for (auto it = modules_.begin(); it != modules_.end(); ++it) {
if (it->second == &module) {
// std::cout << "erase " << it->first << " " << &module << " from " << this << "\n";
modules_.erase(it);
return;
}
}
TM_CHECK(0) << "module " << &module << " not found";
}

void Module::remove_parameter(Tensor& param)
{
for (auto it = params_.begin(); it != params_.end(); ++it) {
if (it->second == &param) {
params_.erase(it);
return;
}
}
TM_CHECK(0) << "param " << &param << " not found";
}

TensorMap Module::get_parameters() const
{
TensorMap m;
get_parameters_impl({}, m);
return m;
}

void Module::get_parameters_impl(std::string prefix, TensorMap& m) const
{
if (!prefix.empty()) {
prefix += ".";
}
for (const auto& [k, v] : params_) {
m.emplace(prefix + k, *v);
}
for (const auto& [k, v] : modules_) {
v->get_parameters_impl(prefix + k, m);
}
}

} // namespace turbomind::core
36 changes: 36 additions & 0 deletions src/turbomind/core/module.h
@@ -0,0 +1,36 @@

#include "src/turbomind/core/tensor.h"

namespace turbomind::core {

class Module {
public:
virtual ~Module();

Module();

Module(const Module&) = delete;
Module& operator=(const Module&) = delete;

Module(Module&&) noexcept = delete;
Module& operator=(Module&&) noexcept = delete;

void register_module(std::string name, Module& module, std::optional<int> index = {});
void register_parameter(std::string name, Tensor& param);

void remove_module(Module& module);
void remove_parameter(Tensor& param);

TensorMap get_parameters() const;

private:
void get_parameters_impl(std::string prefix, TensorMap& m) const;

protected:
Module* parent_;

std::vector<std::pair<std::string, Module*>> modules_;
std::vector<std::pair<std::string, Tensor*>> params_;
};

} // namespace turbomind::core
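
For orientation, here is a minimal usage sketch of the registration API introduced above; it is illustrative and not part of the diff. The struct names, dimensions, and dtype parameter are invented, and the core::Tensor construction follows the (shape, dtype, MEMORY_GPU) pattern used elsewhere in this commit.

#include "src/turbomind/core/module.h"

namespace turbomind {

// Sketch only: a leaf module that owns a single parameter, mirroring how
// LlamaDenseWeight registers its tensors later in this commit.
struct ExampleLinear: core::Module {
    ExampleLinear(int in, int out, DataType dtype)
    {
        weight = core::Tensor{{in, out}, dtype, MEMORY_GPU};
        register_parameter("weight", weight);
    }
    core::Tensor weight;
};

// A parent module registering the leaf with an index, the way the model code
// uses tp_rank and layer indices; the index is appended to the name ("proj.0").
struct ExampleBlock: core::Module {
    ExampleBlock(int dim, DataType dtype): proj{dim, dim, dtype}
    {
        register_module("proj", proj, /*index=*/0);
    }
    ExampleLinear proj;
};

// core::Module::get_parameters() walks the tree and joins names with '.', so an
// ExampleBlock registered by its owner via register_module("layers", block, 3)
// exposes "layers.3.proj.0.weight" in the returned TensorMap.

}  // namespace turbomind
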
9 changes: 5 additions & 4 deletions src/turbomind/kernels/norm/rms_norm.cu
@@ -84,11 +84,12 @@ __global__ void RMSNorm(T* dst,

} // namespace kernel

void invokeRMSNorm(core::Tensor& out, const core::Tensor& x, const void* w, float eps, cudaStream_t st)
void invokeRMSNorm(core::Tensor& out, const core::Tensor& x, const core::Tensor& w, float eps, cudaStream_t st)
{
TM_CHECK(x.ndim() == 2);
TM_CHECK(out.shape() == x.shape());
TM_CHECK(out.dtype() == x.dtype());
TM_CHECK(w.dtype() == x.dtype() && w.shape(-1) == x.shape(-1));

if (x.size() == 0) {
return;
@@ -108,7 +109,7 @@ void invokeRMSNorm(core::Tensor& out, const core::Tensor& x, const void* w, floa
out.stride(0),
(const T*)x.raw_data(),
x.stride(0),
(const T*)w,
(const T*)w.raw_data(),
dim,
num,
eps,
@@ -227,7 +228,7 @@ void invokeQkRMSNorm(void* data,
}
}

void invokeRMSNormQK(core::Tensor& x, const void* w, float eps, cudaStream_t st)
void invokeRMSNormQK(core::Tensor& x, const core::Tensor& w, float eps, cudaStream_t st)
{
TM_CHECK(x.ndim() == 3);

@@ -253,7 +254,7 @@ void invokeRMSNormQK(core::Tensor& x, const void* w, float eps, cudaStream_t st)
const int grid_dim = cdiv(threads, block_dim);

kernel::RMSNormQK<T, float, vec_size, max_dim><<<grid_dim, block_dim, 0, st>>>(
(T*)data, stride, (const T*)w, head_dim, head_num, token_num, eps, 1.f / head_dim);
(T*)data, stride, (const T*)w.raw_data(), head_dim, head_num, token_num, eps, 1.f / head_dim);
};

constexpr constant<128> max_dim{};
14 changes: 2 additions & 12 deletions src/turbomind/kernels/norm/rms_norm.h
@@ -8,19 +8,9 @@

namespace turbomind {

void invokeRMSNorm(core::Tensor& out, const core::Tensor& x, const void* w, float eps, cudaStream_t st);
void invokeRMSNorm(core::Tensor& out, const core::Tensor& x, const core::Tensor& w, float eps, cudaStream_t st);

inline void invokeRMSNorm(core::Tensor& out, const core::Tensor& x, const core::Buffer& w, float eps, cudaStream_t st)
{
return invokeRMSNorm(out, x, w.raw_data(), eps, st);
}

void invokeRMSNormQK(core::Tensor& x, const void* w, float eps, cudaStream_t st);

inline void invokeRMSNormQK(core::Tensor& x, const core::Buffer& w, float eps, cudaStream_t st)
{
return invokeRMSNormQK(x, w.raw_data(), eps, st);
}
void invokeRMSNormQK(core::Tensor& x, const core::Tensor& w, float eps, cudaStream_t st);

template<class T>
void invokeBiasResidualRMSNorm(
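
A hedged call sketch for the new Tensor-based signature: the weight now carries its own dtype and shape, so the checks added in rms_norm.cu (w.dtype() == x.dtype(), w.shape(-1) == x.shape(-1)) can run before launch. The function name, dimensions, dtype, stream, and allocation context below are assumptions of the sketch, not part of the change.

#include "src/turbomind/kernels/norm/rms_norm.h"

namespace turbomind {

// Sketch only: dims, dtype, and stream come from the caller; tensors are
// allocated with the (shape, dtype, MEMORY_GPU) pattern used in this commit.
void rmsnorm_call_example(int token_num, int hidden_dim, DataType dtype, cudaStream_t stream)
{
    core::Tensor x{{token_num, hidden_dim}, dtype, MEMORY_GPU};
    core::Tensor w{{hidden_dim}, dtype, MEMORY_GPU};
    core::Tensor out{{token_num, hidden_dim}, dtype, MEMORY_GPU};

    // Previously the weight was passed as `const void*` (or via the removed
    // core::Buffer overload); now it travels as a typed core::Tensor.
    invokeRMSNorm(out, x, w, 1e-6f, stream);
}

}  // namespace turbomind
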
1 change: 1 addition & 0 deletions src/turbomind/models/llama/CMakeLists.txt
@@ -13,6 +13,7 @@ add_library(Llama STATIC
BlockTrie.cc
SequenceManager.cc
LlamaWeight.cc
LlamaDenseWeight.cc
LlamaDecoderLayerWeight.cc
LlamaFfnLayer.cc
moe_ffn_layer.cc
285 changes: 75 additions & 210 deletions src/turbomind/models/llama/LlamaDecoderLayerWeight.cc
@@ -74,91 +74,53 @@ LlamaDecoderLayerWeight::LlamaDecoderLayerWeight(DataType data_type,
mlp_tp_size_(engine.mlp_tp_size),
mlp_tp_rank_(engine.mlp_tp_rank)
{
self_attn_weights = LlamaAttentionWeight{hidden_units_,
size_per_head_,
head_num_,
kv_head_num_,
model.mla,
attn_bias_,
model.qk_norm,
attn_tp_size_,
data_type_,
weight_type_,
model.group_size};

ffn_weights = LlamaFfnWeight{
hidden_units_,
inter_size_,
mlp_tp_size_,
data_type_,
weight_type_,
model.group_size,
weight_type_ == TYPE_UINT4 && is_fuse_silu_act(),
};

moe_weights = MoeFfnWeight{layer_id,
moe_param,
hidden_units_,
data_type_,
weight_type_,
model.group_size,
mlp_tp_size_,
is_fuse_silu_act()};

if (lora_param.policy == LoraPolicy::kPlora) {
std::vector<std::string> keys = {
"attention.w_qkv", "attention.wo", "feed_forward.w1", "feed_forward.w2", "feed_forward.w3"};
std::vector<LlamaDenseWeight*> weights = {&self_attn_weights.qkv,
&self_attn_weights.output,
&ffn_weights.gating,
&ffn_weights.output,
&ffn_weights.intermediate};
for (int i = 0; i < keys.size(); i++) {
const auto& name = keys[i];
auto& weight = *weights[i];
int rank = lora_param.r;
float scale = lora_param.scale;
std::string full_name = "layers." + std::to_string(layer_id) + "." + name;

for (const auto& [re, pr] : lora_param.rank_pattern) {
if (std::regex_search(full_name, pr.first)) {
rank = pr.second;
TM_LOG_DEBUG("find rank, pattern=%s, name=%s, value=%d", re.c_str(), full_name.c_str(), rank);
break;
}
}
for (const auto& [re, pr] : lora_param.scale_pattern) {
if (std::regex_search(full_name, pr.first)) {
scale = pr.second;
TM_LOG_DEBUG("find scale pattern=%s, name=%s, value=%f", re.c_str(), full_name.c_str(), scale);
break;
}
}
if (rank) {
weight.lora.r = rank;
weight.lora.scale = scale;
weight.lora.policy = lora_param.policy;
}
}
}

fused_up_and_gate_ = ffn_weights.gating.lora.policy != LoraPolicy::kPlora;
}

void LlamaDecoderLayerWeight::malloc()
{
self_attn_norm = core::Buffer{hidden_units_, data_type_, MEMORY_GPU};
ffn_norm = core::Buffer{hidden_units_, data_type_, MEMORY_GPU};

self_attn_weights.malloc();
self_attn_weights.reset(new LlamaAttentionWeight{hidden_units_,
size_per_head_,
head_num_,
kv_head_num_,
model.mla,
attn_bias_,
model.qk_norm,
attn_tp_size_,
attn_tp_rank_,
data_type_,
weight_type_,
model.group_size});
register_module("attention", *self_attn_weights);

if (inter_size_) {
ffn_weights.malloc();
ffn_weights.reset(new LlamaFfnWeight{
hidden_units_,
inter_size_,
mlp_tp_size_,
mlp_tp_rank_,
data_type_,
weight_type_,
model.group_size,
weight_type_ == TYPE_UINT4 && is_fuse_silu_act(),
});
register_module("feed_forward", *ffn_weights);
}

if (!moe_weights.experts.empty()) {
moe_weights.malloc();
if (layer_id < moe_param.expert_num.size() && moe_param.expert_num[layer_id]) {
moe_weights.reset(new MoeFfnWeight{layer_id,
moe_param,
hidden_units_,
data_type_,
weight_type_,
model.group_size,
mlp_tp_size_,
mlp_tp_rank_,
is_fuse_silu_act()});
register_module("moe_ffn", *moe_weights);
}

fused_up_and_gate_ = ffn_weights->gating.lora.policy != LoraPolicy::kPlora;

self_attn_norm = core::Tensor{{hidden_units_}, data_type_, MEMORY_GPU};
ffn_norm = core::Tensor{{hidden_units_}, data_type_, MEMORY_GPU};
register_parameter("attention_norm.weight", self_attn_norm);
register_parameter("ffn_norm.weight", ffn_norm);
}

size_t LlamaDecoderLayerWeight::workspace_size() const noexcept
@@ -169,128 +131,23 @@ size_t LlamaDecoderLayerWeight::workspace_size() const noexcept

size_t size = 0;

size = std::max(size, get_size(self_attn_weights.qkv));
size = std::max(size, get_size(self_attn_weights.output));
size = std::max(size, get_size(ffn_weights.gating));
size = std::max(size, get_size(ffn_weights.fused_gating_intermediate));
size = std::max(size, get_size(self_attn_weights->qkv));
size = std::max(size, get_size(self_attn_weights->output));
size = std::max(size, get_size(ffn_weights->gating));
size = std::max(size, get_size(ffn_weights->fused_gating_intermediate));

for (const auto& e : moe_weights.experts) {
size = std::max(size, get_size(e.gating));
size = std::max(size, get_size(e.fused_gating_intermediate));
if (moe_weights) {
for (const auto& e : moe_weights->experts) {
size = std::max(size, get_size(e->gating));
size = std::max(size, get_size(e->fused_gating_intermediate));
}
}

return size * sizeof(uint16_t);
}

template<typename FirstArg, typename... Args>
std::string concat(FirstArg&& first, Args&&... args)
{
std::stringstream stream;
stream << first;
((stream << "." << args), ...);
return stream.str();
}

void getWeightTensor(LlamaDenseWeight& dense, bool bias, const std::string& prefix, core::TensorMap& output)
{
auto get_name = [=](const std::string& name) { return concat(prefix, name); };

TM_CHECK_EQ(bias, bool(dense.bias));
if (bias) {
output.emplace(get_name("bias"), dense.bias);
}

const size_t bit_size = core::get_byte_size(dense.weight_type, 8);
if (bit_size >= 16) {
output.emplace(get_name("weight"), dense.weight);
}
else {
output.emplace(get_name("qweight"), dense.weight);
output.emplace(get_name("scales"), dense.scales);
output.emplace(get_name("zeros"), dense.zeros);
}
}

void LlamaDecoderLayerWeight::free()
{
self_attn_norm = {};
ffn_norm = {};

self_attn_weights.free();

if (inter_size_) {
ffn_weights.free();
}

if (!moe_weights.experts.empty()) {
moe_weights.free();
}
}

LlamaDecoderLayerWeight::~LlamaDecoderLayerWeight() = default;

void getMLATensor(LlamaAttentionWeight& w, const std::string& p, core::TensorMap& m, int tp_rank)
{
if (w.q_proj.output_dim) {
getWeightTensor(w.q_proj, false, concat(p, "attention.q_proj", tp_rank), m);
}
else {
getWeightTensor(w.q_a_proj, false, concat(p, "attention.q_a_proj"), m);
getWeightTensor(w.q_b_proj, false, concat(p, "attention.q_b_proj", tp_rank), m);
m.emplace(concat(p, "attention.q_a_layernorm"), w.q_a_layernorm);
}
getWeightTensor(w.kv_a_proj, false, concat(p, "attention.kv_a_proj"), m);
getWeightTensor(w.kv_b_proj, false, concat(p, "attention.kv_b_proj", tp_rank), m);
m.emplace(concat(p, "attention.kv_a_layernorm"), w.kv_a_layernorm);
}

core::TensorMap LlamaDecoderLayerWeight::getParams(std::string prefix)
{
core::TensorMap output;

output.emplace(concat(prefix, "attention_norm.weight"), self_attn_norm);
output.emplace(concat(prefix, "ffn_norm.weight"), ffn_norm);

auto get_attn = [=](std::string_view name) { return concat(prefix, name, attn_tp_rank_); };

if (self_attn_weights.qkv.output_dim) {
getWeightTensor(self_attn_weights.qkv, attn_bias_, get_attn("attention.w_qkv"), output);

if (self_attn_weights.qk_norm) {
output.emplace(concat(prefix, "attention.q_norm"), self_attn_weights.q_a_layernorm);
output.emplace(concat(prefix, "attention.k_norm"), self_attn_weights.kv_a_layernorm);
}
}
else {
getMLATensor(self_attn_weights, prefix, output, attn_tp_rank_);
}
getWeightTensor(self_attn_weights.output, attn_bias_, get_attn("attention.wo"), output);

auto get_mlp = [=](std::string_view name) { return concat(prefix, name, mlp_tp_rank_); };

if (inter_size_) {
getWeightTensor(ffn_weights.gating, false, get_mlp("feed_forward.w1"), output);
getWeightTensor(ffn_weights.intermediate, false, get_mlp("feed_forward.w3"), output);
getWeightTensor(ffn_weights.output, false, get_mlp("feed_forward.w2"), output);
}

if (!moe_weights.experts.empty()) {
output.emplace(concat(prefix, "moe_ffn.gate.weight"), moe_weights.gate.weight);
auto& experts = moe_weights.experts;
for (size_t i = 0; i < experts.size(); ++i) {
const std::string name = "moe_ffn.experts." + std::to_string(i);
getWeightTensor(experts[i].gating, false, get_mlp(concat(name, "w1")), output);
getWeightTensor(experts[i].intermediate, false, get_mlp(concat(name, "w3")), output);
getWeightTensor(experts[i].output, false, get_mlp(concat(name, "w2")), output);
}
if (moe_weights.shared_gate.weight) {
output.emplace(concat(prefix, "moe_ffn.shared_gate.weight"), moe_weights.shared_gate.weight);
}
}

return output;
}

static void
convert_u4(LlamaDenseWeight& dense, bool is_fused_moe, void* workspace, size_t size, bool use_simt, cudaStream_t st)
{
@@ -543,14 +400,19 @@ void LlamaDecoderLayerWeight::prepare(void* workspace, size_t size, const cudaDe
{
const bool is_16xx = is_16xx_series(prop.name);

convert(self_attn_weights.qkv, false, data_type_, workspace, size, is_16xx, st);
convert(self_attn_weights.output, false, data_type_, workspace, size, is_16xx, st);
convert(self_attn_weights->qkv, false, data_type_, workspace, size, is_16xx, st);
convert(self_attn_weights->output, false, data_type_, workspace, size, is_16xx, st);

auto process_ffn = [&](LlamaFfnWeight& ffn, bool is_fused_moe) {
if (fused_up_and_gate_) {
auto& fused_up_and_gate = ffn.fused_gating_intermediate;

fused_up_and_gate.malloc(st);
fused_up_and_gate.emplace(ffn.gating.input_dim,
ffn.gating.output_dim * 2,
data_type_,
false,
weight_type_,
ffn.gating.group_size);

if (ffn.is_fused_silu) {
interleave(fused_up_and_gate, ffn.gating, ffn.intermediate, data_type_, workspace, size, st);
@@ -561,8 +423,8 @@ void LlamaDecoderLayerWeight::prepare(void* workspace, size_t size, const cudaDe

convert(ffn.fused_gating_intermediate, is_fused_moe, data_type_, workspace, size, is_16xx, st);

ffn.gating.free();
ffn.intermediate.free();
ffn.gating = {};
ffn.intermediate = {};
}
else {
convert(ffn.gating, is_fused_moe, data_type_, workspace, size, is_16xx, st);
@@ -574,37 +436,39 @@ void LlamaDecoderLayerWeight::prepare(void* workspace, size_t size, const cudaDe

if (inter_size_) {
// std::cerr << "process FFN\n";
process_ffn(ffn_weights, false);
process_ffn(*ffn_weights, false);
}

if (!moe_weights.experts.empty()) {
if (moe_weights) {
// std::cerr << "process MoE\n";
std::vector<std::pair<void*, int>> fused_ptrs;
std::vector<std::pair<void*, int>> output_ptrs;
std::vector<std::pair<void*, int>> fused_param_ptrs;
std::vector<std::pair<void*, int>> output_param_ptrs;

for (auto& e : moe_weights.experts) {
for (auto& e : moe_weights->experts) {

process_ffn(e, moe_weights.method == MoeParam::kFused);
process_ffn(*e, moe_weights->method == MoeParam::kFused);

auto& fused = e.fused_gating_intermediate;
auto& output = e.output;
auto& fused = e->fused_gating_intermediate;
auto& output = e->output;

fused_ptrs.push_back({fused.weight.raw_data(), fused.k_desc.ld});
output_ptrs.push_back({output.weight.raw_data(), output.k_desc.ld});

if (e.fused_gating_intermediate.scales_zeros) {
if (e->fused_gating_intermediate.scales_zeros) {
fused_param_ptrs.emplace_back(fused.scales_zeros.raw_data(), fused.q_desc.ld);
output_param_ptrs.emplace_back(output.scales_zeros.raw_data(), output.q_desc.ld);
}
}

#if 0
// Note: This assumes all experts has the same shape
moe_weights.block = moe_weights.experts.at(0);
auto& b_ = moe_weights->block;
auto& e_ = *moe_weights->experts.at(0);


auto& fused = moe_weights.block.fused_gating_intermediate;
auto& output = moe_weights.block.output;
auto& fused = moe_weights->block.fused_gating_intermediate;
auto& output = moe_weights->block.output;

const auto weight_type = fused.weight_type;

@@ -625,6 +489,7 @@ void LlamaDecoderLayerWeight::prepare(void* workspace, size_t size, const cudaDe

fused.q_desc.ld = output.q_desc.ld = 0;
fused.q_desc.num = output.q_desc.num = moe_weights.experts.size();
#endif
}
}

18 changes: 6 additions & 12 deletions src/turbomind/models/llama/LlamaDecoderLayerWeight.h
@@ -26,7 +26,7 @@

namespace turbomind {

struct LlamaDecoderLayerWeight {
struct LlamaDecoderLayerWeight: core::Module {
public:
LlamaDecoderLayerWeight() = delete;

@@ -41,23 +41,17 @@ struct LlamaDecoderLayerWeight {
LlamaDecoderLayerWeight(const LlamaDecoderLayerWeight&) = delete;
LlamaDecoderLayerWeight& operator=(const LlamaDecoderLayerWeight&) = delete;

core::TensorMap getParams(std::string prefix);

void prepare(void* workspace, size_t size, const cudaDeviceProp& prop, cudaStream_t st);

size_t workspace_size() const noexcept;

void malloc();

void free();

core::Buffer self_attn_norm;
core::Buffer ffn_norm;
core::Tensor self_attn_norm;
core::Tensor ffn_norm;

LlamaAttentionWeight self_attn_weights{};
std::unique_ptr<LlamaAttentionWeight> self_attn_weights;

LlamaFfnWeight ffn_weights{};
MoeFfnWeight moe_weights{};
std::unique_ptr<LlamaFfnWeight> ffn_weights;
std::unique_ptr<MoeFfnWeight> moe_weights;

private:
int head_num_;
158 changes: 158 additions & 0 deletions src/turbomind/models/llama/LlamaDenseWeight.cc
@@ -0,0 +1,158 @@
#include "src/turbomind/models/llama/LlamaDenseWeight.h"

namespace turbomind {

void LlamaDenseWeight::emplace(
int input_dim, int output_dim, DataType data_type, bool bias, DataType weight_type, int group_size)
{
this->data_type = data_type;
this->weight_type = weight_type;
this->input_dim = input_dim;
this->output_dim = output_dim;
this->group_size = group_size;

const auto wbits = core::get_byte_size(weight_type, 8);

weight = core::Tensor({input_dim, output_dim}, weight_type, MEMORY_GPU);
register_parameter(wbits < 16 ? "qweight" : "weight", weight);

if (bias) {
this->bias = core::Tensor{{output_dim}, data_type, MEMORY_GPU};
register_parameter("bias", this->bias);
}

if (wbits < 16) {
TM_CHECK(input_dim % group_size == 0) << input_dim << " " << group_size;
scales = core::Tensor{{input_dim / group_size, output_dim}, data_type, MEMORY_GPU};
zeros = core::Tensor{{input_dim / group_size, output_dim}, data_type, MEMORY_GPU};
register_parameter("scales", scales);
register_parameter("zeros", zeros);
}
}

LlamaAttentionWeight::LlamaAttentionWeight(int hidden_dim,
int head_dim,
int head_num,
int kv_head_num,
MLAParam mla,
bool bias,
bool qk_norm,
int tp_size,
int tp_rank,
DataType data_type,
DataType weight_type,
int group_size)
{
if (mla.kv_lora_rank == 0) {
qkv.emplace(
hidden_dim, (head_num + 2 * kv_head_num) * head_dim / tp_size, data_type, bias, weight_type, group_size);
register_module("w_qkv", qkv, tp_rank);
if (qk_norm) {
q_a_layernorm = core::Tensor{{head_dim}, data_type, MEMORY_GPU};
kv_a_layernorm = core::Tensor{{head_dim}, data_type, MEMORY_GPU};
register_parameter("q_norm", q_a_layernorm);
register_parameter("k_norm", kv_a_layernorm);
}
}
else {
const int qk_nope_dim = head_dim - mla.qk_rope_dim;
if (mla.q_lora_rank) {
q_a_proj.emplace(hidden_dim, mla.q_lora_rank, data_type, false, weight_type, group_size);
q_b_proj.emplace(mla.q_lora_rank, head_num * head_dim / tp_size, data_type, false, weight_type, group_size);
q_a_layernorm = core::Tensor{{q_b_proj.input_dim}, data_type, MEMORY_GPU};
register_module("q_a_proj", q_a_proj);
register_module("q_b_proj", q_b_proj, tp_rank);
register_parameter("q_a_layernorm", q_a_layernorm);
}
else {
q_proj.emplace(hidden_dim, head_num * head_dim / tp_size, data_type, false, weight_type, group_size);
register_module("q_proj", q_proj, tp_rank);
}
kv_a_proj.emplace(hidden_dim, mla.kv_lora_rank + mla.qk_rope_dim, data_type, false, weight_type, group_size);
kv_b_proj.emplace(mla.kv_lora_rank,
head_num * (qk_nope_dim + mla.v_head_dim) / tp_size,
data_type,
false,
weight_type,
group_size);

kv_a_layernorm = core::Tensor{{kv_b_proj.input_dim}, data_type, MEMORY_GPU};
register_module("kv_a_proj", kv_a_proj);
register_module("kv_b_proj", kv_b_proj, tp_rank);
register_parameter("kv_a_layernorm", kv_a_layernorm);
}
output.emplace((head_num * head_dim) / tp_size, hidden_dim, data_type, bias, weight_type, group_size);
register_module("wo", output, tp_rank);
}

LlamaFfnWeight::LlamaFfnWeight(int hidden_dim,
int inter_size,
int tp_size,
int tp_rank,
DataType data_type,
DataType weight_type,
int group_size,
bool fuse_silu_act)
{
TM_CHECK(inter_size % tp_size == 0) << inter_size << " " << tp_size;

inter_size /= tp_size;

this->inter_size = inter_size;

gating.emplace(hidden_dim, inter_size, data_type, false, weight_type, group_size);

intermediate.emplace(hidden_dim, inter_size, data_type, false, weight_type, group_size);

// fused_gating_intermediate = {hidden_dim, inter_size * 2, data_type, weight_type, group_size};
is_fused_silu = fuse_silu_act;

output.emplace(inter_size, hidden_dim, data_type, false, weight_type, group_size);

register_module("w1", gating, tp_rank);
register_module("w3", intermediate, tp_rank);
register_module("w2", output, tp_rank);
}

MoeFfnWeight::MoeFfnWeight(int layer_id,
const MoeParam& param,
int hidden_dim,
DataType data_type,
DataType weight_type,
int group_size,
int tp_size,
int tp_rank,
bool fuse_silu_act)
{
if ((int)param.expert_num.size() <= layer_id) {
return;
}

const int expert_num = param.expert_num[layer_id];

if (expert_num == 0) {
return;
}

// printf("%d %d %d\n", (int)hidden_dim, (int)param.inter_size, (int)expert_num);

gate.emplace(hidden_dim, expert_num, data_type, false, data_type, 1);
register_module("gate", gate);

method = param.method;
fuse_silu_act = fuse_silu_act && method == MoeParam::kFused;

experts.reserve(expert_num);
for (int i = 0; i < expert_num; ++i) {
experts.emplace_back(new LlamaFfnWeight{
hidden_dim, param.inter_size, tp_size, tp_rank, data_type, weight_type, group_size, fuse_silu_act});
register_module("experts", *experts.back(), i);
}

if (param.shared_gate) {
shared_gate.emplace(hidden_dim, 1, data_type, false, data_type, 1);
register_module("shared_gate", shared_gate);
}
}

} // namespace turbomind
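
To make the effect of these registrations concrete, here is a hedged sketch of enumerating the flattened map on a decoder layer. It assumes core::TensorMap iterates as (name, tensor) pairs, consistent with the loop in the removed LlamaWeight::getParams; the key patterns in the comments follow the register_module/register_parameter calls above and match the scheme the removed getWeightTensor/getParams code built by hand.

#include <iostream>

#include "src/turbomind/models/llama/LlamaDecoderLayerWeight.h"

namespace turbomind {

// Sketch only: prints the dotted parameter names produced by the Module tree.
// For a dense 16-bit layer with tp_rank 0, keys are expected to look like
//   attention.w_qkv.0.weight, attention.wo.0.weight, feed_forward.w1.0.weight,
//   attention_norm.weight, ffn_norm.weight
// while quantized weights expose "qweight"/"scales"/"zeros" instead of "weight",
// and MoE layers add moe_ffn.gate.weight, moe_ffn.experts.<i>.w1.0.weight, ...
void dump_layer_parameter_names(const LlamaDecoderLayerWeight& layer)
{
    for (const auto& [name, tensor] : layer.get_parameters()) {
        (void)tensor;
        std::cout << name << "\n";
    }
}

}  // namespace turbomind
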
241 changes: 36 additions & 205 deletions src/turbomind/models/llama/LlamaDenseWeight.h
@@ -20,6 +20,7 @@
#pragma once

#include "src/turbomind/core/buffer.h"
#include "src/turbomind/core/module.h"
#include "src/turbomind/core/tensor.h"

#include "src/turbomind/kernels/gemm/types.h"
@@ -47,7 +48,23 @@ struct LoraWeight {
void* b;
};

struct LlamaDenseWeight {
struct LlamaDenseWeight: public core::Module {

LlamaDenseWeight(): data_type{}, weight_type{}, lora{}, k_desc{}, q_desc{} {}

void emplace(int input_dim, int output_dim, DataType data_type, bool bias, DataType weight_type, int group_size);

LlamaDenseWeight& operator=(std::nullptr_t)
{
this->~LlamaDenseWeight();
new (this) LlamaDenseWeight{};
return *this;
}

operator bool() const noexcept
{
return static_cast<bool>(weight);
}

int input_dim = 0;
int output_dim = 0;
@@ -57,7 +74,7 @@ struct LlamaDenseWeight {
DataType weight_type;

core::Tensor weight;
core::Buffer bias;
core::Tensor bias;

core::Tensor scales;
core::Tensor zeros;
@@ -68,49 +85,9 @@ struct LlamaDenseWeight {

gemm::MatrixLayout k_desc;
gemm::MatrixLayout q_desc;

LlamaDenseWeight(): data_type{}, weight_type{}, lora{}, k_desc{}, q_desc{} {}

LlamaDenseWeight(int input_dim, int output_dim, DataType data_type, DataType weight_type, int group_size):
LlamaDenseWeight{}
{
this->data_type = data_type;
this->weight_type = weight_type;
this->input_dim = input_dim;
this->output_dim = output_dim;
this->group_size = group_size;
}

explicit operator bool() const noexcept
{
return static_cast<bool>(weight);
}

void malloc(bool with_bias = false)
{
if (with_bias) {
bias = core::Buffer{output_dim, data_type, MEMORY_GPU};
}

weight = core::Tensor({input_dim, output_dim}, weight_type, MEMORY_GPU);

if (auto wbits = core::get_byte_size(weight_type, 8); wbits <= 8) {
TM_CHECK_EQ(input_dim % group_size, 0);
scales = core::Tensor{{input_dim / group_size, output_dim}, data_type, MEMORY_GPU};
zeros = core::Tensor{{input_dim / group_size, output_dim}, data_type, MEMORY_GPU};
}
}

void free()
{
bias = {};
weight = {};
scales = {};
zeros = {};
}
};

struct LlamaAttentionWeight {
struct LlamaAttentionWeight: public core::Module {

LlamaAttentionWeight() = default;

@@ -121,80 +98,11 @@ struct LlamaAttentionWeight {
MLAParam mla,
bool bias,
bool qk_norm,
int tp,
int tp_size,
int tp_rank,
DataType data_type,
DataType weight_type,
int group_size)
{
this->bias = bias;
this->head_dim = head_dim;
this->qk_norm = qk_norm;
this->data_type = data_type;
this->weight_type = weight_type;

if (mla.kv_lora_rank == 0) {
qkv = {hidden_dim, (head_num + 2 * kv_head_num) * head_dim / tp, data_type, weight_type, group_size};
}
else {
const int qk_nope_dim = head_dim - mla.qk_rope_dim;
if (mla.q_lora_rank) {
q_a_proj = {hidden_dim, mla.q_lora_rank, data_type, weight_type, group_size};
q_b_proj = {mla.q_lora_rank, head_num * head_dim / tp, data_type, weight_type, group_size};
}
else {
q_proj = {hidden_dim, head_num * head_dim / tp, data_type, weight_type, group_size};
}
kv_a_proj = {hidden_dim, mla.kv_lora_rank + mla.qk_rope_dim, data_type, weight_type, group_size};
kv_b_proj = {
mla.kv_lora_rank, head_num * (qk_nope_dim + mla.v_head_dim) / tp, data_type, weight_type, group_size};
}
output = {(head_num * head_dim) / tp, hidden_dim, data_type, weight_type, group_size};
}

void malloc()
{
if (qkv.output_dim) {
qkv.malloc(bias);
if (qk_norm) {
q_a_layernorm = core::Buffer{head_dim, data_type, MEMORY_GPU};
kv_a_layernorm = core::Buffer{head_dim, data_type, MEMORY_GPU};
}
}
else { // MLA
if (q_proj.output_dim) {
q_proj.malloc();
}
else {
q_a_proj.malloc();
q_b_proj.malloc();
q_a_layernorm = core::Buffer{q_b_proj.input_dim, data_type, MEMORY_GPU};
}
kv_a_proj.malloc();
kv_b_proj.malloc();
kv_a_layernorm = core::Buffer{kv_b_proj.input_dim, data_type, MEMORY_GPU};
}
output.malloc(bias);
}

void free()
{
qkv.free();
q_proj.free();
q_a_proj.free();
q_b_proj.free();
kv_a_proj.free();
kv_b_proj.free();
output.free();
q_a_layernorm = {};
kv_a_layernorm = {};
}

int head_dim{};
bool bias{};
bool qk_norm{};

DataType data_type{};
DataType weight_type{};
int group_size);

LlamaDenseWeight qkv;
LlamaDenseWeight output;
@@ -205,49 +113,22 @@ struct LlamaAttentionWeight {
LlamaDenseWeight kv_a_proj;
LlamaDenseWeight kv_b_proj;

core::Buffer q_a_layernorm;
core::Buffer kv_a_layernorm;
core::Tensor q_a_layernorm;
core::Tensor kv_a_layernorm;
};

struct LlamaFfnWeight {
struct LlamaFfnWeight: core::Module {

LlamaFfnWeight() = default;

LlamaFfnWeight(int hidden_dim,
int inter_size,
int tp,
int tp_size,
int tp_rank,
DataType data_type,
DataType weight_type,
int group_size,
bool fuse_silu_act)
{
TM_CHECK_EQ(inter_size % tp, 0);

this->inter_size = inter_size;

gating = {hidden_dim, inter_size, data_type, weight_type, group_size};
intermediate = {hidden_dim, inter_size, data_type, weight_type, group_size};

fused_gating_intermediate = {hidden_dim, inter_size * 2, data_type, weight_type, group_size};
is_fused_silu = fuse_silu_act;

output = {inter_size, hidden_dim, data_type, weight_type, group_size};
}

void malloc()
{
gating.malloc();
intermediate.malloc();
output.malloc();
}

void free()
{
gating.free();
intermediate.free();
output.free();
fused_gating_intermediate.free();
}
bool fuse_silu_act);

LlamaDenseWeight gating;
LlamaDenseWeight intermediate;
@@ -258,7 +139,7 @@ struct LlamaFfnWeight {
bool is_fused_silu{};
};

struct MoeFfnWeight {
struct MoeFfnWeight: core::Module {

MoeFfnWeight() = default;

@@ -268,65 +149,15 @@ struct MoeFfnWeight {
DataType data_type,
DataType weight_type,
int group_size,
int tp,
bool fuse_silu_act)
{

if ((int)param.expert_num.size() <= layer_id) {
return;
}

const int expert_num = param.expert_num[layer_id];

if (expert_num == 0) {
return;
}

// printf("%d %d %d\n", (int)hidden_dim, (int)param.inter_size, (int)expert_num);

gate = {hidden_dim, expert_num, data_type, data_type, 1};

experts.resize(expert_num);

method = param.method;
fuse_silu_act = fuse_silu_act && method == MoeParam::kFused;

for (auto& e : experts) {
// inter size is divided by tp in `FfnWeight`
e = LlamaFfnWeight{hidden_dim, param.inter_size, tp, data_type, weight_type, group_size, fuse_silu_act};
}

if (param.shared_gate) {
shared_gate = {hidden_dim, 1, data_type, data_type, 1};
}
}

void malloc()
{
gate.malloc();
if (shared_gate.output_dim) {
shared_gate.malloc();
}
for (auto& e : experts) {
e.malloc();
}
}

void free()
{
gate.free();
shared_gate.free();
for (auto& e : experts) {
e.free();
}
block.free();
}

LlamaDenseWeight gate;
std::vector<LlamaFfnWeight> experts;
int tp_size,
int tp_rank,
bool fuse_silu_act);

LlamaDenseWeight gate;
LlamaDenseWeight shared_gate;

std::vector<std::unique_ptr<LlamaFfnWeight>> experts;

// reference into `experts`
LlamaFfnWeight block;

46 changes: 12 additions & 34 deletions src/turbomind/models/llama/LlamaWeight.cc
@@ -62,23 +62,23 @@ LlamaWeight::LlamaWeight(DataType data_type,

core::ContextGuard guard = context();

decoder_layer_weights.reserve(num_layer_);
for (unsigned l = 0; l < num_layer_; ++l) {
decoder_layer_weights.emplace_back(
new LlamaDecoderLayerWeight(data_type, l, model, engine_param, lora_param, moe_param));
decoder_layer_weights.back()->malloc();
}

TM_CHECK_EQ(vocab_size_padded_ % tp_size_, 0);
TM_CHECK_EQ(hidden_units_ % tp_size_, 0);

pre_decoder_embedding = LlamaDenseWeight{embedding_size_, hidden_units_ / tp_size_, data_type, data_type, 1};
pre_decoder_embedding.malloc();
pre_decoder_embedding.emplace(embedding_size_, hidden_units_ / tp_size_, data_type, false, data_type, 1);
post_decoder_embedding.emplace(hidden_units_, vocab_size_padded_ / tp_size_, data_type, false, data_type, 1);
register_module("tok_embeddings", pre_decoder_embedding, tp_rank_);
register_module("output", post_decoder_embedding, tp_rank_);

post_decoder_embedding = LlamaDenseWeight{hidden_units_, vocab_size_padded_ / tp_size_, data_type, data_type, 1};
post_decoder_embedding.malloc();
decoder_layer_weights.reserve(num_layer_);
for (int i = 0; i < num_layer_; ++i) {
decoder_layer_weights.emplace_back(
new LlamaDecoderLayerWeight(data_type, i, model, engine_param, lora_param, moe_param));
register_module("layers", *decoder_layer_weights.back(), i);
}

output_norm_weight = core::Buffer{hidden_units_, data_type_, MEMORY_GPU};
output_norm_weight = core::Tensor{{hidden_units_}, data_type_, MEMORY_GPU};
register_parameter("norm.weight", output_norm_weight);
}

LlamaWeight::~LlamaWeight()
@@ -90,7 +90,6 @@ LlamaWeight::~LlamaWeight()
output_norm_weight = {};

for (auto& p : decoder_layer_weights) {
p->free();
delete p;
}

@@ -105,27 +104,6 @@ core::ContextGuard LlamaWeight::context() const
return core::ContextGuard{stream_, alloca_};
}

core::TensorMap LlamaWeight::getParams()
{
core::TensorMap output;

output.emplace("tok_embeddings." + std::to_string(tp_rank_) + ".weight", pre_decoder_embedding.weight);
output.emplace("output." + std::to_string(tp_rank_) + ".weight", post_decoder_embedding.weight);

output.emplace("norm.weight", output_norm_weight);

// transformer layers
for (size_t i = 0; i < num_layer_; i++) {
std::string prefix = fmtstr("layers.%d", i);
core::TensorMap layer = decoder_layer_weights[i]->getParams(prefix);
for (auto& kv : layer) {
output.insert(std::move(kv));
}
}

return output;
}

void LlamaWeight::prepare(const cudaDeviceProp& prop)
{
core::ContextGuard guard = context();
6 changes: 2 additions & 4 deletions src/turbomind/models/llama/LlamaWeight.h
@@ -27,7 +27,7 @@

namespace turbomind {

struct LlamaWeight {
struct LlamaWeight: core::Module {
LlamaWeight() = default;

LlamaWeight(DataType data_type,
@@ -41,8 +41,6 @@ struct LlamaWeight {
LlamaWeight(const LlamaWeight&) = delete;
LlamaWeight& operator=(const LlamaWeight&) = delete;

core::TensorMap getParams();

void prepare(const cudaDeviceProp& prop);

core::ContextGuard context() const;
@@ -52,7 +50,7 @@ struct LlamaWeight {
LlamaDenseWeight pre_decoder_embedding;
LlamaDenseWeight post_decoder_embedding;

core::Buffer output_norm_weight;
core::Tensor output_norm_weight;

private:
int hidden_units_;
4 changes: 2 additions & 2 deletions src/turbomind/models/llama/moe_ffn_layer.cc
@@ -170,10 +170,10 @@ void MoeFfnLayer::Forward(ForwardParam& p)
}

for (int i = 0; i < expert_num; ++i) {
FT_CHECK(moe.experts[i].is_fused_silu == false);
FT_CHECK(moe.experts[i]->is_fused_silu == false);
if (int count = h_offsets_[i + 1] - h_offsets_[i]) {
auto io = p.temp.slice({h_offsets_[i], 0}, {count, -1});
expert_ffn_->forward({io, io, &moe.experts.at(i), p.layer_id});
expert_ffn_->forward({io, io, moe.experts.at(i).get(), p.layer_id});
}
}
}
8 changes: 4 additions & 4 deletions src/turbomind/models/llama/unified_attention_layer.cc
@@ -109,7 +109,7 @@ struct ForwardParam {
core::Buffer_<int> cu_block_nums;
core::Buffer_<uintptr_t> kv_block_ptrs;

const void* weights;
const LlamaAttentionWeight* weights;

core::Event event;

@@ -123,7 +123,7 @@ void Initialize(ForwardParam& p, core::TensorMap& args, const core::Tensor& inpu
p.Init(args, input, output);
}

void SetLayer(ForwardParam& p, const void* weights, int layer_id)
void SetLayer(ForwardParam& p, const LlamaAttentionWeight* weights, int layer_id)
{
p.weights = weights;
p.layer_id = layer_id;
@@ -235,7 +235,7 @@ void UnifiedAttentionLayer::forward(ForwardParam& p)

const int layer_id = p.layer_id;

const auto& weights = *(const WeightType*)p.weights;
const auto& weights = *p.weights;

// [L, 2, H, s, D]
const size_t layer_offset = layer_id * 2 * local_kv_head_num_ * param_.cache_block_seq_len * size_per_head_;
@@ -313,7 +313,7 @@ core::Tensor UnifiedAttentionLayer::core_attention(core::Tensor& qkv, const Forw
params.stride = (local_head_num_ + 2 * local_kv_head_num_) * size_per_head_;

if (weights.qkv.bias) {
params.q_bias = weights.qkv.bias.unsafe_data<T>();
params.q_bias = weights.qkv.bias.buffer().unsafe_data<T>();
params.k_bias = params.q_bias + local_head_num_ * size_per_head_;
params.v_bias = params.k_bias + local_kv_head_num_ * size_per_head_;
}
2 changes: 1 addition & 1 deletion src/turbomind/models/llama/unified_attention_layer.h
@@ -39,7 +39,7 @@ struct ForwardParam;

void Initialize(ForwardParam& p, core::TensorMap& args, const core::Tensor& input, core::Tensor& output);

void SetLayer(ForwardParam& p, const void* weights, int layer_id);
void SetLayer(ForwardParam& p, const LlamaAttentionWeight* weights, int layer_id);

void Finalize(ForwardParam& p);

29 changes: 13 additions & 16 deletions src/turbomind/models/llama/unified_decoder.cc
@@ -55,8 +55,8 @@ UnifiedDecoder::~UnifiedDecoder() = default;

void UnifiedDecoder::AllreduceResidualRMSnorm(core::Tensor& hidden_states,
core::Tensor& residual,
const core::Buffer& bias,
const core::Buffer& weight,
const core::Tensor& bias,
const core::Tensor& weight,
int token_num,
int group0,
int group1,
@@ -68,7 +68,7 @@ void UnifiedDecoder::AllreduceResidualRMSnorm(core::Tensor& hidden_states,
else if (group0 || group1) {
d_comm_->AllreduceResidualBiasRMSnormEx(hidden_states.raw_data(),
residual.raw_data(),
bias.unsafe_data(),
bias.buffer().unsafe_data(),
weight.raw_data(),
rmsnorm_eps_,
hidden_units_,
@@ -82,7 +82,7 @@ void UnifiedDecoder::AllreduceResidualRMSnorm(core::Tensor& hidden_states,
else if (d_comm_) {
d_comm_->AllreduceResidualBiasRMSnorm(hidden_states.raw_data(),
residual.raw_data(),
bias.unsafe_data(),
bias.buffer().unsafe_data(),
weight.raw_data(),
rmsnorm_eps_,
hidden_units_,
@@ -96,7 +96,7 @@ void UnifiedDecoder::AllreduceResidualRMSnorm(core::Tensor& hidden_states,
invokeResidualBiasRMSNorm(hidden_states.raw_data(),
residual.raw_data(),
weight.raw_data(),
bias.unsafe_data(),
bias.buffer().unsafe_data(),
dtype,
hidden_units_,
token_num,
@@ -170,14 +170,14 @@ void UnifiedDecoder::Forward(core::TensorMap& args, const std::vector<WeightType

/////////////////////////////////////////////
/// self-attention
SetLayer(*attn_fwd_param_, &weights.at(layer)->self_attn_weights, layer);
SetLayer(*attn_fwd_param_, weights.at(layer)->self_attn_weights.get(), layer);
attn_layer_->forward(*attn_fwd_param_);

TM_DEBUG_TENSOR(local_hidden_states, Concat("attn_block", layer), 1);

AllreduceResidualRMSnorm(global_hidden_states,
local_residual,
weights.at(layer)->self_attn_weights.output.bias,
weights.at(layer)->self_attn_weights->output.bias,
weights.at(layer)->ffn_norm,
local_token_num,
attn_tp_group_,
@@ -192,22 +192,22 @@ void UnifiedDecoder::Forward(core::TensorMap& args, const std::vector<WeightType

std::optional<MoeFfnLayer::ForwardParam> moe_fwd_param;

if (!weights.at(layer)->moe_weights.experts.empty()) {
if (weights.at(layer)->moe_weights) {
moe_fwd_param = MoeFfnLayer::ForwardParam{global_hidden_states,
global_hidden_states,
{},
ffn_layer_ ? 1.f : 0.f,
(int)layer,
&weights.at(layer)->moe_weights};
weights.at(layer)->moe_weights.get()};
}

if (moe_fwd_param) {
moe_ffn_layer_->Forward(*moe_fwd_param);
}

if (weights.at(layer)->ffn_weights.output.weight) {
if (weights.at(layer)->ffn_weights) {
ffn_layer_->forward(
{global_hidden_states, global_hidden_states, &weights.at(layer)->ffn_weights, (int)layer});
{global_hidden_states, global_hidden_states, weights.at(layer)->ffn_weights.get(), (int)layer});
}

if (moe_fwd_param) {
@@ -218,11 +218,11 @@ void UnifiedDecoder::Forward(core::TensorMap& args, const std::vector<WeightType

const bool last = layer == layer_num_ - 1;

auto& scale_weight = !last ? weights.at(layer + 1)->self_attn_norm : args.at("output_norm_weight").buffer();
auto& scale_weight = !last ? weights.at(layer + 1)->self_attn_norm : args.at("output_norm_weight");

AllreduceResidualRMSnorm(global_hidden_states,
local_residual,
weights.at(layer)->ffn_weights.output.bias,
weights.at(layer)->ffn_weights->output.bias,
scale_weight,
local_token_num,
0,
@@ -260,9 +260,6 @@ void UnifiedDecoder::Forward(core::TensorMap& args, const std::vector<WeightType
}

Finalize(*attn_fwd_param_);

// core::Context::stream().Sync();
// TM_CHECK(0);
}

} // namespace turbomind
4 changes: 2 additions & 2 deletions src/turbomind/models/llama/unified_decoder.h
@@ -43,8 +43,8 @@ class UnifiedDecoder {

void AllreduceResidualRMSnorm(core::Tensor& hidden_states,
core::Tensor& residual,
const core::Buffer& bias,
const core::Buffer& weight,
const core::Tensor& bias,
const core::Tensor& weight,
int token_num,
int t0,
int t1,
4 changes: 1 addition & 3 deletions src/turbomind/triton_backend/llama/LlamaTritonModel.cc
@@ -435,9 +435,7 @@ void LlamaTritonModel::createSharedWeights(int device_id, int rank)

core::TensorMap LlamaTritonModel::getParams(int device_id, int rank)
{
check_cuda_error(cudaSetDevice(device_id));

return TM_CHECK_NOTNULL(weights_[rank])->getParams();
return TM_CHECK_NOTNULL(weights_[rank])->get_parameters();
}

void LlamaTritonModel::processWeights(int device_id, int rank)