Commit 5f2ce96

[bugfix]: deepgemm online quant (#1130)
1 parent f756420 commit 5f2ce96

File tree

5 files changed: +21, -9 lines changed


lightllm/common/basemodel/layer_weights/meta_weights/fused_moe_weight_ep.py

Lines changed: 6 additions & 2 deletions
@@ -422,8 +422,12 @@ def _fuse(self):
         inter_shape, hidden_size = self.w2_list[0].shape[0], self.w2_list[0].shape[1]
         w2 = torch._utils._flatten_dense_tensors(self.w2_list).view(len(self.w2_list), inter_shape, hidden_size)
         if not self.quantized_weight and self.quant_method is not None:
-            self.w1 = self.quant_method.quantize(w1)
-            self.w2 = self.quant_method.quantize(w2)
+            qw1, qw1_scale, qw1_zero_point = self.quant_method.quantize(w1)
+            qw2, qw2_scale, qw2_zero_point = self.quant_method.quantize(w2)
+            self.w1[0] = qw1
+            self.w1[1] = qw1_scale
+            self.w2[0] = qw2
+            self.w2[1] = qw2_scale
         else:
             self.w1[0] = self._cuda(w1)
             self.w2[0] = self._cuda(w2)
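Why this pattern: quantize() now returns a (qweight, scale, zero_point) triple (see the deepgemm_quant.py hunk below), and the fused MoE code keeps weight and scale in a two-slot container that the kernels index. Rebinding self.w1 to the raw return value, as the old code did, replaced that container. The same fix repeats in the next two files. A minimal sketch of the slot convention, with illustrative names and a stand-in quantizer (torch.float8_e4m3fn assumes PyTorch 2.1+):

import torch

def fake_quantize(w: torch.Tensor):
    # Stand-in for quant_method.quantize: returns (qweight, scale, zero_point).
    scale = w.abs().amax().clamp(min=1e-12)
    return (w / scale).to(torch.float8_e4m3fn), scale, None

w1 = [None, None]  # two-slot container the MoE kernels read: [qweight, scales]

# Old, buggy pattern: rebinding w1 replaces the container with a raw tuple.
# w1 = fake_quantize(torch.randn(4, 4))

# Fixed pattern: unpack the triple and fill the existing slots.
qw1, qw1_scale, _zero_point = fake_quantize(torch.randn(4, 4))
w1[0], w1[1] = qw1, qw1_scale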

lightllm/common/basemodel/layer_weights/meta_weights/fused_moe_weight_ep_redundancy.py

Lines changed: 6 additions & 3 deletions
@@ -102,12 +102,15 @@ def _fuse(self):
         inter_shape, hidden_size = self.w2_list[0].shape[0], self.w2_list[0].shape[1]
         w2 = torch._utils._flatten_dense_tensors(self.w2_list).view(len(self.w2_list), inter_shape, hidden_size)
         if not self._ep_w.quantized_weight and self._ep_w.quant_method is not None:
-            self.w1 = self._ep_w.quant_method.quantize(w1)
-            self.w2 = self._ep_w.quant_method.quantize(w2)
+            qw1, qw1_scale, qw1_zero_point = self._ep_w.quant_method.quantize(w1)
+            qw2, qw2_scale, qw2_zero_point = self._ep_w.quant_method.quantize(w2)
+            self.w1[0] = qw1
+            self.w1[1] = qw1_scale
+            self.w2[0] = qw2
+            self.w2[1] = qw2_scale
         else:
             self.w1[0] = w1
             self.w2[0] = w2
-
         delattr(self, "w2_list")
         delattr(self, "experts_up_projs")
         delattr(self, "experts_gate_projs")

lightllm/common/basemodel/layer_weights/meta_weights/fused_moe_weight_tp.py

Lines changed: 6 additions & 2 deletions
@@ -182,8 +182,12 @@ def _fuse(self):
         inter_shape, hidden_size = self.w2_list[0].shape[0], self.w2_list[0].shape[1]
         w2 = torch._utils._flatten_dense_tensors(self.w2_list).view(len(self.w2_list), inter_shape, hidden_size)
         if not self.quantized_weight and self.quant_method is not None:
-            self.w1 = self.quant_method.quantize(w1)
-            self.w2 = self.quant_method.quantize(w2)
+            qw1, qw1_scale, qw1_zero_point = self.quant_method.quantize(w1)
+            qw2, qw2_scale, qw2_zero_point = self.quant_method.quantize(w2)
+            self.w1[0] = qw1
+            self.w1[1] = qw1_scale
+            self.w2[0] = qw2
+            self.w2[1] = qw2_scale
         else:
             self.w1[0] = self._cuda(w1)
             self.w2[0] = self._cuda(w2)

lightllm/common/quantization/deepgemm_quant.py

Lines changed: 2 additions & 1 deletion
@@ -63,7 +63,8 @@ def method_name(self):
     def quantize(self, weight: torch.Tensor):
         from lightllm.common.quantization.triton_quant.fp8.fp8w8a8_block_quant_kernel import weight_quant

-        return weight_quant(weight, self.block_size)
+        weight, scale = weight_quant(weight, self.block_size)
+        return weight, scale, None

     def apply(
         self,
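Returning a fixed (qweight, scale, zero_point) triple gives every quant method one unpacking path; symmetric schemes such as this FP8 block quant simply return None for the zero point. A hedged sketch of that interface, with illustrative class names rather than lightllm's actual ones:

from typing import Optional, Tuple
import torch

class QuantMethodBase:
    # Illustrative interface: quantize() always yields (qweight, scale, zero_point).
    def quantize(
        self, weight: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
        raise NotImplementedError

class SymmetricFp8Method(QuantMethodBase):
    # Symmetric quantization has no zero point, so the third slot is None.
    def quantize(self, weight: torch.Tensor):
        scale = weight.abs().amax().clamp(min=1e-12)
        qweight = (weight / scale).to(torch.float8_e4m3fn)
        return qweight, scale, None

# Callers unpack all three fields unconditionally, whatever the scheme:
qw, s, zp = SymmetricFp8Method().quantize(torch.randn(8, 8))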

lightllm/common/quantization/triton_quant/fp8/fp8w8a8_block_quant_kernel.py

Lines changed: 1 addition & 1 deletion
@@ -55,4 +55,4 @@ def weight_quant(x: torch.Tensor, block_size: int = 128) -> tuple[torch.Tensor,
         return y_quant, s_scales
     else:
         y_quant, s_scales = mm_weight_quant(x, block_size)
-        return y_quant.t(), s_scales.t()
+        return y_quant, s_scales
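For context, block-wise weight quantization emits one scale per block_size x block_size tile, and this fix stops transposing both outputs before returning them. A minimal plain-torch sketch of the idea (the real weight_quant is a Triton kernel; 448.0 is the float8_e4m3 maximum, and the shapes assume dimensions divisible by block_size):

import torch

def block_weight_quant(x: torch.Tensor, block_size: int = 128):
    # One scale per block_size x block_size tile of the weight matrix.
    n, k = x.shape  # assumed divisible by block_size
    xb = x.view(n // block_size, block_size, k // block_size, block_size)
    # Per-tile max-abs scale, mapping each tile into the FP8 e4m3 range.
    scales = xb.abs().amax(dim=(1, 3), keepdim=True).clamp(min=1e-12) / 448.0
    y_quant = (xb / scales).to(torch.float8_e4m3fn).view(n, k)
    return y_quant, scales.view(n // block_size, k // block_size)

# The removed .t() calls had handed the online-quant DeepGEMM path
# transposed weight and scale tensors, which is what commit #1130 fixes.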
