[Metax] Fix build error in rejection sampling

zhangchenyi_dl · neilzhuu · commit 038c4448c630 · 2025-12-03T20:20:46.000+08:00
diff --git a/custom_ops/gpu_ops/sample_kernels/sampling.cuh b/custom_ops/gpu_ops/sample_kernels/sampling.cuh
@@ -434,14 +434,14 @@ __global__ void TopKTopPSamplingFromProbKernel(DType* probs,
       __syncthreads();
       aggregate_gt_pivot_0 = temp_storage.block_aggregate.pair;
 
-#ifdef PADDLE_WITH_COREX
+#if defined(PADDLE_WITH_COREX) || defined(PADDLE_WITH_CUSTOM_DEVICE_METAX_GPU)
       aggregate_gt_pivot_1 += BlockReduce<ValueCount<float>, BLOCK_THREADS>(
                                   temp_storage.block_prim.reduce_value_count)
                                   .Sum(probs_gt_pivot_1);
 #else
       aggregate_gt_pivot_1 += BlockReduce<ValueCount<float>, BLOCK_THREADS>(
                                   temp_storage.block_prim.reduce_value_count)
-                                  .Sum(probs_gt_pivot_1);
+                                  .Sum<VEC_SIZE>(probs_gt_pivot_1);
 #endif
       if (tx == 0) {
         temp_storage.block_aggregate.pair = aggregate_gt_pivot_1;
@@ -573,14 +573,14 @@ __global__ void TopPSamplingFromProbKernel(DType* probs,
       __syncthreads();
       aggregate_gt_pivot_0 = temp_storage.block_aggregate.value;
 
-#ifdef PADDLE_WITH_COREX
+#if defined(PADDLE_WITH_COREX) || defined(PADDLE_WITH_CUSTOM_DEVICE_METAX_GPU)
       aggregate_gt_pivot_1 +=
           BlockReduce<float, BLOCK_THREADS>(temp_storage.block_prim.reduce)
               .Sum(probs_gt_pivot_1);
 #else
       aggregate_gt_pivot_1 +=
           BlockReduce<float, BLOCK_THREADS>(temp_storage.block_prim.reduce)
-              .Sum(probs_gt_pivot_1);
+              .Sum<VEC_SIZE>(probs_gt_pivot_1);
 #endif
       if (tx == 0) {
         temp_storage.block_aggregate.value = aggregate_gt_pivot_1;
@@ -822,7 +822,7 @@ __global__ void TopKRenormProbKernel(DType* probs,
 #endif
         __syncthreads();
 
-#ifdef PADDLE_WITH_COREX
+#if defined(PADDLE_WITH_COREX) || defined(PADDLE_WITH_CUSTOM_DEVICE_METAX_GPU)
         aggregate_gt_pivot_1 +=
             BlockReduce<ValueCount<float>, BLOCK_THREADS, REDUCE_ALGORITHM>(
                 temp_storage.block_prim.reduce_value_count)
@@ -831,7 +831,7 @@ __global__ void TopKRenormProbKernel(DType* probs,
         aggregate_gt_pivot_1 +=
             BlockReduce<ValueCount<float>, BLOCK_THREADS, REDUCE_ALGORITHM>(
                 temp_storage.block_prim.reduce_value_count)
-                .Sum(probs_gt_pivot_1_pair);
+                .Sum<VEC_SIZE>(probs_gt_pivot_1_pair);
 #endif
         __syncthreads();
       }
diff --git a/fastdeploy/worker/metax_model_runner.py b/fastdeploy/worker/metax_model_runner.py
@@ -1473,7 +1473,7 @@ def _dummy_sampler_run(
         )
 
         post_process(
-            sampler_output=sampler_output,
+            sampler_or_pooler_output=sampler_output,
             model_output=model_output_data,
             share_inputs=self.share_inputs,
             block_size=self.cache_config.block_size,

Original file line number	Diff line number	Diff line change
`@@ -1473,7 +1473,7 @@ def _dummy_sampler_run(`
`1473`	`1473`	`)`
`1474`	`1474`
`1475`	`1475`	`post_process(`
`1476`		`- sampler_output=sampler_output,`
	`1476`	`+ sampler_or_pooler_output=sampler_output,`
`1477`	`1477`	`model_output=model_output_data,`
`1478`	`1478`	`share_inputs=self.share_inputs,`
`1479`	`1479`	`block_size=self.cache_config.block_size,`