@@ -228,8 +228,7 @@ __device__ __forceinline__ void DeviceSamplingFromProb(
228228 prob_greater_than_threshold[j] = pred (prob_vec[j]) ? prob_vec[j] : 0 ;
229229 valid[j] = pred (prob_vec[j]) && (i * BLOCK_THREADS + tx) * VEC_SIZE + j < d;
230230 }
231- #if defined(PADDLE_WITH_COREX) || defined(__MC_PLATFORM_MXCC__) || \
232- defined (__MCC__) || defined (__MXCC__)
231+ #if defined(PADDLE_WITH_COREX) || defined(PADDLE_WITH_CUSTOM_DEVICE_METAX_GPU)
233232 float aggregate_local = BlockReduce<float , BLOCK_THREADS, REDUCE_ALGORITHM>(
234233 temp_storage->block_prim .reduce )
235234 .Sum (prob_greater_than_threshold);
@@ -252,8 +251,7 @@ __device__ __forceinline__ void DeviceSamplingFromProb(
252251 REDUCE_ALGORITHM>(
253252 prob_greater_than_threshold, inclusive_cdf, temp_storage);
254253 } else {
255- #if defined(PADDLE_WITH_COREX) || defined(__MC_PLATFORM_MXCC__) || \
256- defined (__MCC__) || defined (__MXCC__)
254+ #if defined(PADDLE_WITH_COREX) || defined(PADDLE_WITH_CUSTOM_DEVICE_METAX_GPU)
257255 BlockScan<float , BLOCK_THREADS, SCAN_ALGORITHM>(
258256 temp_storage->block_prim .scan )
259257 .InclusiveSum (prob_greater_than_threshold, inclusive_cdf);
@@ -273,8 +271,7 @@ __device__ __forceinline__ void DeviceSamplingFromProb(
273271
274272 bool greater_than_u_diff[VEC_SIZE];
275273#ifdef SAMPLING_CUB_SUBTRACTLEFT_DEFINED
276- #if defined(PADDLE_WITH_COREX) || defined(__MC_PLATFORM_MXCC__) || \
277- defined (__MCC__) || defined (__MXCC__)
274+ #if defined(PADDLE_WITH_COREX) || defined(PADDLE_WITH_CUSTOM_DEVICE_METAX_GPU)
278275 BlockAdjacentDifference<bool , BLOCK_THREADS>(
279276 temp_storage->block_prim .adj_diff )
280277 .SubtractLeft (greater_than_u, greater_than_u_diff, BoolDiffOp ());
@@ -285,8 +282,7 @@ __device__ __forceinline__ void DeviceSamplingFromProb(
285282 greater_than_u, greater_than_u_diff, BoolDiffOp ());
286283#endif
287284#else
288- #if defined(PADDLE_WITH_COREX) || defined(__MC_PLATFORM_MXCC__) || \
289- defined (__MCC__) || defined (__MXCC__)
285+ #if defined(PADDLE_WITH_COREX) || defined(PADDLE_WITH_CUSTOM_DEVICE_METAX_GPU)
290286 BlockAdjacentDifference<bool , BLOCK_THREADS>(
291287 temp_storage->block_prim .adj_diff )
292288 .FlagHeads (greater_than_u_diff, greater_than_u, BoolDiffOp (), 0 );
@@ -423,8 +419,7 @@ __global__ void TopKTopPSamplingFromProbKernel(DType* probs,
423419 (i * BLOCK_THREADS + tx) * VEC_SIZE + j < d)};
424420 }
425421
426- #if defined(PADDLE_WITH_COREX) || defined(__MC_PLATFORM_MXCC__) || \
427- defined (__MCC__) || defined (__MXCC__)
422+ #if defined(PADDLE_WITH_COREX) || defined(PADDLE_WITH_CUSTOM_DEVICE_METAX_GPU)
428423 aggregate_gt_pivot_0 += BlockReduce<ValueCount<float >, BLOCK_THREADS>(
429424 temp_storage.block_prim .reduce_value_count )
430425 .Sum (probs_gt_pivot_0);
@@ -563,8 +558,7 @@ __global__ void TopPSamplingFromProbKernel(DType* probs,
563558 probs_gt_pivot_1[j] = (probs_vec[j] > pivot_1) ? probs_vec[j] : 0 ;
564559 }
565560
566- #if defined(PADDLE_WITH_COREX) || defined(__MC_PLATFORM_MXCC__) || \
567- defined (__MCC__) || defined (__MXCC__)
561+ #if defined(PADDLE_WITH_COREX) || defined(PADDLE_WITH_CUSTOM_DEVICE_METAX_GPU)
568562 aggregate_gt_pivot_0 +=
569563 BlockReduce<float , BLOCK_THREADS>(temp_storage.block_prim .reduce )
570564 .Sum (probs_gt_pivot_0);
@@ -638,8 +632,7 @@ __device__ __forceinline__ float GetMaxValue(float* in_data,
638632 for (uint32_t j = 0 ; j < VEC_SIZE; ++j) {
639633 in_data_[j] = in_data_vec[j];
640634 }
641- #if defined(PADDLE_WITH_COREX) || defined(__MC_PLATFORM_MXCC__) || \
642- defined (__MCC__) || defined (__MXCC__)
635+ #if defined(PADDLE_WITH_COREX) || defined(PADDLE_WITH_CUSTOM_DEVICE_METAX_GPU)
643636 max_val = max (max_val,
644637 BlockReduce<float , BLOCK_THREADS, REDUCE_ALGORITHM>(
645638 temp_storage.block_prim .reduce )
@@ -748,8 +741,7 @@ __global__ void TopKRenormProbKernel(DType* probs,
748741 const uint32_t bx = blockIdx .x , tx = threadIdx .x ;
749742 const uint32_t row_idx = bx;
750743 const uint32_t k = top_k_arr[row_idx] == 0 ? d : top_k_arr[row_idx];
751- #if defined(PADDLE_WITH_COREX) || defined(__MC_PLATFORM_MXCC__) || \
752- defined (__MCC__) || defined (__MXCC__)
744+ #if defined(PADDLE_WITH_COREX) || defined(PADDLE_WITH_CUSTOM_DEVICE_METAX_GPU)
753745 double pivot = std::numeric_limits<float >::infinity (), normalizer = 1 ;
754746#else
755747 double pivot = -cuda::std::numeric_limits<float >::infinity (), normalizer = 1 ;
@@ -817,8 +809,7 @@ __global__ void TopKRenormProbKernel(DType* probs,
817809 }
818810 }
819811
820- #if defined(PADDLE_WITH_COREX) || defined(__MC_PLATFORM_MXCC__) || \
821- defined (__MCC__) || defined (__MXCC__)
812+ #if defined(PADDLE_WITH_COREX) || defined(PADDLE_WITH_CUSTOM_DEVICE_METAX_GPU)
822813 aggregate_gt_pivot_0 +=
823814 BlockReduce<ValueCount<float >, BLOCK_THREADS, REDUCE_ALGORITHM>(
824815 temp_storage.block_prim .reduce_value_count )
0 commit comments