Skip to content

Commit 5709438

Browse files
zhangchenyi_dlneilzhuu
authored and committed
[Metax] Modify macros about rejection
1 parent 808f74e commit 5709438

File tree

2 files changed

+10
-19
lines changed

2 files changed

+10
-19
lines changed

custom_ops/gpu_ops/sample_kernels/sampling.cuh

Lines changed: 9 additions & 18 deletions
Original file line number | Diff line number | Diff line change
@@ -228,8 +228,7 @@ __device__ __forceinline__ void DeviceSamplingFromProb(
228228
prob_greater_than_threshold[j] = pred(prob_vec[j]) ? prob_vec[j] : 0;
229229
valid[j] = pred(prob_vec[j]) && (i * BLOCK_THREADS + tx) * VEC_SIZE + j < d;
230230
}
231-
#if defined(PADDLE_WITH_COREX) || defined(__MC_PLATFORM_MXCC__) || \
232-
defined(__MCC__) || defined(__MXCC__)
231+
#if defined(PADDLE_WITH_COREX) || defined(PADDLE_WITH_CUSTOM_DEVICE_METAX_GPU)
233232
float aggregate_local = BlockReduce<float, BLOCK_THREADS, REDUCE_ALGORITHM>(
234233
temp_storage->block_prim.reduce)
235234
.Sum(prob_greater_than_threshold);
@@ -252,8 +251,7 @@ __device__ __forceinline__ void DeviceSamplingFromProb(
252251
REDUCE_ALGORITHM>(
253252
prob_greater_than_threshold, inclusive_cdf, temp_storage);
254253
} else {
255-
#if defined(PADDLE_WITH_COREX) || defined(__MC_PLATFORM_MXCC__) || \
256-
defined(__MCC__) || defined(__MXCC__)
254+
#if defined(PADDLE_WITH_COREX) || defined(PADDLE_WITH_CUSTOM_DEVICE_METAX_GPU)
257255
BlockScan<float, BLOCK_THREADS, SCAN_ALGORITHM>(
258256
temp_storage->block_prim.scan)
259257
.InclusiveSum(prob_greater_than_threshold, inclusive_cdf);
@@ -273,8 +271,7 @@ __device__ __forceinline__ void DeviceSamplingFromProb(
273271

274272
bool greater_than_u_diff[VEC_SIZE];
275273
#ifdef SAMPLING_CUB_SUBTRACTLEFT_DEFINED
276-
#if defined(PADDLE_WITH_COREX) || defined(__MC_PLATFORM_MXCC__) || \
277-
defined(__MCC__) || defined(__MXCC__)
274+
#if defined(PADDLE_WITH_COREX) || defined(PADDLE_WITH_CUSTOM_DEVICE_METAX_GPU)
278275
BlockAdjacentDifference<bool, BLOCK_THREADS>(
279276
temp_storage->block_prim.adj_diff)
280277
.SubtractLeft(greater_than_u, greater_than_u_diff, BoolDiffOp());
@@ -285,8 +282,7 @@ __device__ __forceinline__ void DeviceSamplingFromProb(
285282
greater_than_u, greater_than_u_diff, BoolDiffOp());
286283
#endif
287284
#else
288-
#if defined(PADDLE_WITH_COREX) || defined(__MC_PLATFORM_MXCC__) || \
289-
defined(__MCC__) || defined(__MXCC__)
285+
#if defined(PADDLE_WITH_COREX) || defined(PADDLE_WITH_CUSTOM_DEVICE_METAX_GPU)
290286
BlockAdjacentDifference<bool, BLOCK_THREADS>(
291287
temp_storage->block_prim.adj_diff)
292288
.FlagHeads(greater_than_u_diff, greater_than_u, BoolDiffOp(), 0);
@@ -423,8 +419,7 @@ __global__ void TopKTopPSamplingFromProbKernel(DType* probs,
423419
(i * BLOCK_THREADS + tx) * VEC_SIZE + j < d)};
424420
}
425421

426-
#if defined(PADDLE_WITH_COREX) || defined(__MC_PLATFORM_MXCC__) || \
427-
defined(__MCC__) || defined(__MXCC__)
422+
#if defined(PADDLE_WITH_COREX) || defined(PADDLE_WITH_CUSTOM_DEVICE_METAX_GPU)
428423
aggregate_gt_pivot_0 += BlockReduce<ValueCount<float>, BLOCK_THREADS>(
429424
temp_storage.block_prim.reduce_value_count)
430425
.Sum(probs_gt_pivot_0);
@@ -563,8 +558,7 @@ __global__ void TopPSamplingFromProbKernel(DType* probs,
563558
probs_gt_pivot_1[j] = (probs_vec[j] > pivot_1) ? probs_vec[j] : 0;
564559
}
565560

566-
#if defined(PADDLE_WITH_COREX) || defined(__MC_PLATFORM_MXCC__) || \
567-
defined(__MCC__) || defined(__MXCC__)
561+
#if defined(PADDLE_WITH_COREX) || defined(PADDLE_WITH_CUSTOM_DEVICE_METAX_GPU)
568562
aggregate_gt_pivot_0 +=
569563
BlockReduce<float, BLOCK_THREADS>(temp_storage.block_prim.reduce)
570564
.Sum(probs_gt_pivot_0);
@@ -638,8 +632,7 @@ __device__ __forceinline__ float GetMaxValue(float* in_data,
638632
for (uint32_t j = 0; j < VEC_SIZE; ++j) {
639633
in_data_[j] = in_data_vec[j];
640634
}
641-
#if defined(PADDLE_WITH_COREX) || defined(__MC_PLATFORM_MXCC__) || \
642-
defined(__MCC__) || defined(__MXCC__)
635+
#if defined(PADDLE_WITH_COREX) || defined(PADDLE_WITH_CUSTOM_DEVICE_METAX_GPU)
643636
max_val = max(max_val,
644637
BlockReduce<float, BLOCK_THREADS, REDUCE_ALGORITHM>(
645638
temp_storage.block_prim.reduce)
@@ -748,8 +741,7 @@ __global__ void TopKRenormProbKernel(DType* probs,
748741
const uint32_t bx = blockIdx.x, tx = threadIdx.x;
749742
const uint32_t row_idx = bx;
750743
const uint32_t k = top_k_arr[row_idx] == 0 ? d : top_k_arr[row_idx];
751-
#if defined(PADDLE_WITH_COREX) || defined(__MC_PLATFORM_MXCC__) || \
752-
defined(__MCC__) || defined(__MXCC__)
744+
#if defined(PADDLE_WITH_COREX) || defined(PADDLE_WITH_CUSTOM_DEVICE_METAX_GPU)
753745
double pivot = std::numeric_limits<float>::infinity(), normalizer = 1;
754746
#else
755747
double pivot = -cuda::std::numeric_limits<float>::infinity(), normalizer = 1;
@@ -817,8 +809,7 @@ __global__ void TopKRenormProbKernel(DType* probs,
817809
}
818810
}
819811

820-
#if defined(PADDLE_WITH_COREX) || defined(__MC_PLATFORM_MXCC__) || \
821-
defined(__MCC__) || defined(__MXCC__)
812+
#if defined(PADDLE_WITH_COREX) || defined(PADDLE_WITH_CUSTOM_DEVICE_METAX_GPU)
822813
aggregate_gt_pivot_0 +=
823814
BlockReduce<ValueCount<float>, BLOCK_THREADS, REDUCE_ALGORITHM>(
824815
temp_storage.block_prim.reduce_value_count)

custom_ops/gpu_ops/sample_kernels/utils.cuh

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -275,7 +275,7 @@ __forceinline__ __device__ float ptx_rcp(float x) {
275275
#ifdef PADDLE_WITH_COREX
276276
return __ivcorex_rcpf(x);
277277
#else
278-
#if defined(__MC_PLATFORM_MXCC__) || defined(__MCC__) || defined(__MXCC__)
278+
#ifdef PADDLE_WITH_CUSTOM_DEVICE_METAX_GPU
279279
return __frcp_rn(x);
280280
#else
281281
float y;

0 commit comments

Comments
 (0)