diff --git a/API/src/util.t b/API/src/util.t index 0085b51b7..8efab26dd 100644 --- a/API/src/util.t +++ b/API/src/util.t @@ -522,6 +522,7 @@ __syncthreads = cudalib.nvvm_barrier0 local __shfl_down +local __shfl_down_sync if opt_float == float then @@ -537,6 +538,14 @@ if opt_float == float then ret = terralib.asm(float,"shfl.down.b32 $0, $1, $2, $3;","=f,f,r,r", true, v, delta, c) return ret; end + + terra __shfl_down_sync(mask : uint, v : float, delta : uint, width : int) + var ret : float; + var c : int; + c = ((warpSize-width) << 8) or 0x1F; + ret = terralib.asm(float,"shfl.sync.down.b32 $0, $1, $2, $3, $4;","=f,f,r,r,r", true, v, delta, c, mask) + return ret; + end else struct ULLDouble { union { @@ -606,6 +615,17 @@ else ret.u2.y = terralib.asm(uint32,"shfl.down.b32 $0, $1, $2, $3;","=f,f,r,r", true, init.u2.y, delta, c) return ret.d; end + + terra __shfl_down_sync(mask: uint, v : double, delta : uint, width : int) + var ret : uint2Double; + var init : uint2Double; + init.d = v + var c : int; + c = ((warpSize-width) << 8) or 0x1F; + ret.u2.x = terralib.asm(uint32,"shfl.sync.down.b32 $0, $1, $2, $3, $4;","=f,f,r,r,r", true, init.u2.x, delta, c, mask) + ret.u2.y = terralib.asm(uint32,"shfl.sync.down.b32 $0, $1, $2, $3, $4;","=f,f,r,r,r", true, init.u2.y, delta, c, mask) + return ret.d; + end end -- Using the "Kepler Shuffle", see http://devblogs.nvidia.com/parallelforall/faster-parallel-reductions-kepler/ diff --git a/examples/arap_mesh_deformation/src/WarpingSolverUtil.h b/examples/arap_mesh_deformation/src/WarpingSolverUtil.h index 57276b663..9ed0abb8d 100644 --- a/examples/arap_mesh_deformation/src/WarpingSolverUtil.h +++ b/examples/arap_mesh_deformation/src/WarpingSolverUtil.h @@ -12,7 +12,7 @@ __inline__ __device__ float warpReduce(float val) { int offset = 32 >> 1; while (offset > 0) { - val = val + __shfl_down(val, offset, 32); + val = val + __shfl_down_sync(0xffffffff, val, offset, 32); offset = offset >> 1; } return val; diff --git a/examples/image_warping/src/WarpingSolverUtil.h b/examples/image_warping/src/WarpingSolverUtil.h index 31eab308b..0cc9bf68f 100644 --- a/examples/image_warping/src/WarpingSolverUtil.h +++ b/examples/image_warping/src/WarpingSolverUtil.h @@ -12,7 +12,7 @@ __inline__ __device__ float warpReduce(float val) { int offset = 32 >> 1; while (offset > 0) { - val = val + __shfl_down(val, offset, 32); + val = val + __shfl_down_sync(0xffffffff, val, offset, 32); offset = offset >> 1; } return val; diff --git a/examples/poisson_image_editing/src/PatchSolverWarpingUtil.h b/examples/poisson_image_editing/src/PatchSolverWarpingUtil.h index 088070160..4f66d2818 100644 --- a/examples/poisson_image_editing/src/PatchSolverWarpingUtil.h +++ b/examples/poisson_image_editing/src/PatchSolverWarpingUtil.h @@ -105,7 +105,7 @@ __inline__ __device__ void loadPatchToCache(volatile float* cache, float* data, __inline__ __device__ float warpReduce(float val) { int offset = 32 >> 1; while (offset > 0) { - val = val + __shfl_down(val, offset, 32); + val = val + __shfl_down_sync(0xffffffff, val, offset, 32); offset = offset >> 1; } return val; diff --git a/examples/poisson_image_editing/src/WarpingSolverUtil.h b/examples/poisson_image_editing/src/WarpingSolverUtil.h index 31eab308b..0cc9bf68f 100644 --- a/examples/poisson_image_editing/src/WarpingSolverUtil.h +++ b/examples/poisson_image_editing/src/WarpingSolverUtil.h @@ -12,7 +12,7 @@ __inline__ __device__ float warpReduce(float val) { int offset = 32 >> 1; while (offset > 0) { - val = val + __shfl_down(val, offset, 32); + val = val + __shfl_down_sync(0xffffffff, val, offset, 32); offset = offset >> 1; } return val; diff --git a/examples/shape_from_shading/src/SFSSolverUtil.h b/examples/shape_from_shading/src/SFSSolverUtil.h index 75cfc7be2..63bf62a59 100644 --- a/examples/shape_from_shading/src/SFSSolverUtil.h +++ b/examples/shape_from_shading/src/SFSSolverUtil.h @@ -18,7 +18,7 @@ __inline__ __device__ bool IsValidPoint(float d) __inline__ __device__ float warpReduce(float val) { int offset = 32 >> 1; while (offset > 0) { - val = val + __shfl_down(val, offset, 32); + val = val + __shfl_down_sync(0xffffffff, val, offset, 32); offset = offset >> 1; } return val; diff --git a/examples/volumetric_mesh_deformation/src/WarpingSolverUtil.h b/examples/volumetric_mesh_deformation/src/WarpingSolverUtil.h index 18bc580db..13c4c7da1 100644 --- a/examples/volumetric_mesh_deformation/src/WarpingSolverUtil.h +++ b/examples/volumetric_mesh_deformation/src/WarpingSolverUtil.h @@ -12,7 +12,7 @@ __inline__ __device__ float warpReduce(float val) { int offset = 32 >> 1; while (offset > 0) { - val = val + __shfl_down(val, offset, 32); + val = val + __shfl_down_sync(0xffffffff, val, offset, 32); offset = offset >> 1; } return val;