diff --git a/API/src/util.t b/API/src/util.t
index 0085b51b7..8efab26dd 100644
--- a/API/src/util.t
+++ b/API/src/util.t
@@ -522,6 +522,7 @@ __syncthreads = cudalib.nvvm_barrier0
 
 
 local __shfl_down 
+local __shfl_down_sync
 
 if opt_float == float then
 
@@ -537,6 +538,14 @@ if opt_float == float then
     	ret = terralib.asm(float,"shfl.down.b32 $0, $1, $2, $3;","=f,f,r,r", true, v, delta, c)
     	return ret;
     end
+
+    terra __shfl_down_sync(mask : uint, v : float, delta : uint, width : int)
+        var ret : float;
+        var c : int;
+    	c = ((warpSize-width) << 8) or 0x1F;
+    	ret = terralib.asm(float,"shfl.sync.down.b32 $0, $1, $2, $3, $4;","=f,f,r,r,r", true, v, delta, c, mask)
+    	return ret;
+    end
 else
     struct ULLDouble {
         union {
@@ -606,6 +615,17 @@ else
         ret.u2.y = terralib.asm(uint32,"shfl.down.b32 $0, $1, $2, $3;","=f,f,r,r", true, init.u2.y, delta, c)
         return ret.d;
     end
+
+    terra __shfl_down_sync(mask: uint, v : double, delta : uint, width : int)
+        var ret : uint2Double;
+        var init : uint2Double;
+        init.d = v
+        var c : int;
+        c = ((warpSize-width) << 8) or 0x1F;
+        ret.u2.x = terralib.asm(uint32,"shfl.sync.down.b32 $0, $1, $2, $3, $4;","=f,f,r,r,r", true, init.u2.x, delta, c, mask)
+        ret.u2.y = terralib.asm(uint32,"shfl.sync.down.b32 $0, $1, $2, $3, $4;","=f,f,r,r,r", true, init.u2.y, delta, c, mask)
+        return ret.d;
+    end
 end
 
 -- Using the "Kepler Shuffle", see http://devblogs.nvidia.com/parallelforall/faster-parallel-reductions-kepler/
diff --git a/examples/arap_mesh_deformation/src/WarpingSolverUtil.h b/examples/arap_mesh_deformation/src/WarpingSolverUtil.h
index 57276b663..9ed0abb8d 100644
--- a/examples/arap_mesh_deformation/src/WarpingSolverUtil.h
+++ b/examples/arap_mesh_deformation/src/WarpingSolverUtil.h
@@ -12,7 +12,7 @@
 __inline__ __device__ float warpReduce(float val) {
     int offset = 32 >> 1;
     while (offset > 0) {
-        val = val + __shfl_down(val, offset, 32);
+        val = val + __shfl_down_sync(0xffffffff, val, offset, 32);
         offset = offset >> 1;
     }
     return val;
diff --git a/examples/image_warping/src/WarpingSolverUtil.h b/examples/image_warping/src/WarpingSolverUtil.h
index 31eab308b..0cc9bf68f 100644
--- a/examples/image_warping/src/WarpingSolverUtil.h
+++ b/examples/image_warping/src/WarpingSolverUtil.h
@@ -12,7 +12,7 @@
 __inline__ __device__ float warpReduce(float val) {
     int offset = 32 >> 1;
     while (offset > 0) {
-        val = val + __shfl_down(val, offset, 32);
+        val = val + __shfl_down_sync(0xffffffff, val, offset, 32);
         offset = offset >> 1;
     }
     return val;
diff --git a/examples/poisson_image_editing/src/PatchSolverWarpingUtil.h b/examples/poisson_image_editing/src/PatchSolverWarpingUtil.h
index 088070160..4f66d2818 100644
--- a/examples/poisson_image_editing/src/PatchSolverWarpingUtil.h
+++ b/examples/poisson_image_editing/src/PatchSolverWarpingUtil.h
@@ -105,7 +105,7 @@ __inline__ __device__ void loadPatchToCache(volatile float* cache, float* data,
 __inline__ __device__ float warpReduce(float val) {
 	int offset = 32 >> 1;
 	while (offset > 0) {
-		val = val + __shfl_down(val, offset, 32);
+		val = val + __shfl_down_sync(0xffffffff, val, offset, 32);
 		offset = offset >> 1;
 	}
 	return val;
diff --git a/examples/poisson_image_editing/src/WarpingSolverUtil.h b/examples/poisson_image_editing/src/WarpingSolverUtil.h
index 31eab308b..0cc9bf68f 100644
--- a/examples/poisson_image_editing/src/WarpingSolverUtil.h
+++ b/examples/poisson_image_editing/src/WarpingSolverUtil.h
@@ -12,7 +12,7 @@
 __inline__ __device__ float warpReduce(float val) {
     int offset = 32 >> 1;
     while (offset > 0) {
-        val = val + __shfl_down(val, offset, 32);
+        val = val + __shfl_down_sync(0xffffffff, val, offset, 32);
         offset = offset >> 1;
     }
     return val;
diff --git a/examples/shape_from_shading/src/SFSSolverUtil.h b/examples/shape_from_shading/src/SFSSolverUtil.h
index 75cfc7be2..63bf62a59 100644
--- a/examples/shape_from_shading/src/SFSSolverUtil.h
+++ b/examples/shape_from_shading/src/SFSSolverUtil.h
@@ -18,7 +18,7 @@ __inline__ __device__ bool IsValidPoint(float d)
 __inline__ __device__ float warpReduce(float val) {
     int offset = 32 >> 1;
     while (offset > 0) {
-        val = val + __shfl_down(val, offset, 32);
+        val = val + __shfl_down_sync(0xffffffff, val, offset, 32);
         offset = offset >> 1;
     }
     return val;
diff --git a/examples/volumetric_mesh_deformation/src/WarpingSolverUtil.h b/examples/volumetric_mesh_deformation/src/WarpingSolverUtil.h
index 18bc580db..13c4c7da1 100644
--- a/examples/volumetric_mesh_deformation/src/WarpingSolverUtil.h
+++ b/examples/volumetric_mesh_deformation/src/WarpingSolverUtil.h
@@ -12,7 +12,7 @@
 __inline__ __device__ float warpReduce(float val) {
     int offset = 32 >> 1;
     while (offset > 0) {
-        val = val + __shfl_down(val, offset, 32);
+        val = val + __shfl_down_sync(0xffffffff, val, offset, 32);
         offset = offset >> 1;
     }
     return val;