Merge pull request #464 from JuliaGPU/tb/shfl

maleadt · web-flow · commit a4a56bd26e1c · 2019-09-25T14:45:34.000+02:00
Improvements to shfl
diff --git a/src/device/cuda/warp_shuffle.jl b/src/device/cuda/warp_shuffle.jl
@@ -2,8 +2,6 @@
 
 # TODO: does not work on sub-word (ie. Int16) or non-word divisible sized types
 
-# TODO: should shfl_idx conform to 1-based indexing?
-
 # TODO: these functions should dispatch based on the actual warp size
 const ws = Int32(32)
 
@@ -14,52 +12,48 @@ const ws = Int32(32)
 
 # "two packed values specifying a mask for logically splitting warps into sub-segments
 # and an upper bound for clamping the source lane index"
-@inline pack(width::UInt32, mask::UInt32)::UInt32 = (convert(UInt32, ws - width) << 8) | mask
+@inline pack(width, mask) = (convert(UInt32, ws - width) << 8) | convert(UInt32, mask)
 
 # NOTE: CUDA C disagrees with PTX on how shuffles are called
-for (name, mode, mask) in (("_up",   :up,   UInt32(0x00)),
-                           ("_down", :down, UInt32(0x1f)),
-                           ("_xor",  :bfly, UInt32(0x1f)),
-                           ("",      :idx,  UInt32(0x1f)))
+for (name, mode, mask, offset) in (("_up",   :up,   UInt32(0x00), src->src),
+                                   ("_down", :down, UInt32(0x1f), src->src),
+                                   ("_xor",  :bfly, UInt32(0x1f), src->src),
+                                   ("",      :idx,  UInt32(0x1f), src->:($src-1)))
     fname = Symbol("shfl$name")
+    @eval export $fname
 
     if cuda_driver_version >= v"9.0" && v"6.0" in ptx_support
-        instruction = Symbol("shfl.sync.$mode.b32")
-        fname_sync = Symbol("$(fname)_sync")
-
-        # TODO: implement using LLVM intrinsics when we have D38090
+        # newer hardware/CUDA versions use synchronizing intrinsics, which take an extra
+        # mask argument indicating which threads in the warp should be synchronized
+        intrinsic = "llvm.nvvm.shfl.sync.$mode.i32"
 
+        fname_sync = Symbol("$(fname)_sync")
+        __fname_sync = Symbol("__$(fname)_sync")
         @eval begin
-            export $fname_sync, $fname
-
-            @inline $fname_sync(val::UInt32, src::UInt32, width::UInt32=$ws,
-                                threadmask::UInt32=0xffffffff) =
-                @asmcall($"$instruction \$0, \$1, \$2, \$3, \$4;", "=r,r,r,r,r", true,
-                         UInt32, NTuple{4,UInt32},
-                         val, src, pack(width, $mask), threadmask)
-
-            # FIXME: replace this with a checked conversion once we have exceptions
-            @inline $fname_sync(val::UInt32, src::Integer, width::Integer=$ws,
-                                threadmask::UInt32=0xffffffff) =
-                $fname_sync(val, unsafe_trunc(UInt32, src), unsafe_trunc(UInt32, width),
-                            threadmask)
-
-            @inline $fname(val::UInt32, src::Integer, width::Integer=$ws) =
-                $fname_sync(val, src, width)
+            export $fname_sync
+
+            # HACK: recurse_value_invocation and friends split the first argument of a call,
+            #       so swap mask and val for these tools to work.
+            @inline $fname_sync(mask, val, src, width=$ws) =
+                $__fname_sync(val, mask, src, width)
+            @inline $__fname_sync(val::UInt32, mask, src, width) =
+                ccall($intrinsic, llvmcall, UInt32,
+                      (UInt32, UInt32, UInt32, UInt32),
+                      mask, val, $(offset(:src)), pack(width, $mask))
+
+            # for backwards compatibility, have the non-synchronizing intrinsic dispatch
+            # to the synchronizing one (with a full-warp default value for the mask)
+            @inline $fname(val::UInt32, src, width=$ws, mask::UInt32=0xffffffff) =
+                $fname_sync(mask, val, src, width)
         end
     else
-        intrinsic = Symbol("llvm.nvvm.shfl.$mode.i32")
+        intrinsic = "llvm.nvvm.shfl.$mode.i32"
 
         @eval begin
-            export $fname
-            @inline $fname(val::UInt32, src::UInt32, width::UInt32=$ws) =
-                ccall($"$intrinsic", llvmcall, UInt32,
+            @inline $fname(val::UInt32, src, width=$ws) =
+                ccall($intrinsic, llvmcall, UInt32,
                       (UInt32, UInt32, UInt32),
-                      val, src, pack(width, $mask))
-
-            # FIXME: replace this with a checked conversion once we have exceptions
-            @inline $fname(val::UInt32, src::Integer, width::Integer=$ws) =
-                $fname(val, unsafe_trunc(UInt32, src), unsafe_trunc(UInt32, width))
+                      val, $(offset(:src)), pack(width, $mask))
         end
     end
 end
@@ -71,62 +65,70 @@ for name in ["_up", "_down", "_xor", ""]
     fname = Symbol("shfl$name")
     @eval @inline $fname(src, args...) = recurse_value_invocation($fname, src, args...)
 
-    fname_sync = Symbol("$(fname)_sync")
-    @eval @inline $fname_sync(src, args...) = recurse_value_invocation($fname, src, args...)
+    fname_sync = Symbol("__$(fname)_sync")
+    @eval @inline $fname_sync(src, args...) = recurse_value_invocation($fname_sync, src, args...)
 end
 
 
 # documentation
 
 @doc """
-    shfl(val, lane::Integer, width::Integer=32)
+    shfl(val, lane::Integer, width::Integer=32, threadmask::UInt32=0xffffffff)
 
-Shuffle a value from a directly indexed lane `lane`.
+Shuffle a value from a directly indexed lane `lane`. The argument `threadmask` for selecting
+which threads to synchronize is only available on recent hardware, and defaults to all
+threads in the warp.
 """ shfl
 
 @doc """
-    shfl_up(val, delta::Integer, width::Integer=32)
+    shfl_up(val, delta::Integer, width::Integer=32, threadmask::UInt32=0xffffffff)
 
-Shuffle a value from a lane with lower ID relative to caller.
+Shuffle a value from a lane with lower ID relative to caller. The argument `threadmask` for
+selecting which threads to synchronize is only available on recent hardware, and defaults to
+all threads in the warp.
 """ shfl_up
 
 @doc """
-    shfl_down(val, delta::Integer, width::Integer=32)
+    shfl_down(val, delta::Integer, width::Integer=32, threadmask::UInt32=0xffffffff)
 
-Shuffle a value from a lane with higher ID relative to caller.
+Shuffle a value from a lane with higher ID relative to caller. The argument `threadmask` for
+selecting which threads to synchronize is only available on recent hardware, and defaults to
+all threads in the warp.
 """ shfl_down
 
 @doc """
-    shfl_xor(val, mask::Integer, width::Integer=32)
+    shfl_xor(val, lanemask::Integer, width::Integer=32, threadmask::UInt32=0xffffffff)
 
-Shuffle a value from a lane based on bitwise XOR of own lane ID with `mask`.
+Shuffle a value from a lane based on bitwise XOR of own lane ID with `lanemask`. The
+argument `threadmask` for selecting which threads to synchronize is only available on recent
+hardware, and defaults to all threads in the warp.
 """ shfl_xor
 
 
 @doc """
-    shfl_sync(val, lane::Integer, width::Integer=32, threadmask::UInt32=0xffffffff)
+    shfl_sync(threadmask::UInt32, val, lane::Integer, width::Integer=32)
 
-Shuffle a value from a directly indexed lane `lane`. The default value for `threadmask`
-performs the shuffle on all threads in the warp.
+Shuffle a value from a directly indexed lane `lane`, and synchronize threads according to
+`threadmask`.
 """ shfl_sync
 
 @doc """
-    shfl_up_sync(val, delta::Integer, width::Integer=32, threadmask::UInt32=0xffffffff)
+    shfl_up_sync(threadmask::UInt32, val, delta::Integer, width::Integer=32)
 
-Shuffle a value from a lane with lower ID relative to caller. The default value for
-`threadmask` performs the shuffle on all threads in the warp.
+Shuffle a value from a lane with lower ID relative to caller, and synchronize threads
+according to `threadmask`.
 """ shfl_up_sync
 
 @doc """
-    shfl_down_sync(val, delta::Integer, width::Integer=32, threadmask::UInt32=0xffffffff)
+    shfl_down_sync(threadmask::UInt32, val, delta::Integer, width::Integer=32)
 
-Shuffle a value from a lane with higher ID relative to caller. The default value for
-`threadmask` performs the shuffle on all threads in the warp.
+Shuffle a value from a lane with higher ID relative to caller, and synchronize threads
+according to `threadmask`.
 """ shfl_down_sync
 
 @doc """
-    shfl_xor_sync(val, mask::Integer, width::Integer=32, threadmask::UInt32=0xffffffff)
+    shfl_xor_sync(threadmask::UInt32, val, lanemask::Integer, width::Integer=32)
 
-Shuffle a value from a lane based on bitwise XOR of own lane ID with `mask`. The default
-value for `threadmask` performs the shuffle on all threads in the warp.
+Shuffle a value from a lane based on bitwise XOR of own lane ID with `lanemask`, and
+synchronize threads according to `threadmask`.
 """ shfl_xor_sync
diff --git a/test/device/cuda.jl b/test/device/cuda.jl
@@ -528,8 +528,25 @@ end
 @testset "data movement and conversion" begin
 
 if capability(dev) >= v"3.0"
-@testset "shuffle down" begin
 
+@testset "shuffle idx" begin
+    function kernel(d)
+        i = threadIdx().x
+        j = 32 - i + 1
+
+        d[i] = shfl(d[i], j)
+
+        return
+    end
+
+    warpsize = CUDAdrv.warpsize(device())
+
+    a = CuTestArray([i for i in 1:warpsize])
+    @cuda threads=warpsize kernel(a)
+    @test Array(a) == [i for i in warpsize:-1:1]
+end
+
+@testset "shuffle down" begin
     @eval struct AddableTuple
         x::Int32
         y::Int64
@@ -539,15 +556,38 @@ if capability(dev) >= v"3.0"
 
     n = 14
 
-    @testset for T in [Int32, Int64, Float32, Float64, AddableTuple]
-        function kernel(d::CuDeviceArray{T}, n) where {T}
-            t = threadIdx().x
-            if t <= n
-                d[t] += shfl_down(d[t], n÷2)
-            end
-            return
+    function kernel1(d::CuDeviceArray{T}, n) where {T}
+        t = threadIdx().x
+        if t <= n
+            d[t] += shfl_down(d[t], n÷2)
+        end
+        return
+    end
+
+    function kernel2(d::CuDeviceArray{T}, n) where {T}
+        t = threadIdx().x
+        if t <= n
+            d[t] += shfl_down(d[t], n÷2, 32, 0xffffffff)
         end
+        return
+    end
 
+    function kernel3(d::CuDeviceArray{T}, n) where {T}
+        t = threadIdx().x
+        if t <= n
+            d[t] += shfl_down_sync(0xffffffff, d[t], n÷2, 32)
+        end
+        return
+    end
+
+    kernels = try
+        getfield(CUDAnative, :shfl_sync)
+        (kernel1, kernel2, kernel3)
+    catch
+        (kernel1,)
+    end
+
+    @testset for T in [Int32, Int64, Float32, Float64, AddableTuple], kernel in kernels
         a = T[T(i) for i in 1:n]
         d_a = CuArray(a)
 
@@ -557,8 +597,8 @@ if capability(dev) >= v"3.0"
         a[1:n÷2] += a[n÷2+1:end]
         @test a == Array(d_a)
     end
-
 end
+
 end
 
 end