Skip to content

Commit

Permalink
Add CUDA MapReduce support for VF DataLayout
Browse files Browse the repository at this point in the history
  • Loading branch information
sriharshakandala authored and charleskawczynski committed May 12, 2023
1 parent ce08ede commit 612bcc2
Show file tree
Hide file tree
Showing 3 changed files with 101 additions and 44 deletions.
25 changes: 17 additions & 8 deletions src/Fields/mapreduce_cuda.jl
Original file line number Diff line number Diff line change
Expand Up @@ -64,20 +64,18 @@ function mapreduce_cuda(
weighting = false,
opargs...,
) where {
Nij,
A <: AbstractArray,
V <:
Union{DataLayouts.IJFH{<:Any, Nij, A}, DataLayouts.VIJFH{<:Any, Nij, A}},
S,
V <: Union{DataLayouts.VF{S}, DataLayouts.IJFH{S}, DataLayouts.VIJFH{S}},
}
data = Fields.field_values(field)
pdata = parent(data)
T = eltype(pdata)
(_, _, _, Nv, Nh) = size(data)
(Ni, Nj, Nk, Nv, Nh) = size(data)
Nf = div(length(pdata), prod(size(data))) # length of field dimension
wt = Spaces.weighted_jacobian(axes(field)) # wt always IJFH layout
wt = Spaces.weighted_jacobian(axes(field))
pwt = parent(wt)

nitems = Nv * Nij * Nij * Nh
nitems = Nv * Ni * Nj * Nk * Nh
max_threads = 256# 512 1024
nthreads = min(max_threads, nitems)
# perform n ops during loading to shmem (this is a tunable parameter)
Expand Down Expand Up @@ -108,7 +106,11 @@ function mapreduce_cuda(
Val(shmemsize),
)
end
return Array(Array(reduce_cuda)[1, :])[1]
if Nf == 1
return Array(reduce_cuda)[1, 1]
else
return DataLayouts.DataF{S}(CuArray(view(reduce_cuda, 1, :)[:]))
end
end

function mapreduce_cuda_kernel!(
Expand Down Expand Up @@ -178,12 +180,19 @@ end
(bidx - 1) * effective_blksize +
(fidx - 1) * effective_blksize * nblk
end
# for VF DataLayout
@inline function _get_dims(pdata::AbstractArray{FT, 2}) where {FT}
(Nv, Nf) = size(pdata)
return (Nv, 1, Nf, 1)
end

# for IJFH DataLayout
@inline function _get_dims(pdata::AbstractArray{FT, 4}) where {FT}
(Nij, _, Nf, Nh) = size(pdata)
return (1, Nij, Nf, Nh)
end

# for VIJFH DataLayout
@inline function _get_dims(pdata::AbstractArray{FT, 5}) where {FT}
(Nv, Nij, _, Nf, Nh) = size(pdata)
return (Nv, Nij, Nf, Nh)
Expand Down
69 changes: 67 additions & 2 deletions test/Fields/reduction_cuda.jl
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ end
Nq = 4
ndof = ne * ne * 6 * Nq * Nq
println(
"running reduction test on $(context.device) device; problem size Ne = $ne; Nq = $Nq; ndof = $ndof; FT = $FT",
"running reduction test on $(context.device); problem size Ne = $ne; Nq = $Nq; ndof = $ndof; FT = $FT",
)
R = FT(6.37122e6) # radius of earth
domain = Domains.SphereDomain(R)
Expand Down Expand Up @@ -140,7 +140,7 @@ end
z_elem = 10
h_elem = 4
println(
"running reduction test on $(context.device) device; h_elem = $h_elem; z_elem = $z_elem; npoly = $npoly; R = $R; z_max = $z_max; FT = $FT",
"running reduction test on $(context.device); h_elem = $h_elem; z_elem = $z_elem; npoly = $npoly; R = $R; z_max = $z_max; FT = $FT",
)
# horizontal space
domain = Domains.SphereDomain(R)
Expand Down Expand Up @@ -229,3 +229,68 @@ end
@test LinearAlgebra.norm(Yf, 3) LinearAlgebra.norm(Yf_cpu, 3)
@test LinearAlgebra.norm(Yf, Inf) LinearAlgebra.norm(Yf_cpu, Inf)
end

@testset "test cuda reduction op for single column" begin
FT = Float64
context = ClimaComms.SingletonCommsContext(ClimaComms.CUDADevice())
context_cpu = ClimaComms.SingletonCommsContext(ClimaComms.CPUDevice()) # CPU context for comparison

z0 = FT(0)
z_max = FT(2 * π)
z_nelems = 16

println(
"running column reduction test on $(context.device); z_nelems = $z_nelems; z0 = $z0; z_max = $z_max; FT = $FT",
)

domain = Domains.IntervalDomain(
Geometry.ZPoint{FT}(z0),
Geometry.ZPoint{FT}(z_max);
boundary_tags = (:left, :right),
)

z_mesh = Meshes.IntervalMesh(domain; nelems = z_nelems)
z_topology = Topologies.IntervalTopology(context, z_mesh)
z_topology_cpu = Topologies.IntervalTopology(context_cpu, z_mesh)


center_space = Spaces.CenterFiniteDifferenceSpace(z_topology)
face_space = Spaces.FaceFiniteDifferenceSpace(center_space)

center_space_cpu = Spaces.CenterFiniteDifferenceSpace(z_topology_cpu)
face_space_cpu = Spaces.FaceFiniteDifferenceSpace(center_space_cpu)

Yc = ones(FT, center_space)
Yc_cpu = ones(FT, center_space_cpu)

Yf = ones(FT, face_space)
Yf_cpu = ones(FT, face_space_cpu)

resultc_cpu = Base.sum(Yc_cpu)
resultc = Base.sum(Yc)

resultf_cpu = Base.sum(Yf_cpu)
resultf = Base.sum(Yf)
# test weighted sum
@test resultc resultc_cpu
@test resultc z_max - z0

@test resultf resultf_cpu
@test resultf z_max - z0

Yc = sin.(getproperty(Fields.coordinate_field(center_space), :z)) .+ 1
Yc_cpu =
sin.(getproperty(Fields.coordinate_field(center_space_cpu), :z)) .+ 1


@test Base.maximum(identity, Yc) Base.maximum(identity, Yc_cpu)
@test Base.minimum(identity, Yc) Base.minimum(identity, Yc_cpu)

@test Statistics.mean(Yc) Statistics.mean(Yc_cpu)

@test LinearAlgebra.norm(Yc, 1) LinearAlgebra.norm(Yc_cpu, 1)
@test LinearAlgebra.norm(Yc, 2) LinearAlgebra.norm(Yc_cpu, 2)
@test LinearAlgebra.norm(Yc, 3) LinearAlgebra.norm(Yc_cpu, 3)

@test LinearAlgebra.norm(Yc, Inf) LinearAlgebra.norm(Yc_cpu, Inf)
end
51 changes: 17 additions & 34 deletions test/Operators/finitedifference/column.jl
Original file line number Diff line number Diff line change
Expand Up @@ -26,28 +26,22 @@ device = ClimaComms.device()
center_space = Spaces.CenterFiniteDifferenceSpace(topology)
face_space = Spaces.FaceFiniteDifferenceSpace(center_space)

@test sum(ones(FT, center_space)) pi broken =
device isa ClimaComms.CUDADevice
@test sum(ones(FT, face_space)) pi broken =
device isa ClimaComms.CUDADevice
@test sum(ones(FT, center_space)) pi
@test sum(ones(FT, face_space)) pi

centers = getproperty(Fields.coordinate_field(center_space), :z)
@test sum(sin.(centers)) FT(2.0) atol = 1e-2 broken =
device isa ClimaComms.CUDADevice
@test sum(sin.(centers)) FT(2.0) atol = 1e-2

faces = getproperty(Fields.coordinate_field(face_space), :z)
@test sum(sin.(faces)) FT(2.0) atol = 1e-2 broken =
device isa ClimaComms.CUDADevice
@test sum(sin.(faces)) FT(2.0) atol = 1e-2

∇ᶜ = Operators.GradientF2C()
∂sin = Geometry.WVector.(∇ᶜ.(sin.(faces)))
@test ∂sin Geometry.WVector.(cos.(centers)) atol = 1e-2 broken =
device isa ClimaComms.CUDADevice
@test ∂sin Geometry.WVector.(cos.(centers)) atol = 1e-2

divᶜ = Operators.DivergenceF2C()
∂sin = divᶜ.(Geometry.WVector.(sin.(faces)))
@test ∂sin cos.(centers) atol = 1e-2 broken =
device isa ClimaComms.CUDADevice
@test ∂sin cos.(centers) atol = 1e-2

# Center -> Face operator
# first order convergence at boundaries
Expand All @@ -56,24 +50,21 @@ device = ClimaComms.device()
right = Operators.SetValue(FT(pi)),
)
∂z = Geometry.WVector.(∇ᶠ.(centers))
@test ∂z Geometry.WVector.(ones(FT, face_space)) rtol = 10 * eps(FT) broken =
device isa ClimaComms.CUDADevice
@test ∂z Geometry.WVector.(ones(FT, face_space)) rtol = 10 * eps(FT)

∇ᶠ = Operators.GradientC2F(
left = Operators.SetValue(FT(1)),
right = Operators.SetValue(FT(-1)),
)
∂cos = Geometry.WVector.(∇ᶠ.(cos.(centers)))
@test ∂cos Geometry.WVector.(.-sin.(faces)) atol = 1e-1 broken =
device isa ClimaComms.CUDADevice
@test ∂cos Geometry.WVector.(.-sin.(faces)) atol = 1e-1

∇ᶠ = Operators.GradientC2F(
left = Operators.SetGradient(Geometry.WVector(FT(0))),
right = Operators.SetGradient(Geometry.WVector(FT(0))),
)
∂cos = Geometry.WVector.(∇ᶠ.(cos.(centers)))
@test ∂cos Geometry.WVector.(.-sin.(faces)) atol = 1e-2 broken =
device isa ClimaComms.CUDADevice
@test ∂cos Geometry.WVector.(.-sin.(faces)) atol = 1e-2

# test that broadcasting into incorrect field space throws an error
empty_centers = zeros(FT, center_space)
Expand All @@ -100,40 +91,32 @@ end
center_space = Spaces.CenterFiniteDifferenceSpace(topology)
face_space = Spaces.FaceFiniteDifferenceSpace(center_space)

@test sum(ones(FT, center_space)) 2pi broken =
device isa ClimaComms.CUDADevice
@test sum(ones(FT, face_space)) 2pi broken =
device isa ClimaComms.CUDADevice
@test sum(ones(FT, center_space)) 2pi
@test sum(ones(FT, face_space)) 2pi

sinz_c = sin.(Fields.coordinate_field(center_space).z)
cosz_c = cos.(Fields.coordinate_field(center_space).z)
@test sum(sinz_c) FT(0.0) atol = 1e-2 broken =
device isa ClimaComms.CUDADevice
@test sum(sinz_c) FT(0.0) atol = 1e-2

sinz_f = sin.(Fields.coordinate_field(face_space).z)
cosz_f = cos.(Fields.coordinate_field(face_space).z)
@test sum(sinz_f) FT(0.0) atol = 1e-2 broken =
device isa ClimaComms.CUDADevice
@test sum(sinz_f) FT(0.0) atol = 1e-2

∇ᶜ = Operators.GradientF2C()
∂sin = Geometry.WVector.(∇ᶜ.(sinz_f))
@test ∂sin Geometry.WVector.(cosz_c) atol = 1e-2 broken =
device isa ClimaComms.CUDADevice
@test ∂sin Geometry.WVector.(cosz_c) atol = 1e-2

divᶜ = Operators.DivergenceF2C()
∂sin = divᶜ.(Geometry.WVector.(sinz_f))
@test ∂sin cosz_c atol = 1e-2 broken =
device isa ClimaComms.CUDADevice
@test ∂sin cosz_c atol = 1e-2

∇ᶠ = Operators.GradientC2F()
∂cos = Geometry.WVector.(∇ᶠ.(cosz_c))
@test ∂cos Geometry.WVector.(.-sinz_f) atol = 1e-1 broken =
device isa ClimaComms.CUDADevice
@test ∂cos Geometry.WVector.(.-sinz_f) atol = 1e-1

∇ᶠ = Operators.GradientC2F()
∂cos = Geometry.WVector.(∇ᶠ.(cosz_c))
@test ∂cos Geometry.WVector.(.-sinz_f) atol = 1e-2 broken =
device isa ClimaComms.CUDADevice
@test ∂cos Geometry.WVector.(.-sinz_f) atol = 1e-2

# test that broadcasting into incorrect field space throws an error
empty_centers = zeros(FT, center_space)
Expand Down

0 comments on commit 612bcc2

Please sign in to comment.