Open
Description
Reproducer (from ClimaAtmos):
#=
julia --project=.buildkite
ENV["CLIMACOMMS_DEVICE"] = "CUDA"
using Revise; include("../cc_fusion_repro.jl")
=#
using ClimaComms, Test
ClimaComms.@import_required_backends
using LazyBroadcast: @lazy
using ClimaCore.CommonSpaces
using ClimaCore: Spaces, Fields, Geometry, ClimaCore, Operators
using ClimaCore.DataLayouts
const divₕ = Operators.Divergence()
const wgradₕ = Operators.WeakGradient()
const curlₕ = Operators.Curl()
const wcurlₕ = Operators.WeakCurl()
Base.@kwdef struct ViscousSponge{FT}
zd::FT
κ₂::FT
end
Base.Broadcast.broadcastable(x::ViscousSponge) = tuple(x)
Base.@kwdef struct RayleighSponge{FT}
zd::FT
α_uₕ::FT
α_w::FT
end
Base.Broadcast.broadcastable(x::RayleighSponge) = tuple(x)
αₘ(s::RayleighSponge{FT}, z, α) where {FT} = ifelse(z > s.zd, α, FT(0))
ζ_rayleigh(s::RayleighSponge{FT}, z, zmax) where {FT} = sin(FT(π) / 2 * (z - s.zd) / (zmax - s.zd))^2
β_rayleigh_uₕ(s::RayleighSponge{FT}, z, zmax) where {FT} = αₘ(s, z, s.α_uₕ) * ζ_rayleigh(s, z, zmax)
αₘ(s::ViscousSponge{FT}, z) where {FT} = ifelse(z > s.zd, s.κ₂, FT(0))
ζ_viscous(s::ViscousSponge{FT}, z, zmax) where {FT} = sin(FT(π) / 2 * (z - s.zd) / (zmax - s.zd))^2
β_viscous(s::ViscousSponge{FT}, z, zmax) where {FT} = αₘ(s, z) * ζ_viscous(s, z, zmax)
function rayleigh_sponge_tendency_uₕ(ᶜuₕ, s)
ᶜz = Fields.coordinate_field(axes(ᶜuₕ)).z
ᶠz = Fields.coordinate_field(Spaces.face_space(axes(ᶜuₕ))).z
zmax = Spaces.z_max(axes(ᶠz))
return @lazy @. -β_rayleigh_uₕ(s, ᶜz, zmax) * ᶜuₕ
end
function viscous_sponge_tendency_uₕ(ᶜuₕ, s)
ᶜz = Fields.coordinate_field(axes(ᶜuₕ)).z
ᶠz = Fields.coordinate_field(Spaces.face_space(axes(ᶜuₕ))).z
zmax = Spaces.z_max(axes(ᶠz))
return @lazy @. β_viscous(s, ᶜz, zmax) * (
wgradₕ(divₕ(ᶜuₕ)) - Geometry.project(
Geometry.Covariant12Axis(),
wcurlₕ(Geometry.project(Geometry.Covariant3Axis(), curlₕ(ᶜuₕ))),
)
)
end
FT = Float64;
horizontal_layout_type = DataLayouts.IJFH
# horizontal_layout_type = DataLayouts.IJHF
@info "horizontal_layout_type = $horizontal_layout_type"
ᶜspace = ExtrudedCubedSphereSpace(
FT;
z_elem = 30,
z_min = 0,
z_max = 1,
radius = 10,
h_elem = 15,
n_quad_points = 4,
horizontal_layout_type,
staggering = CellCenter(),
);
ᶠspace = Spaces.face_space(ᶜspace);
ᶜz = Fields.coordinate_field(ᶜspace).z;
ᶠz = Fields.coordinate_field(ᶠspace).z;
zmax = maximum(ᶠz);
vs = ViscousSponge{FT}(; zd = 0, κ₂ = 1);
ᶜuₕ = map(z -> zero(Geometry.Covariant12Vector{eltype(z)}), ᶜz);
ᶜuₕₜ = similar(ᶜuₕ);
@. ᶜuₕ.components.data.:1 = 1;
@. ᶜuₕ.components.data.:2 = 1;
rs = RayleighSponge(; zd = FT(0), α_uₕ = FT(1), α_w = FT(1));
rst = rayleigh_sponge_tendency_uₕ(ᶜuₕ, rs);
vst = viscous_sponge_tendency_uₕ(ᶜuₕ, vs);
function main_unfused(ᶜuₕₜ, rst, vst)
@. ᶜuₕₜ += vst
@. ᶜuₕₜ += rst
nothing
end
function main_fused(ᶜuₕₜ, rst, vst)
@. ᶜuₕₜ += vst + rst
nothing
end
using BenchmarkTools
main_fused(ᶜuₕₜ, rst, vst)
main_unfused(ᶜuₕₜ, rst, vst)
device = ClimaComms.device()
trial = @benchmark ClimaComms.@cuda_sync device main_unfused($ᶜuₕₜ, $rst, $vst)
show(stdout, MIME("text/plain"), trial)
trial = @benchmark ClimaComms.@cuda_sync device main_fused($ᶜuₕₜ, $rst, $vst)
show(stdout, MIME("text/plain"), trial)
nothing
Please note the toggle between
horizontal_layout_type = DataLayouts.IJFH, #
# horizontal_layout_type = DataLayouts.IJHF,
We should boil this down, and better understand why fusing hurts performance for the cartesian indexed kernels and improves performance for the linear indexed kernels.
We saw this in MultiBroadcastFusion, too, but, IIRC, that was across different broadcast expressions, this includes when we're fusing into a single broadcast expression, so this is slightly different, but is in agreement with the result found in MBF.jl.