Skip to content

Fusing FH datalayouts is slower than fusing HF datalayouts #2165

Open
@charleskawczynski

Description

@charleskawczynski

Reproducer (from ClimaAtmos):

#=
julia --project=.buildkite
ENV["CLIMACOMMS_DEVICE"] = "CUDA"
using Revise; include("../cc_fusion_repro.jl")
=#
using ClimaComms, Test
ClimaComms.@import_required_backends
using LazyBroadcast: @lazy
using ClimaCore.CommonSpaces
using ClimaCore: Spaces, Fields, Geometry, ClimaCore, Operators
using ClimaCore.DataLayouts

const divₕ = Operators.Divergence()
const wgradₕ = Operators.WeakGradient()
const curlₕ = Operators.Curl()
const wcurlₕ = Operators.WeakCurl()

Base.@kwdef struct ViscousSponge{FT}
    zd::FT
    κ₂::FT
end
Base.Broadcast.broadcastable(x::ViscousSponge) = tuple(x)
Base.@kwdef struct RayleighSponge{FT}
    zd::FT
    α_uₕ::FT
    α_w::FT
end
Base.Broadcast.broadcastable(x::RayleighSponge) = tuple(x)

αₘ(s::RayleighSponge{FT}, z, α) where {FT} = ifelse(z > s.zd, α, FT(0))
ζ_rayleigh(s::RayleighSponge{FT}, z, zmax) where {FT} = sin(FT(π) / 2 * (z - s.zd) / (zmax - s.zd))^2
β_rayleigh_uₕ(s::RayleighSponge{FT}, z, zmax) where {FT} = αₘ(s, z, s.α_uₕ) * ζ_rayleigh(s, z, zmax)

αₘ(s::ViscousSponge{FT}, z) where {FT} = ifelse(z > s.zd, s.κ₂, FT(0))
ζ_viscous(s::ViscousSponge{FT}, z, zmax) where {FT} = sin(FT(π) / 2 * (z - s.zd) / (zmax - s.zd))^2
β_viscous(s::ViscousSponge{FT}, z, zmax) where {FT} = αₘ(s, z) * ζ_viscous(s, z, zmax)

function rayleigh_sponge_tendency_uₕ(ᶜuₕ, s)
    ᶜz = Fields.coordinate_field(axes(ᶜuₕ)).z
    ᶠz = Fields.coordinate_field(Spaces.face_space(axes(ᶜuₕ))).z
    zmax = Spaces.z_max(axes(ᶠz))
    return @lazy @. -β_rayleigh_uₕ(s, ᶜz, zmax) * ᶜuₕ
end

function viscous_sponge_tendency_uₕ(ᶜuₕ, s)
    ᶜz = Fields.coordinate_field(axes(ᶜuₕ)).z
    ᶠz = Fields.coordinate_field(Spaces.face_space(axes(ᶜuₕ))).z
    zmax = Spaces.z_max(axes(ᶠz))
    return @lazy @. β_viscous(s, ᶜz, zmax) * (
        wgradₕ(divₕ(ᶜuₕ)) - Geometry.project(
            Geometry.Covariant12Axis(),
            wcurlₕ(Geometry.project(Geometry.Covariant3Axis(), curlₕ(ᶜuₕ))),
        )
    )
end

FT = Float64;
horizontal_layout_type = DataLayouts.IJFH
# horizontal_layout_type = DataLayouts.IJHF
@info "horizontal_layout_type = $horizontal_layout_type"
ᶜspace = ExtrudedCubedSphereSpace(
    FT;
    z_elem = 30,
    z_min = 0,
    z_max = 1,
    radius = 10,
    h_elem = 15,
    n_quad_points = 4,
    horizontal_layout_type,
    staggering = CellCenter(),
);
ᶠspace = Spaces.face_space(ᶜspace);
ᶜz = Fields.coordinate_field(ᶜspace).z;
ᶠz = Fields.coordinate_field(ᶠspace).z;
zmax = maximum(ᶠz);
vs = ViscousSponge{FT}(; zd = 0, κ₂ = 1);
ᶜuₕ = map(z -> zero(Geometry.Covariant12Vector{eltype(z)}), ᶜz);
ᶜuₕₜ = similar(ᶜuₕ);
@. ᶜuₕ.components.data.:1 = 1;
@. ᶜuₕ.components.data.:2 = 1;
rs = RayleighSponge(; zd = FT(0), α_uₕ = FT(1), α_w = FT(1));
rst = rayleigh_sponge_tendency_uₕ(ᶜuₕ, rs);
vst = viscous_sponge_tendency_uₕ(ᶜuₕ, vs);
function main_unfused(ᶜuₕₜ, rst, vst)
    @. ᶜuₕₜ += vst
    @. ᶜuₕₜ += rst
    nothing
end
function main_fused(ᶜuₕₜ, rst, vst)
    @. ᶜuₕₜ += vst + rst
    nothing
end

using BenchmarkTools
main_fused(ᶜuₕₜ, rst, vst)
main_unfused(ᶜuₕₜ, rst, vst)
device = ClimaComms.device()
trial = @benchmark ClimaComms.@cuda_sync device main_unfused($ᶜuₕₜ, $rst, $vst)
show(stdout, MIME("text/plain"), trial)
trial = @benchmark ClimaComms.@cuda_sync device main_fused($ᶜuₕₜ, $rst, $vst)
show(stdout, MIME("text/plain"), trial)
nothing

Please note the toggle between

    horizontal_layout_type = DataLayouts.IJFH, # 
    # horizontal_layout_type = DataLayouts.IJHF,

We should boil this down, and better understand why fusing hurts performance for the cartesian indexed kernels and improves performance for the linear indexed kernels.

We saw this in MultiBroadcastFusion, too, but, IIRC, that was across different broadcast expressions, this includes when we're fusing into a single broadcast expression, so this is slightly different, but is in agreement with the result found in MBF.jl.

Metadata

Metadata

Type

No type

Projects

No projects

Milestone

No milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions