From 6221324bbf2fd07034b1918c73e1a1a6d0a023df Mon Sep 17 00:00:00 2001 From: Matt Bauman Date: Tue, 27 Aug 2024 18:34:19 -0400 Subject: [PATCH 1/2] WIP: implement reductions as reductions Depends upon (and is a requirement for) Julia issue 55318 --- src/Statistics.jl | 158 +++++++++++++++++++++++----------------------- test/runtests.jl | 40 ++++++++++-- 2 files changed, 113 insertions(+), 85 deletions(-) diff --git a/src/Statistics.jl b/src/Statistics.jl index 6633edf..d776b9e 100644 --- a/src/Statistics.jl +++ b/src/Statistics.jl @@ -43,6 +43,18 @@ julia> mean(skipmissing([1, missing, 3])) """ mean(itr) = mean(identity, itr) +struct Counter{F} <: Function + f::F + n::Base.RefValue{Int} +end +Counter(f::F) where {F} = Counter{F}(f, Ref(0)) +(f::Counter)(x) = (f.n[] += 1; f.f(x)) + +struct DivOne{F} <: Function + f::F +end +(f::DivOne)(x) = f.f(x)/1 + """ mean(f, itr) @@ -59,23 +71,13 @@ julia> mean([√1, √2, √3]) ``` """ function mean(f, itr) - y = iterate(itr) - if y === nothing - return Base.mapreduce_empty_iter(f, +, itr, - Base.IteratorEltype(itr)) / 0 - end - count = 1 - value, state = y - f_value = f(value)/1 - total = Base.reduce_first(+, f_value) - y = iterate(itr, state) - while y !== nothing - value, state = y - total += _mean_promote(total, f(value)) - count += 1 - y = iterate(itr, state) + if Base.IteratorSize(itr) === Base.SizeUnknown() + g = Counter(DivOne(f)) + result = mapfoldl(g, add_mean, itr) + return result/g.n[] + else + return mapfoldl(DivOne(f), add_mean, itr)/length(itr) end - return total/count end """ @@ -180,20 +182,24 @@ mean(A::AbstractArray; dims=:) = _mean(identity, A, dims) _mean_promote(x::T, y::S) where {T,S} = convert(promote_type(T, S), y) +add_mean(x, y) = Base.add_sum(x, _mean_promote(x, y)) + +Base.reduce_empty(::typeof(add_mean), T) = Base.reduce_empty(Base.add_sum, T) +Base.mapreduce_empty(g::DivOne, ::typeof(add_mean), T) = Base.mapreduce_empty(g.f, Base.add_sum, T)/1 +Base.mapreduce_empty(g::Counter{<:DivOne}, ::typeof(add_mean), T) = Base.mapreduce_empty(g.f.f, Base.add_sum, T)/1 + + # ::Dims is there to force specializing on Colon (as it is a Function) function _mean(f, A::AbstractArray, dims::Dims=:) where Dims - isempty(A) && return sum(f, A, dims=dims)/0 if dims === (:) + result = mapreduce(DivOne(f), add_mean, A, dims=dims) n = length(A) - else - n = mapreduce(i -> size(A, i), *, unique(dims); init=1) - end - x1 = f(first(A)) / 1 - result = sum(x -> _mean_promote(x1, f(x)), A, dims=dims) - if dims === (:) return result / n else - return result ./= n + result = mapreduce(DivOne(f), add_mean, A, dims=dims) + n = prod(i -> size(A, i), unique(dims); init=1) + result ./= n + return result end end @@ -211,6 +217,7 @@ realXcY(x::Complex, y::Complex) = real(x)*real(y) + imag(x)*imag(y) var(iterable; corrected::Bool=true, mean=nothing) = _var(iterable, corrected, mean) function _var(iterable, corrected::Bool, mean) + ismissing(mean) && return missing y = iterate(iterable) if y === nothing T = eltype(iterable) @@ -252,61 +259,36 @@ function _var(iterable, corrected::Bool, mean) end end -centralizedabs2fun(m) = x -> abs2.(x - m) -centralize_sumabs2(A::AbstractArray, m) = - mapreduce(centralizedabs2fun(m), +, A) -centralize_sumabs2(A::AbstractArray, m, ifirst::Int, ilast::Int) = - Base.mapreduce_impl(centralizedabs2fun(m), +, A, ifirst, ilast) - -function centralize_sumabs2!(R::AbstractArray{S}, A::AbstractArray, means::AbstractArray) where S - # following the implementation of _mapreducedim! at base/reducedim.jl - lsiz = Base.check_reducedims(R,A) - for i in 1:max(ndims(R), ndims(means)) - if axes(means, i) != axes(R, i) - throw(DimensionMismatch("dimension $i of `mean` should have indices $(axes(R, i)), but got $(axes(means, i))")) - end - end - isempty(R) || fill!(R, zero(S)) - isempty(A) && return R - - if Base.has_fast_linear_indexing(A) && lsiz > 16 && !has_offset_axes(R, means) - nslices = div(length(A), lsiz) - ibase = first(LinearIndices(A))-1 - for i = 1:nslices - @inbounds R[i] = centralize_sumabs2(A, means[i], ibase+1, ibase+lsiz) - ibase += lsiz - end - return R - end - indsAt, indsRt = Base.safe_tail(axes(A)), Base.safe_tail(axes(R)) # handle d=1 manually - keep, Idefault = Broadcast.shapeindexer(indsRt) - if Base.reducedim1(R, A) - i1 = first(Base.axes1(R)) - @inbounds for IA in CartesianIndices(indsAt) - IR = Broadcast.newindex(IA, keep, Idefault) - r = R[i1,IR] - m = means[i1,IR] - @simd for i in axes(A, 1) - r += abs2(A[i,IA] - m) - end - R[i1,IR] = r - end - else - @inbounds for IA in CartesianIndices(indsAt) - IR = Broadcast.newindex(IA, keep, Idefault) - @simd for i in axes(A, 1) - R[i,IR] += abs2(A[i,IA] - means[i,IR]) - end - end - end - return R +struct CentralizedAbs2Fun{T,S} <: Function + mean::S end +CentralizedAbs2Fun{T}(means) where {T} = CentralizedAbs2Fun{T,typeof(means)}(means) +CentralizedAbs2Fun(means) = CentralizedAbs2Fun{typeof(means)}(means) +CentralizedAbs2Fun(means, extrude) = CentralizedAbs2Fun{eltype(means)}(Broadcast.extrude(means)) +# Division is generally costly, but Julia is typically able to constant propagate a /1 +# and simply ensure we get the type right at no cost, allowing the division in-place later +(f::CentralizedAbs2Fun)(x) = abs2.(x - f.mean)/1 +(f::CentralizedAbs2Fun{<:Any,<:Broadcast.Extruded})((i, x),) = abs2.(x - Broadcast._broadcast_getindex(f.mean, i))/1 +_doubled(x) = x+x +Base.mapreduce_empty(::CentralizedAbs2Fun{T,<:Broadcast.Extruded}, ::typeof(Base.add_sum), ::Type{Tuple{_Any,S}}) where {T<:Number, S<:Number, _Any} = _doubled(abs2(zero(T)-zero(S)))/1 +Base.mapreduce_empty(::CentralizedAbs2Fun{T,<:Broadcast.Extruded}, ::typeof(Base.add_sum), ::Type{Tuple{_Any, Union{Missing, S}}}) where {T<:Number, S<:Number, _Any} = _doubled(abs2(zero(T)-zero(S)))/1 +Base.mapreduce_empty(::CentralizedAbs2Fun{T}, ::typeof(Base.add_sum), ::Type{S}) where {T<:Number, S<:Number} = _doubled(abs2(zero(T)-zero(S)))/1 +Base.mapreduce_empty(::CentralizedAbs2Fun{T}, ::typeof(Base.add_sum), ::Type{Union{Missing, S}}) where {T<:Number, S<:Number} = _doubled(abs2(zero(T)-zero(S)))/1 + +centralize_sumabs2(A::AbstractArray, m) = + sum(CentralizedAbs2Fun(m), A) +centralize_sumabs2(A::AbstractArray, m::AbstractArray, region) = + sum(CentralizedAbs2Fun(m, true), Base.PairsArray(A), dims=region) +centralize_sumabs2!(R::AbstractArray, A::AbstractArray, means::AbstractArray) = + sum!(CentralizedAbs2Fun(means, true), R, Base.PairsArray(A)) + function varm!(R::AbstractArray{S}, A::AbstractArray, m::AbstractArray; corrected::Bool=true) where S - if isempty(A) + _checkm(R, m, ntuple(identity, Val(max(ndims(R), ndims(m))))) + if isempty(A) || length(A) == 1 && corrected fill!(R, convert(S, NaN)) else - rn = div(length(A), length(R)) - Int(corrected) + rn = prod(ntuple(d->size(R, d) == 1 ? size(A, d) : 1, Val(max(ndims(A), ndims(R))))) - Int(corrected) centralize_sumabs2!(R, A, m) R .= R .* (1 // rn) end @@ -339,15 +321,33 @@ over dimensions. In that case, `mean` must be an array with the same shape as """ varm(A::AbstractArray, m::AbstractArray; corrected::Bool=true, dims=:) = _varm(A, m, corrected, dims) -_varm(A::AbstractArray{T}, m, corrected::Bool, region) where {T} = - varm!(Base.reducedim_init(t -> abs2(t)/2, +, A, region), A, m; corrected=corrected) +_throw_mean_mismatch(A, m, region) = throw(DimensionMismatch("axes of means ($(axes(m))) does not match reduction over $(region) of $(axes(A))")) +function _checkm(A::AbstractArray, m::AbstractArray, region) + for d in 1:max(ndims(A), ndims(m)) + if d in region + size(m, d) == 1 || _throw_mean_mismatch(A, m, region) + else + axes(m, d) == axes(A, d) || _throw_mean_mismatch(A, m, region) + end + end +end +function _varm(A::AbstractArray, m, corrected::Bool, region) + _checkm(A, m, region) + rn = prod(ntuple(d->d in region ? size(A, d) : 1, Val(ndims(A)))) - Int(corrected) + R = centralize_sumabs2(A, m, region) + if rn <= 0 + R .= R ./ 0 + else + R .= R .* 1//rn # why use Rational? + end + return R +end varm(A::AbstractArray, m; corrected::Bool=true) = _varm(A, m, corrected, :) function _varm(A::AbstractArray{T}, m, corrected::Bool, ::Colon) where T - n = length(A) - n == 0 && return oftype((abs2(zero(T)) + abs2(zero(T)))/2, NaN) - return centralize_sumabs2(A, m) / (n - Int(corrected)) + rn = max(length(A) - Int(corrected), 0) + centralize_sumabs2(A, m)/rn end diff --git a/test/runtests.jl b/test/runtests.jl index d2f2541..79b4a55 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -522,9 +522,15 @@ Y = [6.0 2.0; @testset "cov with missing" begin @test cov([missing]) === cov([1, missing]) === missing @test cov([1, missing], [2, 3]) === cov([1, 3], [2, missing]) === missing - @test_throws Exception cov([1 missing; 2 3]) - @test_throws Exception cov([1 missing; 2 3], [1, 2]) - @test_throws Exception cov([1, 2], [1 missing; 2 3]) + if isdefined(Base, :_reducedim_init) + @test_broken isequal(coalesce.(cov([1 missing; 2 3]), NaN), cov([1 NaN; 2 3])) + @test_broken isequal(coalesce.(cov([1 missing; 2 3], [1, 2]), NaN), cov([1 NaN; 2 3], [1, 2])) + @test_broken isequal(coalesce.(cov([1, 2], [1 missing; 2 3]), NaN), cov([1, 2], [1 NaN; 2 3])) + else + @test isequal(coalesce.(cov([1 missing; 2 3]), NaN), cov([1 NaN; 2 3])) + @test isequal(coalesce.(cov([1 missing; 2 3], [1, 2]), NaN), cov([1 NaN; 2 3], [1, 2])) + @test isequal(coalesce.(cov([1, 2], [1 missing; 2 3]), NaN), cov([1, 2], [1 NaN; 2 3])) + end @test isequal(cov([1 2; 2 3], [1, missing]), [missing missing]') @test isequal(cov([1, missing], [1 2; 2 3]), [missing missing]) end @@ -633,9 +639,15 @@ end @test cor([missing]) === missing @test cor([1, missing]) == 1 @test cor([1, missing], [2, 3]) === cor([1, 3], [2, missing]) === missing - @test_throws Exception cor([1 missing; 2 3]) - @test_throws Exception cor([1 missing; 2 3], [1, 2]) - @test_throws Exception cor([1, 2], [1 missing; 2 3]) + if isdefined(Base, :_reducedim_init) + @test_broken isequal(coalesce.(cor([1 missing; 2 3]), NaN), cor([1 NaN; 2 3])) + @test_broken isequal(coalesce.(cor([1 missing; 2 3], [1, 2]), NaN), cor([1 NaN; 2 3], [1, 2])) + @test_broken isequal(coalesce.(cor([1, 2], [1 missing; 2 3]), NaN), cor([1, 2], [1 NaN; 2 3])) + else + @test isequal(coalesce.(cor([1 missing; 2 3]), NaN), cor([1 NaN; 2 3])) + @test isequal(coalesce.(cor([1 missing; 2 3], [1, 2]), NaN), cor([1 NaN; 2 3], [1, 2])) + @test isequal(coalesce.(cor([1, 2], [1 missing; 2 3]), NaN), cor([1, 2], [1 NaN; 2 3])) + end @test isequal(cor([1 2; 2 3], [1, missing]), [missing missing]') @test isequal(cor([1, missing], [1 2; 2 3]), [missing missing]) end @@ -1072,3 +1084,19 @@ end @test isequal(cov(Int[], my), fill(-0.0, 1, 3)) @test isequal(cor(Int[], my), fill(NaN, 1, 3)) end + +@testset "mean, var, std type stability with Missings; Issue #160" begin + @test (@inferred Missing mean(view([1, 2, missing], 1:2))) == (@inferred mean([1,2])) + @test (@inferred Missing var(view([1, 2, missing], 1:2))) == (@inferred var([1,2])) + @test (@inferred Missing std(view([1, 2, missing], 1:2))) == (@inferred std([1,2])) +end + +@testset "inexact errors; Issues #7 and #126" begin + a = [missing missing; 0 1] + @test isequal(mean(a;dims=2), [missing; 0.5;;]) + + x = [(i==3 && j==3) ? missing : i*j for i in 1:3, j in 1:4] + @test ismissing(@inferred Float64 mean(x)) + @test isequal(mean(x; dims=1), [2. 4. missing 8.]) + @test isequal(mean(x; dims=2), [2.5; 5.0; missing;;]) +end From c674ab823526d05e7a85b468065089bf563823aa Mon Sep 17 00:00:00 2001 From: Matt Bauman Date: Wed, 14 May 2025 17:50:00 -0400 Subject: [PATCH 2/2] skip the mean refactor for now --- src/Statistics.jl | 54 +++++++++++++++++++++-------------------------- test/runtests.jl | 2 +- 2 files changed, 25 insertions(+), 31 deletions(-) diff --git a/src/Statistics.jl b/src/Statistics.jl index d776b9e..b34190d 100644 --- a/src/Statistics.jl +++ b/src/Statistics.jl @@ -43,18 +43,6 @@ julia> mean(skipmissing([1, missing, 3])) """ mean(itr) = mean(identity, itr) -struct Counter{F} <: Function - f::F - n::Base.RefValue{Int} -end -Counter(f::F) where {F} = Counter{F}(f, Ref(0)) -(f::Counter)(x) = (f.n[] += 1; f.f(x)) - -struct DivOne{F} <: Function - f::F -end -(f::DivOne)(x) = f.f(x)/1 - """ mean(f, itr) @@ -71,13 +59,23 @@ julia> mean([√1, √2, √3]) ``` """ function mean(f, itr) - if Base.IteratorSize(itr) === Base.SizeUnknown() - g = Counter(DivOne(f)) - result = mapfoldl(g, add_mean, itr) - return result/g.n[] - else - return mapfoldl(DivOne(f), add_mean, itr)/length(itr) + y = iterate(itr) + if y === nothing + return Base.mapreduce_empty_iter(f, +, itr, + Base.IteratorEltype(itr)) / 0 end + count = 1 + value, state = y + f_value = f(value)/1 + total = Base.reduce_first(+, f_value) + y = iterate(itr, state) + while y !== nothing + value, state = y + total += _mean_promote(total, f(value)) + count += 1 + y = iterate(itr, state) + end + return total/count end """ @@ -182,24 +180,20 @@ mean(A::AbstractArray; dims=:) = _mean(identity, A, dims) _mean_promote(x::T, y::S) where {T,S} = convert(promote_type(T, S), y) -add_mean(x, y) = Base.add_sum(x, _mean_promote(x, y)) - -Base.reduce_empty(::typeof(add_mean), T) = Base.reduce_empty(Base.add_sum, T) -Base.mapreduce_empty(g::DivOne, ::typeof(add_mean), T) = Base.mapreduce_empty(g.f, Base.add_sum, T)/1 -Base.mapreduce_empty(g::Counter{<:DivOne}, ::typeof(add_mean), T) = Base.mapreduce_empty(g.f.f, Base.add_sum, T)/1 - - # ::Dims is there to force specializing on Colon (as it is a Function) function _mean(f, A::AbstractArray, dims::Dims=:) where Dims + isempty(A) && return sum(f, A, dims=dims)/0 if dims === (:) - result = mapreduce(DivOne(f), add_mean, A, dims=dims) n = length(A) + else + n = mapreduce(i -> size(A, i), *, unique(dims); init=1) + end + x1 = f(first(A)) / 1 + result = sum(x -> _mean_promote(x1, f(x)), A, dims=dims) + if dims === (:) return result / n else - result = mapreduce(DivOne(f), add_mean, A, dims=dims) - n = prod(i -> size(A, i), unique(dims); init=1) - result ./= n - return result + return result ./= n end end diff --git a/test/runtests.jl b/test/runtests.jl index 79b4a55..6e9d095 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1093,7 +1093,7 @@ end @testset "inexact errors; Issues #7 and #126" begin a = [missing missing; 0 1] - @test isequal(mean(a;dims=2), [missing; 0.5;;]) + @test_broken isequal(mean(a;dims=2), [missing; 0.5;;]) x = [(i==3 && j==3) ? missing : i*j for i in 1:3, j in 1:4] @test ismissing(@inferred Float64 mean(x))