diff --git a/src/Statistics.jl b/src/Statistics.jl
index 6633edf..b34190d 100644
--- a/src/Statistics.jl
+++ b/src/Statistics.jl
@@ -211,6 +211,7 @@ realXcY(x::Complex, y::Complex) = real(x)*real(y) + imag(x)*imag(y)
 var(iterable; corrected::Bool=true, mean=nothing) = _var(iterable, corrected, mean)
 
 function _var(iterable, corrected::Bool, mean)
+    ismissing(mean) && return missing
     y = iterate(iterable)
     if y === nothing
         T = eltype(iterable)
@@ -252,61 +253,36 @@ function _var(iterable, corrected::Bool, mean)
     end
 end
 
-centralizedabs2fun(m) = x -> abs2.(x - m)
-centralize_sumabs2(A::AbstractArray, m) =
-    mapreduce(centralizedabs2fun(m), +, A)
-centralize_sumabs2(A::AbstractArray, m, ifirst::Int, ilast::Int) =
-    Base.mapreduce_impl(centralizedabs2fun(m), +, A, ifirst, ilast)
-
-function centralize_sumabs2!(R::AbstractArray{S}, A::AbstractArray, means::AbstractArray) where S
-    # following the implementation of _mapreducedim! at base/reducedim.jl
-    lsiz = Base.check_reducedims(R,A)
-    for i in 1:max(ndims(R), ndims(means))
-        if axes(means, i) != axes(R, i)
-            throw(DimensionMismatch("dimension $i of `mean` should have indices $(axes(R, i)), but got $(axes(means, i))"))
-        end
-    end
-    isempty(R) || fill!(R, zero(S))
-    isempty(A) && return R
-
-    if Base.has_fast_linear_indexing(A) && lsiz > 16 && !has_offset_axes(R, means)
-        nslices = div(length(A), lsiz)
-        ibase = first(LinearIndices(A))-1
-        for i = 1:nslices
-            @inbounds R[i] = centralize_sumabs2(A, means[i], ibase+1, ibase+lsiz)
-            ibase += lsiz
-        end
-        return R
-    end
-    indsAt, indsRt = Base.safe_tail(axes(A)), Base.safe_tail(axes(R)) # handle d=1 manually
-    keep, Idefault = Broadcast.shapeindexer(indsRt)
-    if Base.reducedim1(R, A)
-        i1 = first(Base.axes1(R))
-        @inbounds for IA in CartesianIndices(indsAt)
-            IR = Broadcast.newindex(IA, keep, Idefault)
-            r = R[i1,IR]
-            m = means[i1,IR]
-            @simd for i in axes(A, 1)
-                r += abs2(A[i,IA] - m)
-            end
-            R[i1,IR] = r
-        end
-    else
-        @inbounds for IA in CartesianIndices(indsAt)
-            IR = Broadcast.newindex(IA, keep, Idefault)
-            @simd for i in axes(A, 1)
-                R[i,IR] += abs2(A[i,IA] - means[i,IR])
-            end
-        end
-    end
-    return R
+struct CentralizedAbs2Fun{T,S} <: Function
+    mean::S
 end
+CentralizedAbs2Fun{T}(means) where {T} = CentralizedAbs2Fun{T,typeof(means)}(means)
+CentralizedAbs2Fun(means) = CentralizedAbs2Fun{typeof(means)}(means)
+CentralizedAbs2Fun(means, extrude) = CentralizedAbs2Fun{eltype(means)}(Broadcast.extrude(means))
+# Division is generally costly, but Julia is typically able to constant propagate a /1
+# and simply ensure we get the type right at no cost, allowing the division in-place later
+(f::CentralizedAbs2Fun)(x) = abs2.(x - f.mean)/1
+(f::CentralizedAbs2Fun{<:Any,<:Broadcast.Extruded})((i, x),) = abs2.(x - Broadcast._broadcast_getindex(f.mean, i))/1
+_doubled(x) = x+x
+Base.mapreduce_empty(::CentralizedAbs2Fun{T,<:Broadcast.Extruded}, ::typeof(Base.add_sum), ::Type{Tuple{_Any,S}}) where {T<:Number, S<:Number, _Any} = _doubled(abs2(zero(T)-zero(S)))/1
+Base.mapreduce_empty(::CentralizedAbs2Fun{T,<:Broadcast.Extruded}, ::typeof(Base.add_sum), ::Type{Tuple{_Any, Union{Missing, S}}}) where {T<:Number, S<:Number, _Any} = _doubled(abs2(zero(T)-zero(S)))/1
+Base.mapreduce_empty(::CentralizedAbs2Fun{T}, ::typeof(Base.add_sum), ::Type{S}) where {T<:Number, S<:Number} = _doubled(abs2(zero(T)-zero(S)))/1
+Base.mapreduce_empty(::CentralizedAbs2Fun{T}, ::typeof(Base.add_sum), ::Type{Union{Missing, S}}) where {T<:Number, S<:Number} = _doubled(abs2(zero(T)-zero(S)))/1
+
+centralize_sumabs2(A::AbstractArray, m) =
+    sum(CentralizedAbs2Fun(m), A)
+centralize_sumabs2(A::AbstractArray, m::AbstractArray, region) =
+    sum(CentralizedAbs2Fun(m, true), Base.PairsArray(A), dims=region)
+centralize_sumabs2!(R::AbstractArray, A::AbstractArray, means::AbstractArray) =
+    sum!(CentralizedAbs2Fun(means, true), R, Base.PairsArray(A))
+
 
 function varm!(R::AbstractArray{S}, A::AbstractArray, m::AbstractArray; corrected::Bool=true) where S
-    if isempty(A)
+    _checkm(R, m, ntuple(identity, Val(max(ndims(R), ndims(m)))))
+    if isempty(A) || length(A) == 1 && corrected
         fill!(R, convert(S, NaN))
     else
-        rn = div(length(A), length(R)) - Int(corrected)
+        rn = prod(ntuple(d->size(R, d) == 1 ? size(A, d) : 1, Val(max(ndims(A), ndims(R))))) - Int(corrected)
         centralize_sumabs2!(R, A, m)
         R .= R .* (1 // rn)
     end
@@ -339,15 +315,33 @@ over dimensions. In that case, `mean` must be an array with the same shape as
 """
 varm(A::AbstractArray, m::AbstractArray; corrected::Bool=true, dims=:) = _varm(A, m, corrected, dims)
 
-_varm(A::AbstractArray{T}, m, corrected::Bool, region) where {T} =
-    varm!(Base.reducedim_init(t -> abs2(t)/2, +, A, region), A, m; corrected=corrected)
+_throw_mean_mismatch(A, m, region) = throw(DimensionMismatch("axes of means ($(axes(m))) does not match reduction over $(region) of $(axes(A))"))
+function _checkm(A::AbstractArray, m::AbstractArray, region)
+    for d in 1:max(ndims(A), ndims(m))
+        if d in region
+            size(m, d) == 1 || _throw_mean_mismatch(A, m, region)
+        else
+            axes(m, d) == axes(A, d) || _throw_mean_mismatch(A, m, region)
+        end
+    end
+end
+function _varm(A::AbstractArray, m, corrected::Bool, region)
+    _checkm(A, m, region)
+    rn = prod(ntuple(d->d in region ? size(A, d) : 1, Val(ndims(A)))) - Int(corrected)
+    R = centralize_sumabs2(A, m, region)
+    if rn <= 0
+        R .= R ./ 0
+    else
+        R .= R .* 1//rn # 1//rn is exact, and broadcasting promotes it to eltype(R)
+    end
+    return R
+end
 
 varm(A::AbstractArray, m; corrected::Bool=true) = _varm(A, m, corrected, :)
 
 function _varm(A::AbstractArray{T}, m, corrected::Bool, ::Colon) where T
-    n = length(A)
-    n == 0 && return oftype((abs2(zero(T)) + abs2(zero(T)))/2, NaN)
-    return centralize_sumabs2(A, m) / (n - Int(corrected))
+    rn = max(length(A) - Int(corrected), 0)
+    centralize_sumabs2(A, m)/rn
 end
 
 """
diff --git a/test/runtests.jl b/test/runtests.jl
index d2f2541..6e9d095 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -522,9 +522,15 @@ Y = [6.0 2.0;
 @testset "cov with missing" begin
     @test cov([missing]) === cov([1, missing]) === missing
     @test cov([1, missing], [2, 3]) === cov([1, 3], [2, missing]) === missing
-    @test_throws Exception cov([1 missing; 2 3])
-    @test_throws Exception cov([1 missing; 2 3], [1, 2])
-    @test_throws Exception cov([1, 2], [1 missing; 2 3])
+    if isdefined(Base, :_reducedim_init)
+        @test_broken isequal(coalesce.(cov([1 missing; 2 3]), NaN), cov([1 NaN; 2 3]))
+        @test_broken isequal(coalesce.(cov([1 missing; 2 3], [1, 2]), NaN), cov([1 NaN; 2 3], [1, 2]))
+        @test_broken isequal(coalesce.(cov([1, 2], [1 missing; 2 3]), NaN), cov([1, 2], [1 NaN; 2 3]))
+    else
+        @test isequal(coalesce.(cov([1 missing; 2 3]), NaN), cov([1 NaN; 2 3]))
+        @test isequal(coalesce.(cov([1 missing; 2 3], [1, 2]), NaN), cov([1 NaN; 2 3], [1, 2]))
+        @test isequal(coalesce.(cov([1, 2], [1 missing; 2 3]), NaN), cov([1, 2], [1 NaN; 2 3]))
+    end
     @test isequal(cov([1 2; 2 3], [1, missing]), [missing missing]')
     @test isequal(cov([1, missing], [1 2; 2 3]), [missing missing])
 end
@@ -633,9 +639,15 @@
     @test cor([missing]) === missing
     @test cor([1, missing]) == 1
     @test cor([1, missing], [2, 3]) === cor([1, 3], [2, missing]) === missing
-    @test_throws Exception cor([1 missing; 2 3])
-    @test_throws Exception cor([1 missing; 2 3], [1, 2])
-    @test_throws Exception cor([1, 2], [1 missing; 2 3])
+    if isdefined(Base, :_reducedim_init)
+        @test_broken isequal(coalesce.(cor([1 missing; 2 3]), NaN), cor([1 NaN; 2 3]))
+        @test_broken isequal(coalesce.(cor([1 missing; 2 3], [1, 2]), NaN), cor([1 NaN; 2 3], [1, 2]))
+        @test_broken isequal(coalesce.(cor([1, 2], [1 missing; 2 3]), NaN), cor([1, 2], [1 NaN; 2 3]))
+    else
+        @test isequal(coalesce.(cor([1 missing; 2 3]), NaN), cor([1 NaN; 2 3]))
+        @test isequal(coalesce.(cor([1 missing; 2 3], [1, 2]), NaN), cor([1 NaN; 2 3], [1, 2]))
+        @test isequal(coalesce.(cor([1, 2], [1 missing; 2 3]), NaN), cor([1, 2], [1 NaN; 2 3]))
+    end
     @test isequal(cor([1 2; 2 3], [1, missing]), [missing missing]')
     @test isequal(cor([1, missing], [1 2; 2 3]), [missing missing])
 end
@@ -1072,3 +1084,19 @@
     @test isequal(cov(Int[], my), fill(-0.0, 1, 3))
     @test isequal(cor(Int[], my), fill(NaN, 1, 3))
 end
+
+@testset "mean, var, std type stability with Missings; Issue #160" begin
+    @test (@inferred Missing mean(view([1, 2, missing], 1:2))) == (@inferred mean([1,2]))
+    @test (@inferred Missing var(view([1, 2, missing], 1:2))) == (@inferred var([1,2]))
+    @test (@inferred Missing std(view([1, 2, missing], 1:2))) == (@inferred std([1,2]))
+end
+
+@testset "inexact errors; Issues #7 and #126" begin
+    a = [missing missing; 0 1]
+    @test_broken isequal(mean(a;dims=2), [missing; 0.5;;])
+
+    x = [(i==3 && j==3) ? missing : i*j for i in 1:3, j in 1:4]
+    @test ismissing(@inferred Float64 mean(x))
+    @test isequal(mean(x; dims=1), [2. 4. missing 8.])
+    @test isequal(mean(x; dims=2), [2.5; 5.0; missing;;])
+end
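
The core idea in the src changes above is to replace the `centralizedabs2fun(m)` closure with the callable struct `CentralizedAbs2Fun`, so the sum has a concrete function type, and to define `Base.mapreduce_empty` for it so empty reductions return a well-typed value instead of throwing. The sketch below shows that pattern in isolation; `CenteredAbs2` is a hypothetical stand-in, not the type the patch adds, and it handles plain scalar means only (no `Broadcast.Extruded`, no `/1` type trick).

```julia
# Minimal sketch (assumed names, scalar-only): a callable struct carrying the
# mean, usable directly with `sum`.
struct CenteredAbs2{S} <: Function
    mean::S
end
(f::CenteredAbs2)(x) = abs2(x - f.mean)

# Teach `sum(f, A)` what an empty reduction over element type S should return:
# a zero of the same type a non-empty reduction would produce.
Base.mapreduce_empty(f::CenteredAbs2, ::typeof(Base.add_sum), ::Type{S}) where {S<:Number} =
    zero(abs2(zero(S) - f.mean))

sum(CenteredAbs2(2.0), [1.0, 2.0, 4.0])  # abs2(-1.0) + abs2(0.0) + abs2(2.0) == 5.0
sum(CenteredAbs2(2.0), Float64[])        # 0.0 rather than an empty-reduction error
```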
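The reworked `_varm(A, m, corrected, region)` validates the shape of `m` up front (`_checkm` requires size 1 along every reduced dimension and matching axes elsewhere) and then scales the centralized sum of squares by `1//rn`, where `rn` is the number of elements reduced into each output slot minus the Bessel correction. A small usage sketch of the public entry point; the concrete array and numbers are mine, not taken from the test suite.

```julia
using Statistics

A = [1.0 2.0 3.0;
     5.0 7.0 9.0]

m = mean(A, dims=1)   # 1×3 matrix of column means: size 1 along the reduced dim
varm(A, m; dims=1)    # per-column variances; here rn = size(A, 1) - 1 = 1

# A mis-shaped mean is rejected by the shape check:
# varm(A, mean(A, dims=2); dims=1)   # throws DimensionMismatch
```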
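The test changes document the intended behaviour shift: matrix `cov`/`cor` with `missing` entries should propagate `missing` instead of throwing (the `isdefined(Base, :_reducedim_init)` switch keeps those cases `@test_broken` on Julia builds that still have that internal), `mean`/`var`/`std` over `Union{Missing, T}` element types become inferrable, and the dimension-wise `mean` cases from issues #7 and #126 no longer hit `InexactError`. The last new testset corresponds to usage like the following, with the expected values as asserted by that testset once the patch is applied.

```julia
using Statistics

# 3×4 matrix with a single missing entry at (3, 3)
x = [(i == 3 && j == 3) ? missing : i*j for i in 1:3, j in 1:4]

mean(x)          # missing, since the whole-array reduction sees the missing entry
mean(x; dims=1)  # [2.0 4.0 missing 8.0]   (only the affected column is missing)
mean(x; dims=2)  # [2.5; 5.0; missing;;]   (only the affected row is missing)
```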