Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 0 additions & 40 deletions .github/workflows/Invalidations.yml

This file was deleted.

2 changes: 1 addition & 1 deletion Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ Static = "0.8.4, 1"
StaticArrayInterface = "1"
ThreadingUtilities = "0.5"
UnPack = "1"
VectorizationBase = "0.21.72"
VectorizationBase = "0.21.74"
julia = "1.10"

[extras]
Expand Down
33 changes: 31 additions & 2 deletions src/codegen/loopstartstopmanager.jl
Original file line number Diff line number Diff line change
Expand Up @@ -1053,7 +1053,24 @@ function pointermax_index(
if i === loopsym
ind = j
if iszero(sub)
push!(index.args, stophint)
# End-pointer offset along this loop dim: stophint * incr * stride.
# Previously this branch pushed `stophint` directly, omitting the
# stride/incr scaling that the sub > 0 branch below applies. For any
# strided load on the unrolled axis (e.g. `arr[2i, ...]`), the resulting
# bound was too small by a factor of `stride`, so the cleanup tail dropped
# the final iteration(s) when `looplen mod (UF*W) != 0`.
_ind = staticexpr(stophint)
stride = getstrides(ar)[j]
if isknown(incr)
stride *= gethint(incr)
else
_ind = mulexpr(_ind, getsym(incr))
end
if stride ≠ 1
@assert stride ≠ 0 "stride shouldn't be 0 if used for determining loop start/stop, but loop $n array $ar was."
_ind = lazymulexpr(stride, _ind)
end
push!(index.args, _ind)
else
_ind = if isvectorized
if isone(sub)
Expand Down Expand Up @@ -1104,7 +1121,19 @@ function pointermax_index(
if i === loopsym
ind = j
if iszero(sub)
push!(index.args, stopsym)
# End-pointer offset along this loop dim: stopsym * incr * stride (same scaling as the `stophint` sub == 0 branch).
_ind = stopsym
stride = getstrides(ar)[j]
if isknown(incr)
stride *= gethint(incr)
else
_ind = mulexpr(_ind, getsym(incr))
end
if stride ≠ 1
@assert stride ≠ 0 "stride shouldn't be 0 if used for determining loop start/stop, but loop $n array $ar was."
_ind = lazymulexpr(stride, _ind)
end
push!(index.args, _ind)
else
_ind = if isvectorized
if isone(sub)
Expand Down
84 changes: 23 additions & 61 deletions test/ifelsemasks.jl
Original file line number Diff line number Diff line change
Expand Up @@ -521,13 +521,13 @@ T = Float32
for T ∈ (Float32, Float64, Int32, Int64)
@show T, @__LINE__
if T <: Integer
a = rand(-T(100):T(100), N)
b = rand(-T(100):T(100), N)
a = rand((-T(100)):T(100), N)
b = rand((-T(100)):T(100), N)
mv, mi = findminturbo(a)
mv2, mi2 = findminturbo_u4(a)
@test mv == a[mi] == minimum(a) == mv2 == a[mi2]
for n = 1000:1000:10_000
x = rand(-T(100):T(100), n)
x = rand((-T(100)):T(100), n)
@test absmax_tturbo(x) == mapreduce(abs, max, x)
mv, mi = findmintturbo(x)
@test mv == x[mi] == minimum(x)
Expand Down Expand Up @@ -623,36 +623,22 @@ T = Float32
end
b1 = copy(a)
b2 = copy(a)
# This is broken on Apple ARM CPUs (Apple M series)
# for some reason.
# TODO: Fix the underlying issue!
if (Sys.ARCH === :aarch64) && Sys.isapple() && T <: AbstractFloat
condstore!(b1)
condstore1avx!(b2)
@test_broken b1 == b2
copyto!(b2, a)
condstore1_avx!(b2)
@test_broken b1 == b2
copyto!(b2, a)
condstore2avx!(b2)
@test_broken b1 == b2
copyto!(b2, a)
condstore2_avx!(b2)
@test_broken b1 == b2
else
condstore!(b1)
condstore1avx!(b2)
@test b1 == b2
copyto!(b2, a)
condstore1_avx!(b2)
@test b1 == b2
copyto!(b2, a)
condstore2avx!(b2)
@test b1 == b2
copyto!(b2, a)
condstore2_avx!(b2)
@test b1 == b2
end
# SIMD reordering of the masked stores can produce a 1-ULP difference from the
# scalar reference on Apple ARM for Float32/Float64. The results agree to
# within that difference, so compare with `≈` rather than `==`; the test
# then does not depend on bit-identical results.
condstore!(b1)
condstore1avx!(b2)
@test b1 ≈ b2
copyto!(b2, a)
condstore1_avx!(b2)
@test b1 ≈ b2
copyto!(b2, a)
condstore2avx!(b2)
@test b1 ≈ b2
copyto!(b2, a)
condstore2_avx!(b2)
@test b1 ≈ b2

M, K, N = 83, 85, 79
if T <: Integer
Expand Down Expand Up @@ -713,45 +699,21 @@ T = Float32
bit = a .> 0.5
bool = copyto!(Vector{Bool}(undef, length(bit)), bit)
t = Bernoulli_logit(bit, a)
# This is broken on Apple ARM CPUs (Apple M series)
# for some reason.
# TODO: Fix the underlying issue!
if (Sys.ARCH === :aarch64) && Sys.isapple()
# This test fails on some systems but works on other systems (CI)
@test_skip isapprox(t, Bernoulli_logitavx(bit, a), atol = ifelse(Int === Int32, 0.1, 0.0))
else
@test isapprox(t, Bernoulli_logitavx(bit, a), atol = ifelse(Int === Int32, 0.1, 0.0))
end
@test isapprox(t, Bernoulli_logitavx(bit, a), atol = ifelse(Int === Int32, 0.1, 0.0))
if LoopVectorization.pick_vector_width(eltype(a)) ≥ 4
# @_avx isn't really expected to work with bits if you don't have AVX512,
# but it happens to work with AVX2 for this case anyway, so we may as well keep testing.
# The `LoopVectorization.pick_vector_width(eltype(a)) ≥ 4` check above rules out non-AVX2 systems.
@test isapprox(t, Bernoulli_logit_avx(bit, a), atol = ifelse(Int === Int32, 0.1, 0.0))
end
# This is broken on Apple ARM CPUs (Apple M series)
# for some reason.
# TODO: Fix the underlying issue!
if (Sys.ARCH === :aarch64) && Sys.isapple()
# This test fails on some systems but works on other systems (CI)
@test_skip isapprox(t, Bernoulli_logitavx(bool, a), atol = ifelse(Int === Int32, 0.1, 0.0))
else
@test isapprox(t, Bernoulli_logitavx(bool, a), atol = ifelse(Int === Int32, 0.1, 0.0))
end
@test isapprox(t, Bernoulli_logitavx(bool, a), atol = ifelse(Int === Int32, 0.1, 0.0))
@test isapprox(t, Bernoulli_logit_avx(bool, a), atol = ifelse(Int === Int32, 0.1, 0.0))
a = rand(43)
bit = a .> 0.5
bool = copyto!(Vector{Bool}(undef, length(bit)), bit)
t = Bernoulli_logit(bit, a)
# This is broken on Apple ARM CPUs (Apple M series)
# for some reason.
# TODO: Fix the underlying issue!
if (Sys.ARCH === :aarch64) && Sys.isapple()
@test_broken t ≈ Bernoulli_logitavx(bit, a)
@test_broken t ≈ Bernoulli_logit_avx(bit, a)
else
@test t ≈ Bernoulli_logitavx(bit, a)
@test t ≈ Bernoulli_logit_avx(bit, a)
end
@test t ≈ Bernoulli_logitavx(bit, a)
@test t ≈ Bernoulli_logit_avx(bit, a)
@test t ≈ Bernoulli_logitavx(bool, a)
@test t ≈ Bernoulli_logit_avx(bool, a)

Expand Down
15 changes: 1 addition & 14 deletions test/shuffleloadstores.jl
Original file line number Diff line number Diff line change
Expand Up @@ -483,20 +483,7 @@ end
# but this leads to segfaults on some systems (e.g., x64 Linux).
for j ∈ max(1, i - 5):(i + 5), k ∈ max(1, i - 5, i + 5)
A = rand(j + 1, k)
# This is broken on Apple ARM CPUs (Apple M series)
# for some reason. This is likely related to the register size
# differences (128 vs 256 bit) and the smaller vector width
# for Float64 (2 vs 4) compared to many x64 CPUs.
# TODO: Fix the underlying issue!
pattern_for_failing_tests = (j + 1 >= 6) &&
(k >= 2) &&
(((j + 1) % 4) == 2 || ((j + 1) % 4) == 3)
if pattern_for_failing_tests && (Sys.ARCH === :aarch64) &&
Sys.isapple()
@test_broken tullio_issue_131(A) ≈ tullio_issue_131_ref(A)
else
@test tullio_issue_131(A) ≈ tullio_issue_131_ref(A)
end
@test tullio_issue_131(A) ≈ tullio_issue_131_ref(A)
if VERSION ≥ v"1.6.0-rc1"
Ac = rand(Complex{Float64}, j, i)
Bc = rand(Complex{Float64}, i, k)
Expand Down
14 changes: 4 additions & 10 deletions test/staticsize.jl
Original file line number Diff line number Diff line change
Expand Up @@ -162,7 +162,7 @@ end

@testset "Issue #543: W=1 Nested VecUnroll" begin
# Test with static first dimension
for v in 1:4, n in 2:8
for v = 1:4, n = 2:8
data_out_ref = StrideArray(undef, StaticInt(v), StaticInt(n), StaticInt(n))
data_out_turbo = StrideArray(undef, StaticInt(v), StaticInt(n), StaticInt(n))
matrix = StrideArray(undef, StaticInt(n), StaticInt(n))
Expand All @@ -175,18 +175,12 @@ end

issue543_noavx!(data_out_ref, matrix, data_in)

# This is broken on Apple ARM CPUs (Apple M series) for some reason.
# TODO: Fix the underlying issue!
if (v == 1) && Sys.isapple() && Sys.ARCH == :aarch64
@test_skip issue543_turbo!(data_out_turbo, matrix, data_in)
else
@test_nowarn issue543_turbo!(data_out_turbo, matrix, data_in)
@test data_out_turbo ≈ data_out_ref
end
@test_nowarn issue543_turbo!(data_out_turbo, matrix, data_in)
@test data_out_turbo ≈ data_out_ref
end

# Test with non-static first but static other dimensions
for v in 1:4, n in 2:8
for v = 1:4, n = 2:8
data_out_ref = StrideArray(undef, v, StaticInt(n), StaticInt(n))
data_out_turbo = StrideArray(undef, v, StaticInt(n), StaticInt(n))
matrix = StrideArray(undef, StaticInt(n), StaticInt(n))
Expand Down
Loading