Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
113 changes: 54 additions & 59 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -294,77 +294,58 @@ if(BUILD_LTO)
endif()

#
# SIMD SUPPORT (independent of OpenMP)
# SIMD SUPPORT (runtime CPU dispatch)
#

# Option to disable SIMD entirely
option(USE_SIMD "Enable SIMD optimizations (SSE4.2/AVX2 on x86_64, NEON on ARM64)" ON)

# Check architecture
# CMAKE_SYSTEM_PROCESSOR is "x86_64" on Intel Macs and Linux x86_64, "arm64"/"aarch64" on ARM
# SLiM's SIMD kernels live in eidos_simd_*.cpp, compiled once per instruction-
# set tier: scalar, SSE4.2, and AVX2+FMA on x86_64, and NEON on ARM64. Only
# the per-tier files are given instruction-set-specific compiler flags (those
# are applied at the end of this file); every other translation unit, and the
# dispatcher, is compiled at the plain baseline ABI. At startup
# Eidos_SIMD_Init() probes the CPU and points the
# kernels at the fastest tier the hardware supports, so a single binary is
# correct on any CPU, from pre-AVX2 hardware up. Applying -mavx2 globally (the
# previous approach) let the compiler emit AVX2 throughout the whole binary and
# caused SIGILL crashes on older CPUs (issue #628).

option(USE_SIMD "Enable SIMD-accelerated kernels with runtime CPU dispatch" ON)

# Detect x86 so the SSE4.2/AVX2 tier files can be given their ISA flags. NEON
# needs no flag (it is baseline on ARM64); other architectures use the scalar
# tier only.
set(IS_X86_64 FALSE)
set(IS_ARM64 FALSE)
if(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|AMD64|amd64|i686|i386")
set(IS_X86_64 TRUE)
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "arm64|aarch64|ARM64")
set(IS_ARM64 TRUE)
endif()

if(USE_SIMD AND NOT WIN32 AND IS_X86_64)
include(CheckCXXCompilerFlag)

# Check for AVX2 support
check_cxx_compiler_flag("-mavx2" COMPILER_SUPPORTS_AVX2)
check_cxx_compiler_flag("-msse4.2" COMPILER_SUPPORTS_SSE42)
check_cxx_compiler_flag("-mfma" COMPILER_SUPPORTS_FMA)

if(COMPILER_SUPPORTS_AVX2)
message(STATUS "SIMD: AVX2 support detected")
add_compile_definitions(EIDOS_HAS_AVX2=1)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx2")
if(COMPILER_SUPPORTS_FMA)
message(STATUS "SIMD: FMA support detected")
add_compile_definitions(EIDOS_HAS_FMA=1)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfma")
endif()
elseif(COMPILER_SUPPORTS_SSE42)
message(STATUS "SIMD: SSE4.2 support detected (no AVX2)")
add_compile_definitions(EIDOS_HAS_SSE42=1)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse4.2")
else()
message(STATUS "SIMD: No x86 SIMD support detected, using scalar fallback")
endif()
elseif(USE_SIMD AND NOT WIN32 AND IS_ARM64)
# ARM64 NEON is always available on ARM64, no compiler flag needed
message(STATUS "SIMD: ARM64 NEON support enabled")
add_compile_definitions(EIDOS_HAS_NEON=1)
elseif(USE_SIMD AND NOT WIN32)
message(STATUS "SIMD: Unknown architecture (${CMAKE_SYSTEM_PROCESSOR}), using scalar fallback")
elseif(USE_SIMD AND WIN32)
# Windows SIMD detection - MinGW uses GCC, so we can use the same flags as Linux
if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
if(USE_SIMD)
if(IS_X86_64 AND NOT MSVC)
include(CheckCXXCompilerFlag)
check_cxx_compiler_flag("-mavx2" COMPILER_SUPPORTS_AVX2_WIN)
check_cxx_compiler_flag("-mfma" COMPILER_SUPPORTS_FMA_WIN)

if(COMPILER_SUPPORTS_AVX2_WIN)
message(STATUS "SIMD: AVX2 support detected (Windows/MinGW)")
add_compile_definitions(EIDOS_HAS_AVX2=1)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx2")
if(COMPILER_SUPPORTS_FMA_WIN)
message(STATUS "SIMD: FMA support detected (Windows/MinGW)")
add_compile_definitions(EIDOS_HAS_FMA=1)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfma")
endif()
check_cxx_compiler_flag("-mavx2" COMPILER_SUPPORTS_AVX2)
check_cxx_compiler_flag("-msse4.2" COMPILER_SUPPORTS_SSE42)

if(COMPILER_SUPPORTS_AVX2 AND COMPILER_SUPPORTS_SSE42)
message(STATUS "SIMD: runtime dispatch enabled (scalar / SSE4.2 / AVX2+FMA selected per-CPU at startup)")
# The avx2/sse42 tier files need -mavx2 -mfma / -msse4.2. The flags
# are applied at the end of this file rather than here, because the
# Windows target blocks below overwrite COMPILE_FLAGS on every
# source file with "-include config.h"; applying our flags last,
# with APPEND_STRING, lets the two coexist (issue #628).
set(SIMD_TIER_FLAGS_X86 TRUE)
else()
message(STATUS "SIMD: No AVX2 support on Windows/MinGW, using scalar fallback")
message(STATUS "SIMD: compiler lacks -mavx2/-msse4.2; building scalar kernels only")
add_compile_definitions(EIDOS_SUPPRESS_SIMD_DISPATCH=1)
endif()
elseif(MSVC)
message(STATUS "SIMD: runtime dispatch not implemented for MSVC; building scalar kernels only")
add_compile_definitions(EIDOS_SUPPRESS_SIMD_DISPATCH=1)
else()
# MSVC path - not currently used but could be added in the future
message(STATUS "SIMD: MSVC SIMD detection not yet implemented, using scalar fallback")
# ARM64 (NEON, baseline) or any other architecture (scalar fallback):
# the tier files need no instruction-set flags.
message(STATUS "SIMD: runtime dispatch enabled (NEON or scalar kernels selected at startup)")
endif()
else()
message(STATUS "SIMD: Disabled by user")
message(STATUS "SIMD: disabled (USE_SIMD=OFF); all math uses scalar kernels")
add_compile_definitions(EIDOS_SUPPRESS_SIMD_DISPATCH=1)
endif()

# GSL - adding /usr/local/include so all targets that use GSL_INCLUDES get omp.h
Expand Down Expand Up @@ -535,6 +516,20 @@ if(BUILD_SLIMGUI)
endif(BUILD_SLIMGUI)


# Apply the per-tier SIMD instruction-set flags to the avx2/sse42 tier files.
# This is deliberately done here, after the WIN32 blocks above: those run
# set_source_files_properties(... COMPILE_FLAGS "-include config.h") over every
# source file, and COMPILE_FLAGS is a single string that gets overwritten, not
# appended. Setting our flags last with APPEND_STRING preserves the Windows
# "-include config.h" while still confining AVX2/SSE4.2 to these two files.
if(SIMD_TIER_FLAGS_X86)
    # AVX2+FMA tier: only this translation unit may contain AVX2/FMA code.
    # APPEND_STRING adds to whatever COMPILE_FLAGS string is already set on
    # the file instead of replacing it; the leading space keeps the appended
    # flags separated from any existing content.
    set_property(
        SOURCE "${PROJECT_SOURCE_DIR}/eidos/eidos_simd_avx2.cpp"
        APPEND_STRING
        PROPERTY COMPILE_FLAGS " -mavx2 -mfma"
    )

    # SSE4.2 tier: likewise confined to its own translation unit.
    set_property(
        SOURCE "${PROJECT_SOURCE_DIR}/eidos/eidos_simd_sse42.cpp"
        APPEND_STRING
        PROPERTY COMPILE_FLAGS " -msse4.2"
    )
endif()


# Deal with the PROFILE and PARALLEL flags, which interact and are handled in a complex way.
#
# For SLiMgui, profiling is always on for Release builds, always off for Debug builds; PROFILE does not affect it
Expand Down
5 changes: 5 additions & 0 deletions QtSLiM/QtSLiM.pro
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,11 @@ QMAKE_CFLAGS += $$(CFLAGS)
DEFINES += EIDOS_GUI
DEFINES += SLIMGUI=1

# The Qt Creator (qmake) build does not apply per-file SIMD compiler flags, so
# it builds the scalar SIMD kernels only. The CMake build provides the full
# runtime SIMD dispatch (scalar / SSE4.2 / AVX2+FMA / NEON). See eidos_simd.h.
DEFINES += EIDOS_SUPPRESS_SIMD_DISPATCH


# Uncomment this define to disable the use of OpenGL in SLiMgui completely. This, plus removing the
# link dependency on openglwidgets, should allow you to build SLiMgui without linking OpenGL at all.
Expand Down
5 changes: 5 additions & 0 deletions core/core.pro
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,11 @@ QMAKE_CXXFLAGS += -Xarch_arm64 -DEIDOS_HAS_NEON=1
DEFINES += EIDOS_GUI
DEFINES += SLIMGUI=1

# The Qt Creator (qmake) build does not apply per-file SIMD compiler flags, so
# it builds the scalar SIMD kernels only. The CMake build provides the full
# runtime SIMD dispatch (scalar / SSE4.2 / AVX2+FMA / NEON). See eidos_simd.h.
DEFINES += EIDOS_SUPPRESS_SIMD_DISPATCH

CONFIG -= qt
CONFIG += c++11
CONFIG += c11
Expand Down
11 changes: 11 additions & 0 deletions eidos/eidos.pro
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,11 @@ QMAKE_CXXFLAGS += -Xarch_arm64 -DEIDOS_HAS_NEON=1
DEFINES += EIDOS_GUI
DEFINES += SLIMGUI=1

# The Qt Creator (qmake) build does not apply per-file SIMD compiler flags, so
# it builds the scalar SIMD kernels only. The CMake build provides the full
# runtime SIMD dispatch (scalar / SSE4.2 / AVX2+FMA / NEON). See eidos_simd.h.
DEFINES += EIDOS_SUPPRESS_SIMD_DISPATCH

CONFIG -= qt
CONFIG += c++11
CONFIG += c11
Expand Down Expand Up @@ -115,6 +120,11 @@ SOURCES += \
eidos_property_signature.cpp \
eidos_rng.cpp \
eidos_script.cpp \
eidos_simd.cpp \
eidos_simd_avx2.cpp \
eidos_simd_neon.cpp \
eidos_simd_scalar.cpp \
eidos_simd_sse42.cpp \
eidos_sorting.cpp \
eidos_symbol_table.cpp \
eidos_test.cpp \
Expand Down Expand Up @@ -149,6 +159,7 @@ HEADERS += \
eidos_rng.h \
eidos_script.h \
eidos_simd.h \
eidos_simd_impl.h \
eidos_sorting.h \
eidos_symbol_table.h \
eidos_test_builtins.h \
Expand Down
4 changes: 4 additions & 0 deletions eidos/eidos_globals.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
#include "eidos_class_DataFrame.h"
#include "eidos_class_Image.h"
#include "eidos_class_TestElement.h"
#include "eidos_simd.h"

#include <stdlib.h>
#include <execinfo.h>
Expand Down Expand Up @@ -1163,6 +1164,9 @@ void Eidos_WarmUp(void)
{
been_here = true;

// Detect the CPU and select the SIMD kernel tier; must happen before any Eidos_SIMD kernel is called.
Eidos_SIMD_Init();

// Initialize the random number generator with a random-ish seed. This seed may be overridden by the Context downstream.
Eidos_InitializeRNG();
Eidos_SetRNGSeed(Eidos_GenerateRNGSeed());
Expand Down
131 changes: 131 additions & 0 deletions eidos/eidos_simd.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
//
// eidos_simd.cpp
// Eidos
//
// Created by Andrew Kern on 5/21/2026.
// Copyright (c) 2024-2025 Benjamin C. Haller. All rights reserved.
// A product of the Messer Lab, http://messerlab.org/slim/
//

// This file is part of Eidos.
//
// Eidos is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
//
// Eidos is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License along with Eidos. If not, see <http://www.gnu.org/licenses/>.

/*

SIMD runtime dispatcher. This translation unit is compiled at the plain
baseline ABI (no instruction-set flags); it owns the public Eidos_SIMD
function pointers and selects a tier for them at startup. See eidos_simd.h.

*/

#include "eidos_simd.h"

#include <cstring>


// The public kernel pointers. They are statically initialized to the scalar
// tier so that a call is well-defined even if it somehow happens before
// Eidos_SIMD_Init() runs; the address of a function is a constant expression,
// so there is no static-initialization-order dependency here.
namespace Eidos_SIMD {
// X-macro expansion: for every X(ret, name, params) entry produced by
// EIDOS_SIMD_FUNCTION_TABLE, define a function pointer Eidos_SIMD::name with
// signature `ret params`, initialized to the same-named function in the
// Eidos_SIMD_scalar namespace.
// NOTE(review): EIDOS_SIMD_FUNCTION_TABLE is not defined in this file; it is
// presumably provided by eidos_simd.h -- confirm there.
#define X(ret, name, params) ret (*name) params = &Eidos_SIMD_scalar::name;
EIDOS_SIMD_FUNCTION_TABLE
// X is only meaningful for the expansion above, so drop it immediately.
#undef X
}


// The kernel tiers known to the dispatcher. Eidos_SIMD_SelectTier() maps the
// tier names "scalar", "SSE4.2", "AVX2+FMA" and "NEON" onto these values, and
// Eidos_SIMD_ActiveTierName() maps them back to the same strings.
enum class Eidos_SIMD_Tier { kScalar, kSSE42, kAVX2_FMA, kNEON };

// The tier most recently installed by Eidos_SIMD_SelectTier(). It starts as
// kScalar, matching the static initialization of the kernel pointers above.
static Eidos_SIMD_Tier sActiveTier = Eidos_SIMD_Tier::kScalar;


// Install the kernel tier named by tier_name, if it is built into this binary
// and supported by the running CPU.
//
// tier_name is one of "scalar", "AVX2+FMA", "SSE4.2" or "NEON" (exact,
// case-sensitive match). On success the tier's Fill function is called,
// sActiveTier is updated, and true is returned. If the name is null, is not
// recognized, names a tier that was not compiled in, or names a tier the CPU
// does not support, nothing is changed and false is returned.
bool Eidos_SIMD_SelectTier(const char *tier_name)
{
	// std::strcmp() has undefined behavior for a null pointer; treat a null
	// name like any other unrecognized tier rather than crashing.
	if (tier_name == nullptr)
		return false;
	
	// The scalar tier is built on every platform and always available.
	if (std::strcmp(tier_name, "scalar") == 0)
	{
		Eidos_SIMD_Fill_scalar();
		sActiveTier = Eidos_SIMD_Tier::kScalar;
		return true;
	}
	
#if EIDOS_SIMD_DISPATCH_X86
	// __builtin_cpu_supports() reads CPUID; it is available on GCC and Clang
	// for x86 and works regardless of the flags this file was compiled with.
	// AVX2 and FMA shipped together (Haswell), but we require both explicitly
	// since the AVX2 tier and SLEEF both use FMA instructions.
	if (std::strcmp(tier_name, "AVX2+FMA") == 0)
	{
		if (!(__builtin_cpu_supports("avx2") && __builtin_cpu_supports("fma")))
			return false;
		Eidos_SIMD_Fill_avx2();
		sActiveTier = Eidos_SIMD_Tier::kAVX2_FMA;
		return true;
	}
	if (std::strcmp(tier_name, "SSE4.2") == 0)
	{
		if (!__builtin_cpu_supports("sse4.2"))
			return false;
		Eidos_SIMD_Fill_sse42();
		sActiveTier = Eidos_SIMD_Tier::kSSE42;
		return true;
	}
#endif
	
#if EIDOS_SIMD_DISPATCH_ARM
	// NEON is baseline on every ARM64 CPU, so it is always available here.
	if (std::strcmp(tier_name, "NEON") == 0)
	{
		Eidos_SIMD_Fill_neon();
		sActiveTier = Eidos_SIMD_Tier::kNEON;
		return true;
	}
#endif
	
	// Unknown name, or a tier that is not compiled into this build.
	return false;
}

void Eidos_SIMD_Init(void)
{
	// Probe the candidate tiers from fastest to slowest and stop at the first
	// one Eidos_SIMD_SelectTier() accepts. Nothing is cached between calls, so
	// calling this again simply repeats the probe and installs the same tier;
	// the SIMD self-tests use that to restore normal dispatch after cycling
	// through every tier.
	bool tier_installed = false;
	
#if EIDOS_SIMD_DISPATCH_X86
	// Short-circuit evaluation: SSE4.2 is only tried if AVX2+FMA was refused.
	tier_installed = Eidos_SIMD_SelectTier("AVX2+FMA") || Eidos_SIMD_SelectTier("SSE4.2");
#endif
#if EIDOS_SIMD_DISPATCH_ARM
	if (!tier_installed)
		tier_installed = Eidos_SIMD_SelectTier("NEON");
#endif
	
	// Last resort -- pre-AVX2/pre-SSE4.2 x86, unknown architectures, MSVC, and
	// USE_SIMD=OFF builds all land here: the scalar tier runs on any CPU.
	if (!tier_installed)
		Eidos_SIMD_SelectTier("scalar");
}

const char *Eidos_SIMD_ActiveTierName(void)
{
	// Translate the currently installed tier into its name; the strings are
	// the same ones Eidos_SIMD_SelectTier() accepts.
	if (sActiveTier == Eidos_SIMD_Tier::kAVX2_FMA)
		return "AVX2+FMA";
	if (sActiveTier == Eidos_SIMD_Tier::kSSE42)
		return "SSE4.2";
	if (sActiveTier == Eidos_SIMD_Tier::kNEON)
		return "NEON";
	
	// kScalar, and any value outside the enumerators, reports as scalar.
	return "scalar";
}

bool Eidos_SIMD_SLEEFActive(void)
{
	// Only the AVX2+FMA and NEON tiers have SLEEF transcendentals wired up;
	// every other tier reports false.
	switch (sActiveTier)
	{
		case Eidos_SIMD_Tier::kAVX2_FMA:
		case Eidos_SIMD_Tier::kNEON:
			return true;
		default:
			return false;
	}
}
Loading
Loading