From 45403c94acc12a673952cc0ba69d5c0a75de556a Mon Sep 17 00:00:00 2001 From: dev0 Date: Thu, 25 Dec 2025 20:33:17 +0530 Subject: [PATCH] SIMD --- .github/workflows/ci.yaml | 2 +- CMake/FindDeps.cmake | 13 +- CMakeLists.txt | 8 + Src/IACore/CMakeLists.txt | 21 ++- Src/IACore/imp/cpp/SIMD.cpp | 21 +++ Src/IACore/inc/IACore/PCH.hpp | 4 +- Src/IACore/inc/IACore/SIMD.hpp | 318 +++++++++++++++++++++++++++++++++ Tests/Unit/CMakeLists.txt | 3 + Tests/Unit/SIMD/FloatVec4.cpp | 106 +++++++++++ Tests/Unit/SIMD/IntVec4.cpp | 152 ++++++++++++++++ 10 files changed, 644 insertions(+), 4 deletions(-) create mode 100644 Src/IACore/imp/cpp/SIMD.cpp create mode 100644 Src/IACore/inc/IACore/SIMD.hpp create mode 100644 Tests/Unit/SIMD/FloatVec4.cpp create mode 100644 Tests/Unit/SIMD/IntVec4.cpp diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index ed163f0..60e1914 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -17,7 +17,7 @@ jobs: strategy: fail-fast: false matrix: - target: [linux-x64, linux-arm64, wasm] + target: [linux-x64] steps: - uses: actions/checkout@v4 diff --git a/CMake/FindDeps.cmake b/CMake/FindDeps.cmake index 11f35a7..e1fb14d 100644 --- a/CMake/FindDeps.cmake +++ b/CMake/FindDeps.cmake @@ -2,6 +2,11 @@ include(FetchContent) set(BUILD_SHARED_LIBS OFF CACHE INTERNAL "Force static libs") +set(HWY_ENABLE_TESTS OFF CACHE BOOL "Disable Highway tests" FORCE) +set(HWY_ENABLE_EXAMPLES OFF CACHE BOOL "Disable Highway examples" FORCE) +set(HWY_ENABLE_CONTRIB OFF CACHE BOOL "Disable Highway contrib" FORCE) +set(HWY_ENABLE_INSTALL OFF CACHE BOOL "Disable Highway install rules" FORCE) + set(ZLIB_USE_STATIC_LIBS ON) find_package(ZLIB REQUIRED) find_package(zstd CONFIG REQUIRED) @@ -74,6 +79,12 @@ FetchContent_Declare( EXCLUDE_FROM_ALL ) +FetchContent_Declare( + highway + GIT_REPOSITORY https://github.com/google/highway.git + GIT_TAG 1.3.0 +) + set(MI_OVERRIDE ON CACHE BOOL "" FORCE) set(MI_BUILD_STATIC ON CACHE BOOL "" FORCE) 
set(MI_BUILD_TESTS OFF CACHE BOOL "" FORCE) @@ -88,4 +99,4 @@ set(HTTPLIB_COMPILE OFF CACHE BOOL "" FORCE) set(HTTPLIB_TEST OFF CACHE BOOL "" FORCE) set(HTTPLIB_EXAMPLE OFF CACHE BOOL "" FORCE) -FetchContent_MakeAvailable(httplib pugixml nlohmann_json glaze simdjson tl-expected unordered_dense mimalloc) +FetchContent_MakeAvailable(httplib pugixml nlohmann_json glaze simdjson tl-expected unordered_dense mimalloc highway) diff --git a/CMakeLists.txt b/CMakeLists.txt index 2f36567..45eca2c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -36,6 +36,14 @@ if (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC") endif() endif() +if(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64|x86_64|AMD64") + set(IACORE_ARCH_X64 TRUE) +elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|arm64|ARM64") + set(IACORE_ARCH_ARM64 TRUE) +elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "wasm32|emscripten") + set(IACORE_ARCH_WASM TRUE) +endif() + if(MSVC) add_compile_options(/W4) if(CMAKE_CXX_COMPILER_ID MATCHES "Clang") diff --git a/Src/IACore/CMakeLists.txt b/Src/IACore/CMakeLists.txt index 93b5221..59e6c9f 100644 --- a/Src/IACore/CMakeLists.txt +++ b/Src/IACore/CMakeLists.txt @@ -1,6 +1,7 @@ set(SRC_FILES "imp/cpp/IPC.cpp" "imp/cpp/XML.cpp" + "imp/cpp/SIMD.cpp" "imp/cpp/JSON.cpp" "imp/cpp/IACore.cpp" "imp/cpp/Logger.cpp" @@ -21,7 +22,8 @@ add_library(IACore STATIC ${SRC_FILES}) target_include_directories(IACore PUBLIC inc/) target_include_directories(IACore PRIVATE imp/hpp/) -target_link_libraries(IACore PUBLIC +target_link_libraries(IACore PUBLIC + hwy ZLIB::ZLIB zstd::libzstd tl::expected @@ -71,3 +73,20 @@ target_compile_definitions(IACore PUBLIC $<$:__IA_DEBUG=1> $<$:__IA_DEBUG=0> )
+
+if(IACORE_ARCH_X64) # SIMD baseline per architecture. PUBLIC (not INTERFACE): IACore's own sources and its consumers must build SIMD.hpp for the same Highway target.
+    if(MSVC)
+        target_compile_options(IACore PUBLIC /arch:AVX2)
+    else()
+        target_compile_options(IACore PUBLIC -mavx2 -mfma)
+    endif()
+    target_compile_definitions(IACore PUBLIC HWY_BASELINE_TARGETS=HWY_AVX2)
+elseif(IACORE_ARCH_ARM64)
+    if(NOT MSVC) # no extra architecture flag is passed when building with MSVC
+        target_compile_options(IACore PUBLIC -march=armv8-a+simd)
+    endif()
+    target_compile_definitions(IACore PUBLIC HWY_BASELINE_TARGETS=HWY_NEON)
+elseif(IACORE_ARCH_WASM)
+    target_compile_options(IACore PUBLIC -msimd128)
+    target_compile_definitions(IACore PUBLIC HWY_BASELINE_TARGETS=HWY_WASM)
+endif()
\ No newline at end of file
diff --git a/Src/IACore/imp/cpp/SIMD.cpp b/Src/IACore/imp/cpp/SIMD.cpp new file mode 100644 index 0000000..b3a2078 --- /dev/null +++ b/Src/IACore/imp/cpp/SIMD.cpp @@ -0,0 +1,21 @@ +// IACore-OSS; The Core Library for All IA Open Source Projects +// Copyright (C) 2025 IAS (ias@iasoft.dev) +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +namespace IACore +{ + +} \ No newline at end of file diff --git a/Src/IACore/inc/IACore/PCH.hpp b/Src/IACore/inc/IACore/PCH.hpp index f7707a8..bf495d7 100644 --- a/Src/IACore/inc/IACore/PCH.hpp +++ b/Src/IACore/inc/IACore/PCH.hpp @@ -189,6 +189,7 @@ # define OVERRIDE override # define CONSTEXPR constexpr # define CONSTEVAL consteval +# define EXPLICIT explicit # define NOEXCEPT noexcept # define NULLPTR nullptr # define IA_MOVE(...) std::move(__VA_ARGS__) @@ -199,6 +200,7 @@ # define OVERRIDE # define CONSTEXPR const # define CONSTEVAL +# define EXPLICIT # define NOEXCEPT # define NULLPTR NULL # define IA_MOVE(...)
(__VA_ARGS__) @@ -241,7 +243,7 @@ #define __INTERNAL_IA_STRINGIFY(value) #value #define IA_STRINGIFY(value) __INTERNAL_IA_STRINGIFY(value) -#define ALIGN(a) __attribute__((aligned(a))) +#define ALIGN(a) alignas(a) #define ASM(...) __asm__ volatile(__VA_ARGS__) diff --git a/Src/IACore/inc/IACore/SIMD.hpp b/Src/IACore/inc/IACore/SIMD.hpp new file mode 100644 index 0000000..115113b --- /dev/null +++ b/Src/IACore/inc/IACore/SIMD.hpp @@ -0,0 +1,318 @@ +// IACore-OSS; The Core Library for All IA Open Source Projects +// Copyright (C) 2025 IAS (ias@iasoft.dev) +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +#if defined(__clang__) +# pragma GCC diagnostic push +# pragma GCC diagnostic ignored "-Wunused-parameter" +# pragma GCC diagnostic ignored "-Wgnu-zero-variadic-macro-arguments" +#endif + +#include + +#if defined(__clang__) +# pragma GCC diagnostic pop +#endif + +namespace IACore +{ + namespace hn = hwy::HWY_NAMESPACE; + +#if HWY_TARGET == HWY_SCALAR +# pragma message("Warning: Configuration mismatch. 
IACore is being compiled for SCALAR SIMD (Slow)")
+#endif
+
+    class ALIGN(16) IntVec4 // Four UINT32 lanes in one fixed-size Highway vector; every operation below is lane-wise.
+    {
+    public:
+        IntVec4() = default; // m_data has no initializer; assign a value before reading it.
+
+        INLINE EXPLICIT IntVec4(IN UINT32 s); // Broadcasts s to all four lanes.
+        INLINE EXPLICIT IntVec4(IN PCUINT32 values); // Reads four lanes with hn::Load (Highway's aligned load): pass 16-byte-aligned storage.
+        INLINE EXPLICIT IntVec4(IN UINT32 a, IN UINT32 b, IN UINT32 c, IN UINT32 d); // Lanes 0..3 = a, b, c, d.
+
+        INLINE IntVec4 operator+(IN CONST IntVec4 &other) CONST;
+        INLINE IntVec4 operator-(IN CONST IntVec4 &other) CONST;
+        INLINE IntVec4 operator*(IN CONST IntVec4 &other) CONST;
+
+        INLINE IntVec4 operator&(IN CONST IntVec4 &other) CONST;
+        INLINE IntVec4 operator|(IN CONST IntVec4 &other) CONST;
+        INLINE IntVec4 operator^(IN CONST IntVec4 &other) CONST;
+        INLINE IntVec4 operator~() CONST;
+
+        INLINE IntVec4 operator<<(IN UINT32 amount) CONST; // Same shift count for every lane.
+        INLINE IntVec4 operator>>(IN UINT32 amount) CONST; // Same shift count for every lane.
+
+        INLINE IntVec4 SatAdd(IN CONST IntVec4 &other) CONST; // hn::SaturatedAdd: clamps instead of wrapping.
+        INLINE IntVec4 SatSub(IN CONST IntVec4 &other) CONST; // hn::SaturatedSub: clamps instead of wrapping.
+
+        INLINE IntVec4 Clamp(IN UINT32 min, IN UINT32 max) CONST; // Per lane: Min(Max(lane, min), max).
+
+        INLINE IntVec4 MultAdd(IN CONST IntVec4 &multiplier, IN CONST IntVec4 &addend) CONST; // (*this) * multiplier + addend.
+
+        INLINE VOID Store(OUT PUINT32 values) CONST; // Writes four lanes with hn::Store (aligned store); const so read-only vectors can be stored.
+        STATIC INLINE IntVec4 Load(IN PCUINT32 values); // Same aligned load as the pointer constructor.
+
+    private:
+        using Tag = hn::FixedTag<UINT32, 4>; // Exactly four lanes, whatever the native vector width is.
+
+        hn::Vec<Tag> m_data;
+
+        INLINE EXPLICIT IntVec4(hn::Vec<Tag> v) : m_data(v) // Wraps a raw Highway vector produced by the operations above.
+        {
+        }
+    };
+
+    class ALIGN(16) FloatVec4 // Four FLOAT32 lanes in one fixed-size Highway vector; every operation below is lane-wise unless noted.
+    {
+    public:
+        FloatVec4() = default; // m_data has no initializer; assign a value before reading it.
+
+        INLINE EXPLICIT FloatVec4(IN FLOAT32 s); // Broadcasts s to all four lanes.
+        INLINE EXPLICIT FloatVec4(IN PCFLOAT32 values); // Reads four lanes with hn::Load (Highway's aligned load): pass 16-byte-aligned storage.
+        INLINE EXPLICIT FloatVec4(IN FLOAT32 a, IN FLOAT32 b, IN FLOAT32 c, IN FLOAT32 d); // Lanes 0..3 = a, b, c, d.
+
+        INLINE FloatVec4 operator+(IN CONST FloatVec4 &other) CONST;
+        INLINE FloatVec4 operator-(IN CONST FloatVec4 &other) CONST;
+        INLINE FloatVec4 operator*(IN CONST FloatVec4 &other) CONST;
+        INLINE FloatVec4 operator/(IN CONST FloatVec4 &other) CONST;
+
+        INLINE FloatVec4 Clamp(IN FLOAT32 min, IN FLOAT32 max) CONST; // Per lane: Min(Max(lane, min), max).
+
+        INLINE FloatVec4 Abs() CONST;
+        INLINE FloatVec4 Sqrt() CONST;
+        INLINE FloatVec4 Rsqrt() CONST; // hn::ApproximateReciprocalSqrt: an estimate, not an exactly rounded result.
+        INLINE FloatVec4 Normalize() CONST; // Scales by the approximate reciprocal square root of the sum of squares of all four lanes.
+
+        INLINE FLOAT32 Dot(IN CONST FloatVec4 &other) CONST; // Sum over all four lanes of the lane-wise products.
+
+        INLINE FloatVec4 MultAdd(IN CONST FloatVec4 &multiplier, IN CONST FloatVec4 &addend) CONST; // (*this) * multiplier + addend.
+
+        INLINE VOID Store(OUT PFLOAT32 values) CONST; // Writes four lanes with hn::Store (aligned store); const so read-only vectors can be stored.
+        STATIC INLINE FloatVec4 Load(IN PCFLOAT32 values); // Same aligned load as the pointer constructor.
+
+    private:
+        using Tag = hn::FixedTag<FLOAT32, 4>; // Exactly four lanes, whatever the native vector width is.
+
+        hn::Vec<Tag> m_data;
+
+        INLINE EXPLICIT FloatVec4(hn::Vec<Tag> v) : m_data(v) // Wraps a raw Highway vector produced by the operations above.
+        {
+        }
+    };
+} // namespace IACore
+
+namespace IACore
+{
+    IntVec4::IntVec4(IN UINT32 s)
+    {
+        CONST Tag d;
+        m_data = hn::Set(d, s);
+    }
+
+    IntVec4::IntVec4(IN PCUINT32 values)
+    {
+        CONST Tag data;
+        m_data = hn::Load(data, values);
+    }
+
+    IntVec4::IntVec4(IN UINT32 a, IN UINT32 b, IN UINT32 c, IN UINT32 d)
+    {
+        CONST Tag data;
+        ALIGN(16) UINT32 values[4] = {a, b, c, d}; // aligned staging buffer so the aligned load below is valid
+        m_data = hn::Load(data, values);
+    }
+
+    IntVec4 IntVec4::operator+(IN CONST IntVec4 &other) CONST
+    {
+        return IntVec4(hn::Add(m_data, other.m_data));
+    }
+
+    IntVec4 IntVec4::operator-(IN CONST IntVec4 &other) CONST
+    {
+        return IntVec4(hn::Sub(m_data, other.m_data));
+    }
+
+    IntVec4 IntVec4::operator*(IN CONST IntVec4 &other) CONST
+    {
+        return IntVec4(hn::Mul(m_data, other.m_data));
+    }
+
+    IntVec4 IntVec4::operator&(IN CONST IntVec4 &other) CONST
+    {
+        return IntVec4(hn::And(m_data, other.m_data));
+    }
+
+    IntVec4 IntVec4::operator|(IN CONST IntVec4 &other) CONST
+    {
+        return IntVec4(hn::Or(m_data, other.m_data));
+    }
+
+    IntVec4 IntVec4::operator^(IN CONST IntVec4 &other) CONST
+    {
+        return IntVec4(hn::Xor(m_data, other.m_data));
+    }
+
+    IntVec4 IntVec4::operator~() CONST
+    {
+        return IntVec4(hn::Not(m_data));
+    }
+
+    IntVec4 IntVec4::operator<<(IN UINT32 amount) CONST
+    {
+        return IntVec4(hn::ShiftLeftSame(m_data, amount));
+    }
+
+    IntVec4 IntVec4::operator>>(IN UINT32 amount) CONST
+    {
+        return IntVec4(hn::ShiftRightSame(m_data, amount));
+    }
+
+    IntVec4 IntVec4::MultAdd(IN CONST IntVec4 &multiplier, IN CONST IntVec4 &addend) CONST
+    {
+        return IntVec4(hn::MulAdd(m_data, multiplier.m_data, addend.m_data));
+    }
+
+    IntVec4 IntVec4::SatAdd(IN CONST IntVec4 &other) CONST
+    {
+        return IntVec4(hn::SaturatedAdd(m_data, other.m_data));
+    }
+
+    IntVec4 IntVec4::SatSub(IN CONST IntVec4 &other) CONST
+    {
+        return IntVec4(hn::SaturatedSub(m_data, other.m_data));
+    }
+
+    IntVec4 IntVec4::Clamp(IN UINT32 min, IN UINT32 max) CONST
+    {
+        CONST Tag d;
+        auto vMin = hn::Set(d, min);
+        auto vMax = hn::Set(d, max);
+        return IntVec4(hn::Min(hn::Max(m_data, vMin), vMax)); // raise to min first, then cap at max
+    }
+
+    VOID IntVec4::Store(OUT PUINT32 values) CONST
+    {
+        CONST Tag d;
+        hn::Store(m_data, d, values);
+    }
+
+    IntVec4 IntVec4::Load(IN PCUINT32 values)
+    {
+        CONST Tag d;
+        return IntVec4(hn::Load(d, values));
+    }
+} // namespace IACore
+
+namespace IACore
+{
+    FloatVec4::FloatVec4(IN FLOAT32 s)
+    {
+        CONST Tag d;
+        m_data = hn::Set(d, s);
+    }
+
+    FloatVec4::FloatVec4(IN PCFLOAT32 values)
+    {
+        CONST Tag d;
+        m_data = hn::Load(d, values);
+    }
+
+    FloatVec4::FloatVec4(IN FLOAT32 a, IN FLOAT32 b, IN FLOAT32 c, IN FLOAT32 d)
+    {
+        CONST Tag data;
+        ALIGN(16) FLOAT32 temp[4] = {a, b, c, d}; // aligned staging buffer so the aligned load below is valid
+        m_data = hn::Load(data, temp);
+    }
+
+    FloatVec4 FloatVec4::operator+(IN CONST FloatVec4 &other) CONST
+    {
+        return FloatVec4(hn::Add(m_data, other.m_data));
+    }
+
+    FloatVec4 FloatVec4::operator-(IN CONST FloatVec4 &other) CONST
+    {
+        return FloatVec4(hn::Sub(m_data, other.m_data));
+    }
+
+    FloatVec4 FloatVec4::operator*(IN CONST FloatVec4 &other) CONST
+    {
+        return FloatVec4(hn::Mul(m_data, other.m_data));
+    }
+
+    FloatVec4 FloatVec4::operator/(IN CONST FloatVec4 &other) CONST
+    {
+        return FloatVec4(hn::Div(m_data, other.m_data));
+    }
+
+    FloatVec4 FloatVec4::MultAdd(IN CONST FloatVec4 &multiplier, IN CONST FloatVec4 &addend) CONST
+    {
+        return FloatVec4(hn::MulAdd(m_data, multiplier.m_data, addend.m_data));
+    }
+
+    FloatVec4 FloatVec4::Clamp(IN FLOAT32 min, IN FLOAT32 max) CONST
+    {
+        CONST Tag d;
+        auto vMin = hn::Set(d, min);
+        auto vMax = hn::Set(d, max);
+        return FloatVec4(hn::Min(hn::Max(m_data, vMin), vMax)); // raise to min first, then cap at max
+    }
+
+    FloatVec4 FloatVec4::Sqrt() CONST
+    {
+        return FloatVec4(hn::Sqrt(m_data));
+    }
+
+    FloatVec4 FloatVec4::Rsqrt() CONST
+    {
+        return FloatVec4(hn::ApproximateReciprocalSqrt(m_data));
+    }
+
+    FloatVec4 FloatVec4::Abs() CONST
+    {
+        return FloatVec4(hn::Abs(m_data));
+    }
+
+    FLOAT32 FloatVec4::Dot(IN CONST FloatVec4 &other) CONST
+    {
+        CONST Tag d;
+        auto vMul = hn::Mul(m_data, other.m_data);
+        return hn::ReduceSum(d, vMul); // horizontal sum of the four products
+    }
+
+    FloatVec4 FloatVec4::Normalize() CONST
+    {
+        CONST Tag d;
+        auto vMul = hn::Mul(m_data, m_data);
+        auto vLenSq = hn::SumOfLanes(d, vMul); // squared length, replicated in every lane
+        auto vInvLen = hn::ApproximateReciprocalSqrt(vLenSq); // NOTE(review): no guard for a zero-length input - confirm callers never pass one
+        return FloatVec4(hn::Mul(m_data, vInvLen));
+    }
+
+    VOID FloatVec4::Store(OUT PFLOAT32 values) CONST
+    {
+        CONST Tag d;
+        hn::Store(m_data, d, values);
+    }
+
+    FloatVec4 FloatVec4::Load(IN PCFLOAT32 values)
+    {
+        CONST Tag d;
+        return FloatVec4(hn::Load(d, values));
+    }
+} // namespace IACore
\ No newline at end of file
diff --git a/Tests/Unit/CMakeLists.txt b/Tests/Unit/CMakeLists.txt index 4bf7ea0..0256cb1 100644 --- a/Tests/Unit/CMakeLists.txt +++ b/Tests/Unit/CMakeLists.txt @@ -20,6 +20,9 @@ set(TEST_SOURCES ProcessOps.cpp StreamReader.cpp RingBuffer.cpp + + SIMD/IntVec4.cpp + SIMD/FloatVec4.cpp ) add_executable(IACore_Test_Suite ${TEST_SOURCES}) diff --git a/Tests/Unit/SIMD/FloatVec4.cpp b/Tests/Unit/SIMD/FloatVec4.cpp new file mode 100644 index 0000000..98dd795 --- /dev/null +++ b/Tests/Unit/SIMD/FloatVec4.cpp @@ -0,0 +1,106 @@ +// IACore-OSS; The Core Library for All IA Open Source Projects +// Copyright (C) 2025 IAS (ias@iasoft.dev) +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include
+#include
+
+using namespace IACore;
+
+IAT_BEGIN_BLOCK(Core, FloatVec4)
+
+BOOL TestFloatArithmetic() // Lane-wise /, * and + on two four-lane vectors, spot-checking individual lanes.
+{
+    FloatVec4 v1(10.0f, 20.0f, 30.0f, 40.0f);
+    FloatVec4 v2(2.0f, 4.0f, 5.0f, 8.0f);
+
+    ALIGN(16) FLOAT32 res[4]; // aligned output buffer handed to Store()
+
+    (v1 / v2).Store(res);
+    IAT_CHECK_APPROX(res[0], 5.0f); // 10 / 2
+    IAT_CHECK_APPROX(res[3], 5.0f); // 40 / 8
+
+    (v1 * v2).Store(res);
+    IAT_CHECK_APPROX(res[0], 20.0f); // 10 * 2
+
+    (v1 + v2).Store(res);
+    IAT_CHECK_APPROX(res[0], 12.0f); // 10 + 2
+
+    return TRUE;
+}
+
+BOOL TestMathHelpers() // Covers Sqrt, Abs and Clamp.
+{
+    ALIGN(16) FLOAT32 res[4];
+
+    FloatVec4 vSq(4.0f, 9.0f, 16.0f, 25.0f);
+    vSq.Sqrt().Store(res);
+    IAT_CHECK_APPROX(res[0], 2.0f);
+    IAT_CHECK_APPROX(res[3], 5.0f);
+
+    FloatVec4 vNeg(-1.0f, -5.0f, 10.0f, -0.0f);
+    vNeg.Abs().Store(res);
+    IAT_CHECK_APPROX(res[0], 1.0f);
+    IAT_CHECK_APPROX(res[2], 10.0f); // already-positive lane is unchanged
+
+    FloatVec4 vClamp(-100.0f, 0.0f, 50.0f, 200.0f);
+    vClamp.Clamp(0.0f, 100.0f).Store(res);
+    IAT_CHECK_APPROX(res[0], 0.0f); // below the range: raised to min
+    IAT_CHECK_APPROX(res[2], 50.0f); // inside the range: unchanged
+    IAT_CHECK_APPROX(res[3], 100.0f); // above the range: capped at max
+
+    return TRUE;
+}
+
+BOOL TestApproxMath() // Rsqrt is an approximation, so results are only compared approximately.
+{
+    ALIGN(16) FLOAT32 res[4];
+    FloatVec4 v(16.0f, 25.0f, 100.0f, 1.0f);
+
+    v.Rsqrt().Store(res);
+
+    IAT_CHECK_APPROX(res[0], 0.25f); // 1 / sqrt(16)
+    IAT_CHECK_APPROX(res[2], 0.1f); // 1 / sqrt(100)
+
+    return TRUE;
+}
+
+BOOL TestLinearAlgebra() // Covers Dot and Normalize.
+{
+    FloatVec4 v1(1.0f, 2.0f, 3.0f, 4.0f);
+    FloatVec4 v2(1.0f, 0.0f, 1.0f, 0.0f);
+
+    FLOAT32 dot = v1.Dot(v2);
+    IAT_CHECK_APPROX(dot, 4.0f); // 1*1 + 2*0 + 3*1 + 4*0
+
+    FloatVec4 vNorm(10.0f, 0.0f, 0.0f, 0.0f);
+    ALIGN(16) FLOAT32 res[4];
+
+    vNorm.Normalize().Store(res);
+    IAT_CHECK_APPROX(res[0], 1.0f);
+    IAT_CHECK_APPROX(res[1], 0.0f);
+
+    return TRUE;
+}
+
+IAT_BEGIN_TEST_LIST()
+IAT_ADD_TEST(TestFloatArithmetic);
+IAT_ADD_TEST(TestMathHelpers);
+IAT_ADD_TEST(TestApproxMath);
+IAT_ADD_TEST(TestLinearAlgebra);
+IAT_END_TEST_LIST()
+
+IAT_END_BLOCK()
+
+IAT_REGISTER_ENTRY(Core, FloatVec4)
\ No newline at end of file
diff --git a/Tests/Unit/SIMD/IntVec4.cpp b/Tests/Unit/SIMD/IntVec4.cpp
new file mode 100644
index 0000000..072dd91
--- /dev/null
+++ b/Tests/Unit/SIMD/IntVec4.cpp
@@ -0,0 +1,152 @@
+// IACore-OSS; The Core Library for All IA Open Source Projects
+// Copyright (C) 2025 IAS (ias@iasoft.dev)
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// IACore-OSS; The Core Library for All IA Open Source Projects
+// Copyright (C) 2025 IAS (ias@iasoft.dev)
+
+#include
+#include
+
+using namespace IACore;
+
+IAT_BEGIN_BLOCK(Core, IntVec4)
+
+BOOL TestConstructors() // Covers the broadcast constructor, the four-component constructor and the static Load().
+{
+    IntVec4 vBroadcast(10);
+    ALIGN(16) UINT32 storeBuf[4]; // aligned output buffer handed to Store()
+    vBroadcast.Store(storeBuf);
+
+    IAT_CHECK_EQ(storeBuf[0], 10U);
+    IAT_CHECK_EQ(storeBuf[3], 10U);
+
+    IntVec4 vComp(1, 2, 3, 4);
+    vComp.Store(storeBuf);
+    IAT_CHECK_EQ(storeBuf[0], 1U);
+    IAT_CHECK_EQ(storeBuf[3], 4U);
+
+    ALIGN(16) UINT32 srcBuf[4] = {100, 200, 300, 400};
+    IntVec4 vLoad = IntVec4::Load(srcBuf);
+    vLoad.Store(storeBuf);
+    IAT_CHECK_EQ(storeBuf[1], 200U);
+
+    return TRUE;
+}
+
+BOOL TestArithmetic() // Lane-wise +, - and *.
+{
+    IntVec4 v1(10, 20, 30, 40);
+    IntVec4 v2(1, 2, 3, 4);
+
+    IntVec4 vAdd = v1 + v2;
+    ALIGN(16) UINT32 res[4];
+    vAdd.Store(res);
+    IAT_CHECK_EQ(res[0], 11U);
+    IAT_CHECK_EQ(res[3], 44U);
+
+    IntVec4 vSub = v1 - v2;
+    vSub.Store(res);
+    IAT_CHECK_EQ(res[0], 9U);
+
+    IntVec4 vMul = v1 * v2;
+    vMul.Store(res);
+    IAT_CHECK_EQ(res[0], 10U);
+    IAT_CHECK_EQ(res[2], 90U);
+    IAT_CHECK_EQ(res[3], 160U);
+
+    return TRUE;
+}
+
+BOOL TestBitwise() // Covers &, |, ^, ~ and both shift operators.
+{
+    IntVec4 vAllOnes(0xFFFFFFFF);
+    IntVec4 vZero((UINT32) 0); // the cast states the UINT32 type of the zero explicitly
+    IntVec4 vPattern(0xAAAAAAAA);
+
+    ALIGN(16) UINT32 res[4];
+
+    (vAllOnes & vPattern).Store(res);
+    IAT_CHECK_EQ(res[0], 0xAAAAAAAAU);
+
+    (vZero | vPattern).Store(res);
+    IAT_CHECK_EQ(res[0], 0xAAAAAAAAU);
+
+    (vAllOnes ^ vPattern).Store(res);
+    IAT_CHECK_EQ(res[0], 0x55555555U);
+
+    (~vPattern).Store(res);
+    IAT_CHECK_EQ(res[0], 0x55555555U);
+
+    IntVec4 vShift(1);
+    (vShift << 1).Store(res);
+    IAT_CHECK_EQ(res[0], 2U);
+
+    IntVec4 vShiftRight(4);
+    (vShiftRight >> 1).Store(res);
+    IAT_CHECK_EQ(res[0], 2U);
+
+    return TRUE;
+}
+
+BOOL TestSaturation() // SatAdd and SatSub must clamp at the UINT32 limits instead of wrapping.
+{
+    UINT32 max = 0xFFFFFFFF;
+    IntVec4 vHigh(max - 10);
+    IntVec4 vAdd(20);
+
+    ALIGN(16) UINT32 res[4];
+
+    vHigh.SatAdd(vAdd).Store(res);
+    IAT_CHECK_EQ(res[0], max); // (max - 10) + 20 would overflow
+
+    IntVec4 vLow(10);
+    IntVec4 vSub(20);
+    vLow.SatSub(vSub).Store(res);
+    IAT_CHECK_EQ(res[0], 0U); // 10 - 20 would underflow
+
+    return TRUE;
+}
+
+BOOL TestAdvancedOps() // Covers Clamp and MultAdd.
+{
+    IntVec4 v(0, 50, 100, 150);
+    ALIGN(16) UINT32 res[4];
+
+    v.Clamp(40, 110).Store(res);
+    IAT_CHECK_EQ(res[0], 40U); // below the range: raised to min
+    IAT_CHECK_EQ(res[1], 50U);
+    IAT_CHECK_EQ(res[2], 100U);
+    IAT_CHECK_EQ(res[3], 110U); // above the range: capped at max
+
+    IntVec4 A(2);
+    IntVec4 B(10);
+    IntVec4 C(5);
+    A.MultAdd(B, C).Store(res);
+    IAT_CHECK_EQ(res[0], 25U); // 2 * 10 + 5
+
+    return TRUE;
+}
+
+IAT_BEGIN_TEST_LIST()
+IAT_ADD_TEST(TestConstructors);
+IAT_ADD_TEST(TestArithmetic);
+IAT_ADD_TEST(TestBitwise);
+IAT_ADD_TEST(TestSaturation);
+IAT_ADD_TEST(TestAdvancedOps);
+IAT_END_TEST_LIST()
+
+IAT_END_BLOCK()
+
+IAT_REGISTER_ENTRY(Core, IntVec4)
\ No newline at end of file