From 9a77bf28e8d18450443311b369344af7724d11b0 Mon Sep 17 00:00:00 2001 From: Jamie Pond <73431532+jamierpond@users.noreply.github.com> Date: Thu, 11 Jul 2024 17:16:02 -0600 Subject: [PATCH 01/11] jp/horizontal-sum --- inc/zoo/swar/associative_iteration.h | 133 ++++++++++++++++++++++++++- pokerbotic/inc/ep/core/SWAR.h | 15 +++ 2 files changed, 144 insertions(+), 4 deletions(-) diff --git a/inc/zoo/swar/associative_iteration.h b/inc/zoo/swar/associative_iteration.h index a515dc38..2b25250c 100644 --- a/inc/zoo/swar/associative_iteration.h +++ b/inc/zoo/swar/associative_iteration.h @@ -2,6 +2,8 @@ #define ZOO_SWAR_ASSOCIATIVE_ITERATION_H #include "zoo/swar/SWAR.h" +#include "zoo/meta/popcount.h" +#include //#define ZOO_DEVELOPMENT_DEBUGGING #ifdef ZOO_DEVELOPMENT_DEBUGGING @@ -392,11 +394,16 @@ template< typename CountHalver > constexpr auto associativeOperatorIterated_regressive( - Base base, Base neutral, IterationCount count, IterationCount forSquaring, - Operator op, unsigned log2Count, CountHalver ch + Base base, + Base neutral, + IterationCount count, + IterationCount forSquaring, + Operator op, + unsigned log2Count, + CountHalver ch ) { - auto result = neutral; - if(!log2Count) { return result; } + auto result = neutral; // sum = 0 + if(!log2Count) { return result; } // NBits per lane for(;;) { result = op(result, base, count); if(!--log2Count) { break; } @@ -535,6 +542,124 @@ constexpr auto halvePrecision(SWAR even, SWAR odd) { return evenHalf | oddHalf; } +template +constexpr auto basic_popcount(T x) { + constexpr auto NBits = T{sizeof(x) * 8}, + One = T{1}; + + auto total = T{0}; + for (auto i = 0; i < NBits; i++) { + total += x & One; + x >>= 1; + } + return total; +} +static_assert(basic_popcount(0b111) == 3); +static_assert(basic_popcount(0xFF) == 8); +static_assert(basic_popcount(0xFF'FF'FF'FF) == 32); +static_assert(basic_popcount(0xFF'FF'FF'FF'FF'FF'FF'FF) == 64); + +template +constexpr auto horsumai( + SWAR input +) { + using S = SWAR; + + auto operation = [](auto sum, auto input, auto counts) { + auto popcount = basic_popcount(input); + sum += popcount * counts; + return sum; + }; + + auto halver = [](auto counts) { + return counts >> 1; + }; + + T base = input.value(); + T neutral = 0; + T count = S::MostSignificantBit; + T forSquaring = S::MostSignificantBit; + + return associativeOperatorIterated_regressive( + base, + neutral, + count, + forSquaring, + operation, + S::NBits, // todo make template + halver + ); +} + +using S = SWAR<8, uint32_t>; +static_assert(horsumai(S{0x01'02'03'04}) == 10); + +template +constexpr auto horizontalSum_reg(S x) { + constexpr auto MSBs = S::MostSignificantBit, + NBits = S::NBits, + InitialSquare = typename S::type { 1 << (NBits - 1)}; + static_assert(InitialSquare == 0b10000000); + + auto sum = 0; + auto square = InitialSquare; + auto value = x.value(); + + for (int i = 0; i < NBits; i++) { + auto msb_masked = value & MSBs; + auto popcount = basic_popcount(msb_masked); + auto value_at_square = popcount * square; + sum += value_at_square; + square >>= 1; + value <<= 1; + } + + return sum; +} + +template +constexpr auto horizontalSum_prog(S x) { + constexpr auto Ones = S::LeastSignificantBit, + NBits = S::NBits, + InitialSquare = typename S::type { 1 }; + + auto sum = 0; + auto square = InitialSquare; + auto value = x.value(); + + for (int i = 0; i < NBits; i++) { + auto msb_masked = value & Ones; + auto popcount = basic_popcount(msb_masked); + auto value_at_square = popcount * square; + sum += value_at_square; + square <<= 1; + value >>= 1; + } + + return sum; +} + +static_assert(S::Lanes == 4); + + +#define HORSUM_TESTS \ + HS_FN(0x01'02'03'04, 10) \ + HS_FN(0x02'02'03'04, 11) \ + HS_FN(0x04'04'04'03, 15) \ + HS_FN(0x04'04'04'04, 16) \ + HS_FN(0x04'03'02'01, 10) + +#define HS_FN(a, b) \ + static_assert(horizontalSum_reg(S{a}) == b); \ + static_assert(horizontalSum_prog(S{a}) == b); \ + // static_assert(horsumai(S{a}) == b); + HORSUM_TESTS + +#undef HS_FN +#undef HORSUM_TESTS + + + } #endif diff --git a/pokerbotic/inc/ep/core/SWAR.h b/pokerbotic/inc/ep/core/SWAR.h index 5154dc1d..598cbd0f 100644 --- a/pokerbotic/inc/ep/core/SWAR.h +++ b/pokerbotic/inc/ep/core/SWAR.h @@ -104,6 +104,21 @@ static_assert(0x210 == popcount<0>(0x320), ""); static_assert(0x4321 == popcount<1>(0xF754), ""); static_assert(0x50004 == popcount<3>(0x3E001122), ""); +static_assert(4 == popcount<1>(0b1111)); +static_assert(3 == popcount<1>(0b1011)); + +static_assert(3 == popcount<1>(0b1011)); +static_assert(8 == popcount<2>(0xFF)); +static_assert(16 == popcount<3>(0xFF'FF)); +static_assert(24 == popcount<4>(0xFF'FF'FF)); +static_assert(32 == popcount<4>(0xFF'FF'FF'FF)); +static_assert(40 == popcount<5>(0xFF'FF'FF'FF'FF)); +static_assert(48 == popcount<5>(0xFF'FF'FF'FF'FF'FF)); +static_assert(55 == popcount<5>(0xFF'FF'FF'FF'FF'FF'FF - 8)); + +// todo eduardo why is this broken? +// static_assert(64 == popcount<6>(0xFF'FF'FF'FF'FF'FF'FF'FF)); + template constexpr typename std::make_unsigned::type msb(T v) { return 8*sizeof(T) - 1 - __builtin_clzll(v); From da19c1dbb31bac577a95d3f03f836b565aa63de4 Mon Sep 17 00:00:00 2001 From: Jamie Pond <73431532+jamierpond@users.noreply.github.com> Date: Thu, 11 Jul 2024 17:27:14 -0600 Subject: [PATCH 02/11] bosh, there you have it --- inc/zoo/swar/associative_iteration.h | 38 +++++++++++++++------------- 1 file changed, 21 insertions(+), 17 deletions(-) diff --git a/inc/zoo/swar/associative_iteration.h b/inc/zoo/swar/associative_iteration.h index 2b25250c..222ea83a 100644 --- a/inc/zoo/swar/associative_iteration.h +++ b/inc/zoo/swar/associative_iteration.h @@ -2,7 +2,6 @@ #define ZOO_SWAR_ASSOCIATIVE_ITERATION_H #include "zoo/swar/SWAR.h" -#include "zoo/meta/popcount.h" #include //#define ZOO_DEVELOPMENT_DEBUGGING @@ -565,20 +564,25 @@ constexpr auto horsumai( ) { using S = SWAR; + constexpr auto MSBs = S::MostSignificantBit, + NBits = S::NBits, + InitialSquare = typename S::type { 1 << (NBits - 1)}; + auto operation = [](auto sum, auto input, auto counts) { - auto popcount = basic_popcount(input); + auto masked = input & MSBs; + auto popcount = basic_popcount(masked); sum += popcount * counts; return sum; }; auto halver = [](auto counts) { - return counts >> 1; + return counts << 1; }; - T base = input.value(); + T base = 0; T neutral = 0; - T count = S::MostSignificantBit; - T forSquaring = S::MostSignificantBit; + T count = input.value(); + T forSquaring = InitialSquare; return associativeOperatorIterated_regressive( base, @@ -592,7 +596,7 @@ constexpr auto horsumai( } using S = SWAR<8, uint32_t>; -static_assert(horsumai(S{0x01'02'03'04}) == 10); +// static_assert(horsumai(S{0x01'02'03'04}) == 10); template constexpr auto horizontalSum_reg(S x) { @@ -601,20 +605,20 @@ constexpr auto horizontalSum_reg(S x) { InitialSquare = typename S::type { 1 << (NBits - 1)}; static_assert(InitialSquare == 0b10000000); - auto sum = 0; - auto square = InitialSquare; - auto value = x.value(); + auto neutral = 0; + auto base = neutral; + auto count = x.value(); - for (int i = 0; i < NBits; i++) { - auto msb_masked = value & MSBs; + for (auto log2Count = NBits;;) { + auto msb_masked = count & MSBs; auto popcount = basic_popcount(msb_masked); - auto value_at_square = popcount * square; - sum += value_at_square; - square >>= 1; - value <<= 1; + base *= 2; + base += popcount; + if (!--log2Count) { break; } + count <<= 1; } - return sum; + return base; } template From 990f58670ad02917a27a55a0c17577efaca3a3ea Mon Sep 17 00:00:00 2001 From: Jamie Pond <73431532+jamierpond@users.noreply.github.com> Date: Thu, 11 Jul 2024 17:47:23 -0600 Subject: [PATCH 03/11] more wip --- inc/zoo/swar/associative_iteration.h | 30 +++++++++++++--------------- 1 file changed, 14 insertions(+), 16 deletions(-) diff --git a/inc/zoo/swar/associative_iteration.h b/inc/zoo/swar/associative_iteration.h index 222ea83a..a888e495 100644 --- a/inc/zoo/swar/associative_iteration.h +++ b/inc/zoo/swar/associative_iteration.h @@ -545,7 +545,6 @@ template constexpr auto basic_popcount(T x) { constexpr auto NBits = T{sizeof(x) * 8}, One = T{1}; - auto total = T{0}; for (auto i = 0; i < NBits; i++) { total += x & One; @@ -557,6 +556,7 @@ static_assert(basic_popcount(0b111) == 3); static_assert(basic_popcount(0xFF) == 8); static_assert(basic_popcount(0xFF'FF'FF'FF) == 32); static_assert(basic_popcount(0xFF'FF'FF'FF'FF'FF'FF'FF) == 64); +static_assert(basic_popcount(0xFF'FF'FF'FF'FF'FF'FF'FF - 2 - 4 - 8) == 61); template constexpr auto horsumai( @@ -565,24 +565,24 @@ constexpr auto horsumai( using S = SWAR; constexpr auto MSBs = S::MostSignificantBit, - NBits = S::NBits, - InitialSquare = typename S::type { 1 << (NBits - 1)}; + NBits = S::NBits; - auto operation = [](auto sum, auto input, auto counts) { - auto masked = input & MSBs; + auto operation = [](auto result, auto base, auto counts) { + auto masked = counts & MSBs; auto popcount = basic_popcount(masked); - sum += popcount * counts; - return sum; + result <<= 1; + result += popcount; + return result; }; auto halver = [](auto counts) { return counts << 1; }; - T base = 0; T neutral = 0; + T base = 0; T count = input.value(); - T forSquaring = InitialSquare; + T forSquaring = 1; return associativeOperatorIterated_regressive( base, @@ -602,23 +602,21 @@ template constexpr auto horizontalSum_reg(S x) { constexpr auto MSBs = S::MostSignificantBit, NBits = S::NBits, - InitialSquare = typename S::type { 1 << (NBits - 1)}; - static_assert(InitialSquare == 0b10000000); + Neutral = typename S::type {0}; - auto neutral = 0; - auto base = neutral; + auto result = Neutral; auto count = x.value(); for (auto log2Count = NBits;;) { auto msb_masked = count & MSBs; auto popcount = basic_popcount(msb_masked); - base *= 2; - base += popcount; + result <<= 1; + result += popcount; if (!--log2Count) { break; } count <<= 1; } - return base; + return result; } template From e91b3bbfd1dc19ba6f4d8b675a55795318803773 Mon Sep 17 00:00:00 2001 From: Jamie Pond <73431532+jamierpond@users.noreply.github.com> Date: Sat, 13 Jul 2024 19:15:17 -0700 Subject: [PATCH 04/11] works with template! --- inc/zoo/swar/associative_iteration.h | 109 ++++++++++++++++----------- 1 file changed, 64 insertions(+), 45 deletions(-) diff --git a/inc/zoo/swar/associative_iteration.h b/inc/zoo/swar/associative_iteration.h index a888e495..ad11ee41 100644 --- a/inc/zoo/swar/associative_iteration.h +++ b/inc/zoo/swar/associative_iteration.h @@ -558,20 +558,18 @@ static_assert(basic_popcount(0xFF'FF'FF'FF) == 32); static_assert(basic_popcount(0xFF'FF'FF'FF'FF'FF'FF'FF) == 64); static_assert(basic_popcount(0xFF'FF'FF'FF'FF'FF'FF'FF - 2 - 4 - 8) == 61); -template -constexpr auto horsumai( - SWAR input -) { - using S = SWAR; - +template +constexpr auto horizontalSum_associativeIteration_regressive(S input) { constexpr auto MSBs = S::MostSignificantBit, - NBits = S::NBits; + NBits = S::NBits, + Neutral = typename S::type {0}, + ForSquaring = Neutral, + Base = Neutral; - auto operation = [](auto result, auto base, auto counts) { - auto masked = counts & MSBs; - auto popcount = basic_popcount(masked); - result <<= 1; - result += popcount; + auto operation = [](auto result, auto base, auto count) { + auto msb_masked = count & MSBs; + auto popcount = basic_popcount(msb_masked); + result += popcount + base; return result; }; @@ -579,45 +577,16 @@ constexpr auto horsumai( return counts << 1; }; - T neutral = 0; - T base = 0; - T count = input.value(); - T forSquaring = 1; - + auto count = input.value(); return associativeOperatorIterated_regressive( - base, - neutral, - count, - forSquaring, - operation, - S::NBits, // todo make template - halver + Base, Neutral, count, ForSquaring, + operation, S::NBits, halver ); } using S = SWAR<8, uint32_t>; // static_assert(horsumai(S{0x01'02'03'04}) == 10); -template -constexpr auto horizontalSum_reg(S x) { - constexpr auto MSBs = S::MostSignificantBit, - NBits = S::NBits, - Neutral = typename S::type {0}; - - auto result = Neutral; - auto count = x.value(); - - for (auto log2Count = NBits;;) { - auto msb_masked = count & MSBs; - auto popcount = basic_popcount(msb_masked); - result <<= 1; - result += popcount; - if (!--log2Count) { break; } - count <<= 1; - } - - return result; -} template constexpr auto horizontalSum_prog(S x) { @@ -643,24 +612,74 @@ constexpr auto horizontalSum_prog(S x) { static_assert(S::Lanes == 4); +template +constexpr auto horizontalSum_reg(S x) { + constexpr auto MSBs = S::MostSignificantBit, + NBits = S::NBits, + Neutral = typename S::type {0}; + auto result = Neutral; + auto count = x.value(); + + auto operation = [](auto result, auto count) { + auto msb_masked = count & MSBs; + auto popcount = basic_popcount(msb_masked); + result <<= 1; + result += popcount; + return result; + }; + + auto halver = [](auto counts) { + return counts << 1; + }; + + for (auto log2Count = NBits;;) { + result = operation(result, count); + if (!--log2Count) { break; } + count = halver(count); + } + + return result; +} + +// template +// auto scottVersion(S input) { +// SumLanes(Popcount(SWAR<4, int64)) == Popcount(SWAR<64, int64>) +// } #define HORSUM_TESTS \ + HS_FN(0x00'00'00'00, 0) \ + HS_FN(0x00'00'00'01, 1) \ + HS_FN(0x00'00'01'00, 1) \ + HS_FN(0x00'00'10'00, 16) \ HS_FN(0x01'02'03'04, 10) \ HS_FN(0x02'02'03'04, 11) \ HS_FN(0x04'04'04'03, 15) \ HS_FN(0x04'04'04'04, 16) \ + HS_FN(0x08'08'08'09, 33) \ + HS_FN(0xFF'FF'FF'FF, 1020) \ HS_FN(0x04'03'02'01, 10) #define HS_FN(a, b) \ static_assert(horizontalSum_reg(S{a}) == b); \ static_assert(horizontalSum_prog(S{a}) == b); \ - // static_assert(horsumai(S{a}) == b); + static_assert(horizontalSum_associativeIteration_regressive(S{a}) == b); HORSUM_TESTS #undef HS_FN #undef HORSUM_TESTS +auto regressive_ai (uint32_t x) { + return horizontalSum_associativeIteration_regressive(S{x}); +} + +auto progresssive (uint32_t x) { + return horizontalSum_prog(S{x}); +} + +auto regressive (uint32_t x) { + return horizontalSum_reg(S{x}); +} } From fcf948a6a74be895a4c349b0a477fd4216b8517f Mon Sep 17 00:00:00 2001 From: Jamie Pond <73431532+jamierpond@users.noreply.github.com> Date: Sun, 14 Jul 2024 17:21:23 -0700 Subject: [PATCH 05/11] add more tests --- inc/zoo/swar/associative_iteration.h | 24 +++++++++++++++++++++++- test/swar/BasicOperations.cpp | 8 ++++++++ 2 files changed, 31 insertions(+), 1 deletion(-) diff --git a/inc/zoo/swar/associative_iteration.h b/inc/zoo/swar/associative_iteration.h index ad11ee41..2c3662e0 100644 --- a/inc/zoo/swar/associative_iteration.h +++ b/inc/zoo/swar/associative_iteration.h @@ -561,7 +561,6 @@ static_assert(basic_popcount(0xFF'FF'FF'FF'FF'FF'FF'FF - 2 - 4 - 8) == template constexpr auto horizontalSum_associativeIteration_regressive(S input) { constexpr auto MSBs = S::MostSignificantBit, - NBits = S::NBits, Neutral = typename S::type {0}, ForSquaring = Neutral, Base = Neutral; @@ -668,6 +667,29 @@ constexpr auto horizontalSum_reg(S x) { #undef HS_FN #undef HORSUM_TESTS +#define ZOO_PP_UNPARENTHESIZE(...) __VA_ARGS__ +#define X(TYPE, av, expected) \ +static_assert(horizontalSum_associativeIteration_regressive(\ + SWAR{\ + Literals,\ + {ZOO_PP_UNPARENTHESIZE av}\ + }) ==\ + expected\ +); + +#define SWAR_TESTS \ + X((32, u64), (2, 1), 3); \ + X((5, u32), (1, 1, 1, 1, 1, 1), 6); \ + X((5, u32), (1, 2, 3, 4, 5, 6), 21); \ + X((5, u32), (6, 5, 4, 3, 2, 1), 21); \ + X((8, u32), (255, 255, 255, 255), 1020); + +SWAR_TESTS + + + +static_assert(horizontalSum_associativeIteration_regressive(SWAR{Literals<8, u32>, {1, 2, 3, 4}}) == 10); + auto regressive_ai (uint32_t x) { return horizontalSum_associativeIteration_regressive(S{x}); diff --git a/test/swar/BasicOperations.cpp b/test/swar/BasicOperations.cpp index 602384ae..bc351a06 100644 --- a/test/swar/BasicOperations.cpp +++ b/test/swar/BasicOperations.cpp @@ -52,6 +52,8 @@ static_assert(\ expected\ ); +static_assert(SWAR{Literals<16, u32>, {1, 2}}.value() == 0x0001'0002); + /* Preserved to illustrate a technique, remove in a few revisions static_assert(SWAR{Literals<32, u64>, {2, 1}}.value() == 0x00000002'00000001); static_assert(SWAR{Literals<32, u64>, {1, 2}}.value() == 0x00000001'00000002); @@ -126,6 +128,11 @@ X(\ (2, 1),\ 0x00000002'00000001\ );\ +X(\ + (8, u32),\ + (255, 255, 255, 255),\ + 0xFF'FF'FF'FF\ +);\ X(\ (32, u64),\ (1, 2),\ @@ -182,6 +189,7 @@ X(\ 0x12\ ) + LITERALS_TESTS From 39ca9e01cf24707fe234e8a37d78ce68969d6ad5 Mon Sep 17 00:00:00 2001 From: Jamie Pond <73431532+jamierpond@users.noreply.github.com> Date: Mon, 15 Jul 2024 00:23:51 -0700 Subject: [PATCH 06/11] upgrade popcount --- inc/zoo/meta/popcount.h | 26 ++++++++ inc/zoo/swar/SWAR.h | 4 ++ inc/zoo/swar/associative_iteration.h | 97 +++++----------------------- 3 files changed, 47 insertions(+), 80 deletions(-) diff --git a/inc/zoo/meta/popcount.h b/inc/zoo/meta/popcount.h index 3b00056b..03555908 100644 --- a/inc/zoo/meta/popcount.h +++ b/inc/zoo/meta/popcount.h @@ -2,6 +2,8 @@ #define ZOO_HEADER_META_POPCOUNT_H #include "zoo/meta/BitmaskMaker.h" +#include +#include namespace zoo { namespace meta { @@ -90,6 +92,30 @@ struct PopcountIntrinsic { }; #endif +template +constexpr auto NumBits() { + return sizeof(T) * 8; +} +static_assert(NumBits() == 16); +static_assert(NumBits() == 64); + +template +constexpr +std::enable_if_t && NumBits() <= 64, T> +basic_popcount(T x) { + if constexpr (NumBits() <= 32) { + return __builtin_popcountl(x); + } else { + return __builtin_popcountll(x); + } +} + +static_assert(basic_popcount(0b111) == 3); +static_assert(basic_popcount(0xFF) == 8); +static_assert(basic_popcount(0xFF'FF'FF'FF) == 32); +static_assert(basic_popcount(0xFF'FF'FF'FF'FF'FF'FF'FF) == 64); +static_assert(basic_popcount(0xFF'FF'FF'FF'FF'FF'FF'FF - 2 - 4 - 8) == 61); + }} #endif diff --git a/inc/zoo/swar/SWAR.h b/inc/zoo/swar/SWAR.h index 60ba9540..2bbad28a 100644 --- a/inc/zoo/swar/SWAR.h +++ b/inc/zoo/swar/SWAR.h @@ -586,6 +586,10 @@ constexpr SWAR logarithmFloor(SWAR v) noexcept { return SWAR{popcounts - ones}; } + + + + static_assert( logarithmFloor(SWAR<8>{0x8040201008040201ull}).value() == 0x0706050403020100ull diff --git a/inc/zoo/swar/associative_iteration.h b/inc/zoo/swar/associative_iteration.h index 2c3662e0..4e0932cd 100644 --- a/inc/zoo/swar/associative_iteration.h +++ b/inc/zoo/swar/associative_iteration.h @@ -1,8 +1,8 @@ #ifndef ZOO_SWAR_ASSOCIATIVE_ITERATION_H #define ZOO_SWAR_ASSOCIATIVE_ITERATION_H +#include "zoo/meta/popcount.h" #include "zoo/swar/SWAR.h" -#include //#define ZOO_DEVELOPMENT_DEBUGGING #ifdef ZOO_DEVELOPMENT_DEBUGGING @@ -541,25 +541,8 @@ constexpr auto halvePrecision(SWAR even, SWAR odd) { return evenHalf | oddHalf; } -template -constexpr auto basic_popcount(T x) { - constexpr auto NBits = T{sizeof(x) * 8}, - One = T{1}; - auto total = T{0}; - for (auto i = 0; i < NBits; i++) { - total += x & One; - x >>= 1; - } - return total; -} -static_assert(basic_popcount(0b111) == 3); -static_assert(basic_popcount(0xFF) == 8); -static_assert(basic_popcount(0xFF'FF'FF'FF) == 32); -static_assert(basic_popcount(0xFF'FF'FF'FF'FF'FF'FF'FF) == 64); -static_assert(basic_popcount(0xFF'FF'FF'FF'FF'FF'FF'FF - 2 - 4 - 8) == 61); - template -constexpr auto horizontalSum_associativeIteration_regressive(S input) { +constexpr auto horizontalSum(S input) { constexpr auto MSBs = S::MostSignificantBit, Neutral = typename S::type {0}, ForSquaring = Neutral, @@ -567,7 +550,7 @@ constexpr auto horizontalSum_associativeIteration_regressive(S input) { auto operation = [](auto result, auto base, auto count) { auto msb_masked = count & MSBs; - auto popcount = basic_popcount(msb_masked); + auto popcount = meta::basic_popcount(msb_masked); result += popcount + base; return result; }; @@ -583,10 +566,6 @@ constexpr auto horizontalSum_associativeIteration_regressive(S input) { ); } -using S = SWAR<8, uint32_t>; -// static_assert(horsumai(S{0x01'02'03'04}) == 10); - - template constexpr auto horizontalSum_prog(S x) { constexpr auto Ones = S::LeastSignificantBit, @@ -609,8 +588,6 @@ constexpr auto horizontalSum_prog(S x) { return sum; } -static_assert(S::Lanes == 4); - template constexpr auto horizontalSum_reg(S x) { constexpr auto MSBs = S::MostSignificantBit, @@ -640,68 +617,28 @@ constexpr auto horizontalSum_reg(S x) { return result; } -// template -// auto scottVersion(S input) { -// SumLanes(Popcount(SWAR<4, int64)) == Popcount(SWAR<64, int64>) -// } - -#define HORSUM_TESTS \ - HS_FN(0x00'00'00'00, 0) \ - HS_FN(0x00'00'00'01, 1) \ - HS_FN(0x00'00'01'00, 1) \ - HS_FN(0x00'00'10'00, 16) \ - HS_FN(0x01'02'03'04, 10) \ - HS_FN(0x02'02'03'04, 11) \ - HS_FN(0x04'04'04'03, 15) \ - HS_FN(0x04'04'04'04, 16) \ - HS_FN(0x08'08'08'09, 33) \ - HS_FN(0xFF'FF'FF'FF, 1020) \ - HS_FN(0x04'03'02'01, 10) - -#define HS_FN(a, b) \ - static_assert(horizontalSum_reg(S{a}) == b); \ - static_assert(horizontalSum_prog(S{a}) == b); \ - static_assert(horizontalSum_associativeIteration_regressive(S{a}) == b); - HORSUM_TESTS - -#undef HS_FN -#undef HORSUM_TESTS - #define ZOO_PP_UNPARENTHESIZE(...) __VA_ARGS__ -#define X(TYPE, av, expected) \ -static_assert(horizontalSum_associativeIteration_regressive(\ - SWAR{\ - Literals,\ - {ZOO_PP_UNPARENTHESIZE av}\ - }) ==\ - expected\ -); - -#define SWAR_TESTS \ +#define X(TYPE, av, expected) \ + static_assert(horizontalSum( \ + SWAR{ \ + Literals, \ + {ZOO_PP_UNPARENTHESIZE av} \ + }) == \ + expected \ + ); + +#define HORIZONTAL_SUM_TESTS \ X((32, u64), (2, 1), 3); \ + X((31, u64), (1, 2), 3); \ X((5, u32), (1, 1, 1, 1, 1, 1), 6); \ X((5, u32), (1, 2, 3, 4, 5, 6), 21); \ X((5, u32), (6, 5, 4, 3, 2, 1), 21); \ + X((5, u32), (6, 5, 4, 3, 2, 1), 21); \ X((8, u32), (255, 255, 255, 255), 1020); -SWAR_TESTS - - - -static_assert(horizontalSum_associativeIteration_regressive(SWAR{Literals<8, u32>, {1, 2, 3, 4}}) == 10); - +HORIZONTAL_SUM_TESTS -auto regressive_ai (uint32_t x) { - return horizontalSum_associativeIteration_regressive(S{x}); -} - -auto progresssive (uint32_t x) { - return horizontalSum_prog(S{x}); -} - -auto regressive (uint32_t x) { - return horizontalSum_reg(S{x}); -} +#undef X } From b1db264f9386428717138db55a857c9900413a21 Mon Sep 17 00:00:00 2001 From: Jamie Pond <73431532+jamierpond@users.noreply.github.com> Date: Tue, 23 Jul 2024 14:33:48 -0700 Subject: [PATCH 07/11] clean slightly --- inc/zoo/swar/associative_iteration.h | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/inc/zoo/swar/associative_iteration.h b/inc/zoo/swar/associative_iteration.h index 4e0932cd..b96749b1 100644 --- a/inc/zoo/swar/associative_iteration.h +++ b/inc/zoo/swar/associative_iteration.h @@ -543,10 +543,14 @@ constexpr auto halvePrecision(SWAR even, SWAR odd) { template constexpr auto horizontalSum(S input) { - constexpr auto MSBs = S::MostSignificantBit, - Neutral = typename S::type {0}, - ForSquaring = Neutral, - Base = Neutral; + constexpr auto + MSBs = S::MostSignificantBit, + Neutral = typename S::type {0}, + ForSquaring = Neutral, + Base = Neutral, + Log2Count = S::NBits; + + auto count = input.value(); auto operation = [](auto result, auto base, auto count) { auto msb_masked = count & MSBs; @@ -559,10 +563,9 @@ constexpr auto horizontalSum(S input) { return counts << 1; }; - auto count = input.value(); return associativeOperatorIterated_regressive( Base, Neutral, count, ForSquaring, - operation, S::NBits, halver + operation, Log2Count, halver ); } From 5d7557fd2a37476ac394b7f693d0b38fe244fb86 Mon Sep 17 00:00:00 2001 From: Jamie Pond <73431532+jamierpond@users.noreply.github.com> Date: Fri, 26 Jul 2024 18:05:38 -0700 Subject: [PATCH 08/11] tidy up horsum --- inc/zoo/swar/associative_iteration.h | 45 ++++++++++++++++++---------- 1 file changed, 30 insertions(+), 15 deletions(-) diff --git a/inc/zoo/swar/associative_iteration.h b/inc/zoo/swar/associative_iteration.h index b96749b1..82f64a7d 100644 --- a/inc/zoo/swar/associative_iteration.h +++ b/inc/zoo/swar/associative_iteration.h @@ -569,6 +569,8 @@ constexpr auto horizontalSum(S input) { ); } +namespace experimental { + template constexpr auto horizontalSum_prog(S x) { constexpr auto Ones = S::LeastSignificantBit, @@ -581,7 +583,7 @@ constexpr auto horizontalSum_prog(S x) { for (int i = 0; i < NBits; i++) { auto msb_masked = value & Ones; - auto popcount = basic_popcount(msb_masked); + auto popcount = meta::basic_popcount(msb_masked); auto value_at_square = popcount * square; sum += value_at_square; square <<= 1; @@ -601,7 +603,7 @@ constexpr auto horizontalSum_reg(S x) { auto operation = [](auto result, auto count) { auto msb_masked = count & MSBs; - auto popcount = basic_popcount(msb_masked); + auto popcount = meta::basic_popcount(msb_masked); result <<= 1; result += popcount; return result; @@ -620,28 +622,41 @@ constexpr auto horizontalSum_reg(S x) { return result; } +} // namespace experimental + + #define ZOO_PP_UNPARENTHESIZE(...) __VA_ARGS__ -#define X(TYPE, av, expected) \ - static_assert(horizontalSum( \ - SWAR{ \ +#define Y(fn, TYPE, values, expected) \ + static_assert(fn( \ + SWAR { \ Literals, \ - {ZOO_PP_UNPARENTHESIZE av} \ + {ZOO_PP_UNPARENTHESIZE values} \ }) == \ expected \ ); -#define HORIZONTAL_SUM_TESTS \ - X((32, u64), (2, 1), 3); \ - X((31, u64), (1, 2), 3); \ - X((5, u32), (1, 1, 1, 1, 1, 1), 6); \ - X((5, u32), (1, 2, 3, 4, 5, 6), 21); \ - X((5, u32), (6, 5, 4, 3, 2, 1), 21); \ - X((5, u32), (6, 5, 4, 3, 2, 1), 21); \ - X((8, u32), (255, 255, 255, 255), 1020); +#define HORIZONTAL_SUM_TESTS(fn) \ + Y(fn, (32, u64), (2, 1), 3) \ + Y(fn, (31, u64), (1, 2), 3) \ + Y(fn, (5, u32), (1, 1, 1, 1, 1, 1), 6) \ + Y(fn, (5, u32), (1, 2, 3, 4, 5, 6), 21) \ + Y(fn, (5, u32), (6, 5, 4, 3, 2, 1), 21) \ + Y(fn, (8, u32), (255, 255, 255, 255), 1020) \ + Y(fn, (8, u32), (255, 254, 255, 255), 1019) \ + Y(fn, (8, u32), (255, 255, 255, 255), 1020) + + +#define HORIZONTAL_SUM_TESTS_ALL \ + HORIZONTAL_SUM_TESTS(horizontalSum) \ + HORIZONTAL_SUM_TESTS(experimental::horizontalSum_prog) \ + HORIZONTAL_SUM_TESTS(experimental::horizontalSum_reg) -HORIZONTAL_SUM_TESTS +HORIZONTAL_SUM_TESTS_ALL #undef X +#undef Y +#undef HORIZONTAL_SUM_TESTS +#undef HORIZONTAL_SUM_TESTS_ALL } From 9cb8202c1c30ec1ab6ff977523949ebaf2fb9df9 Mon Sep 17 00:00:00 2001 From: Jamie Pond <73431532+jamierpond@users.noreply.github.com> Date: Fri, 26 Jul 2024 18:43:05 -0700 Subject: [PATCH 09/11] temp add 20 --- inc/zoo/swar/associative_iteration.h | 108 +++++++++++++++++++++------ test/CMakeLists.txt | 4 +- 2 files changed, 89 insertions(+), 23 deletions(-) diff --git a/inc/zoo/swar/associative_iteration.h b/inc/zoo/swar/associative_iteration.h index 82f64a7d..fd4535c3 100644 --- a/inc/zoo/swar/associative_iteration.h +++ b/inc/zoo/swar/associative_iteration.h @@ -364,7 +364,7 @@ constexpr auto negate(SWAR input) { return fullAddition(~input, Ones).result; } -/// \brief Performs a generalized iterated application of an associative operator to a base +/// \brief Performs a generalized iterated application of an associative operator to a bases /// /// In algebra, the repeated application of an operator to a "base" has different names depending on the /// operator, for example "a + a + a + ... + a" n-times would be called "repeated addition", @@ -412,6 +412,36 @@ constexpr auto associativeOperatorIterated_regressive( return result; } +namespace associative_iteration { + +template< + auto Operator, + auto CountHalver, + typename Base, + typename IterationCount +> +constexpr auto regressive( + Base base, + Base neutral, + IterationCount count, + IterationCount forSquaring, + unsigned log2Count +) { + auto result = neutral; // sum = 0 + if(!log2Count) { return result; } // NBits per lane + for(;;) { + result = Operator(result, base, count); + if(!--log2Count) { break; } + result = Operator(result, result, forSquaring); + count = CountHalver(count); + } + return result; +} + +} + +namespace ai = associative_iteration; + template constexpr auto multiplication_OverflowUnsafe_SpecificBitCount( SWAR multiplicand, SWAR multiplier @@ -541,31 +571,64 @@ constexpr auto halvePrecision(SWAR even, SWAR odd) { return evenHalf | oddHalf; } +namespace associative { + +constexpr auto sum_via_popcount = [](auto result, auto base, auto count, auto msbs) { +}; + + + +} + +namespace count_halving { + +constexpr auto ConsumeMsb = [](auto counts) { + return counts << 1; +}; + +constexpr auto ConsumeLsb = [](auto counts) { + return counts << 1; +}; + +} + +template +auto multiply_and_double_p(S a, S b) { + auto product = a * b; + return doublePrecision(product); +} + template -constexpr auto horizontalSum(S input) { +constexpr auto horizontalSum_lanes(S input) { + auto result = typename S::type {0}; + for (int i = 0; i < S::Lanes; i++) { + } +} + +template +constexpr auto horizontalSum_bits(S input) { constexpr auto MSBs = S::MostSignificantBit, - Neutral = typename S::type {0}, + Neutral = typename S::type {0}, ForSquaring = Neutral, Base = Neutral, Log2Count = S::NBits; auto count = input.value(); - auto operation = [](auto result, auto base, auto count) { + constexpr auto Operation = [](auto result, auto base, auto count) { auto msb_masked = count & MSBs; auto popcount = meta::basic_popcount(msb_masked); result += popcount + base; return result; }; - auto halver = [](auto counts) { - return counts << 1; - }; - - return associativeOperatorIterated_regressive( - Base, Neutral, count, ForSquaring, - operation, Log2Count, halver + return ai::regressive ( + Base, + Neutral, + count, + ForSquaring, + Log2Count ); } @@ -626,13 +689,13 @@ constexpr auto horizontalSum_reg(S x) { #define ZOO_PP_UNPARENTHESIZE(...) __VA_ARGS__ -#define Y(fn, TYPE, values, expected) \ - static_assert(fn( \ - SWAR { \ - Literals, \ - {ZOO_PP_UNPARENTHESIZE values} \ - }) == \ - expected \ +#define Y(fn, TYPE, values, expected) \ + static_assert(fn( \ + SWAR { \ + Literals, \ + {ZOO_PP_UNPARENTHESIZE values} \ + }) == \ + expected \ ); #define HORIZONTAL_SUM_TESTS(fn) \ @@ -646,9 +709,9 @@ constexpr auto horizontalSum_reg(S x) { Y(fn, (8, u32), (255, 255, 255, 255), 1020) -#define HORIZONTAL_SUM_TESTS_ALL \ - HORIZONTAL_SUM_TESTS(horizontalSum) \ - HORIZONTAL_SUM_TESTS(experimental::horizontalSum_prog) \ +#define HORIZONTAL_SUM_TESTS_ALL \ + HORIZONTAL_SUM_TESTS(horizontalSum_bits) \ + HORIZONTAL_SUM_TESTS(experimental::horizontalSum_prog) \ HORIZONTAL_SUM_TESTS(experimental::horizontalSum_reg) HORIZONTAL_SUM_TESTS_ALL @@ -658,6 +721,9 @@ HORIZONTAL_SUM_TESTS_ALL #undef HORIZONTAL_SUM_TESTS #undef HORIZONTAL_SUM_TESTS_ALL + +static_assert(((0x01'01 * 0x05'01) & 0xFF'00) == 0x06'00, "Test failed"); + } #endif diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index b4b4050e..ffb0691b 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -59,7 +59,7 @@ if(MSVC) ${CMAKE_BINARY_DIR}/temporary SOURCES ${CMAKE_SOURCE_DIR}/../compiler_bugs/msvc/sfinae.cpp - CMAKE_FLAGS "-DCMAKE_CXX_STANDARD=17" + CMAKE_FLAGS "-DCMAKE_CXX_STANDARD=20" COMPILE_DEFINITIONS -DTRIGGER_MSVC_SFINAE_BUG OUTPUT_VARIABLE RESULT @@ -84,7 +84,7 @@ if(MSVC) endif() else() # Non-MSVC specific configuration (original content) - set(CMAKE_CXX_STANDARD 17) + set(CMAKE_CXX_STANDARD 20) set(CMAKE_CXX_FLAGS_UBSAN "-fsanitize=undefined -fno-omit-frame-pointer -fno-optimize-sibling-calls -O1 -g") set(CMAKE_CXX_FLAGS_ASAN "-fsanitize=address -fno-omit-frame-pointer") From 3b44ad3dcae61c6243d8357d1bfebaf688ae6a56 Mon Sep 17 00:00:00 2001 From: Jamie Pond <73431532+jamierpond@users.noreply.github.com> Date: Sun, 28 Jul 2024 20:33:57 -0700 Subject: [PATCH 10/11] wip --- inc/zoo/swar/SWAR.h | 16 ++++- inc/zoo/swar/associative_iteration.h | 100 +++++++++++++++------------ test/swar/BasicOperations.cpp | 11 +++ 3 files changed, 78 insertions(+), 49 deletions(-) diff --git a/inc/zoo/swar/SWAR.h b/inc/zoo/swar/SWAR.h index 2bbad28a..4344575c 100644 --- a/inc/zoo/swar/SWAR.h +++ b/inc/zoo/swar/SWAR.h @@ -299,7 +299,7 @@ constexpr auto broadcast(SWAR v) { /// BooleanSWAR treats the MSB of each SWAR lane as the boolean associated with that lane. template -struct BooleanSWAR: SWAR { +struct BooleanSWAR : SWAR { using Base = SWAR; template> @@ -308,8 +308,13 @@ struct BooleanSWAR: SWAR { { this->m_v <<= (NBits - 1); } // Booleanness is stored in the MSBs - static constexpr auto MaskMSB = - broadcast(Base(T(1) << (NBits -1))); + static constexpr auto MaskMSB = []{ + if constexpr (SWAR::Lanes == 1) { + return Base(T{~0}); // all on, no lanes + } + return broadcast(Base(T(1) << (NBits - 1))); + }(); + static constexpr auto AllTrue = MaskMSB; static constexpr auto MaskLSB = broadcast(Base(T(1))); @@ -392,6 +397,11 @@ BooleanSWAR( const bool (&values)[BooleanSWAR::Lanes] ) -> BooleanSWAR; +template +BooleanSWAR( + SWAR arg +) -> BooleanSWAR; + template constexpr BooleanSWAR convertToBooleanSWAR(SWAR arg) noexcept { diff --git a/inc/zoo/swar/associative_iteration.h b/inc/zoo/swar/associative_iteration.h index fd4535c3..48f3fec1 100644 --- a/inc/zoo/swar/associative_iteration.h +++ b/inc/zoo/swar/associative_iteration.h @@ -8,6 +8,7 @@ #ifdef ZOO_DEVELOPMENT_DEBUGGING #include + inline std::ostream &binary(std::ostream &out, uint64_t input, int count) { while(count--) { out << (1 & input); @@ -412,6 +413,25 @@ constexpr auto associativeOperatorIterated_regressive( return result; } +namespace count_halving { + +constexpr auto ConsumeMsb = [](auto counts) { + return counts << 1; +}; + +constexpr auto ConsumeLsb = [](auto counts) { + return counts << 1; +}; + +template +constexpr auto ConsumeMsbLaneWise = [](auto counts) { + auto msbCleared = counts & ~S{S::MostSignificantBit}; + return S{msbCleared.value() << 1}; +}; + + +} + namespace associative_iteration { template< @@ -453,15 +473,10 @@ constexpr auto multiplication_OverflowUnsafe_SpecificBitCount( return left + (addendums & right); }; - auto halver = [](auto counts) { - auto msbCleared = counts & ~S{S::MostSignificantBit}; - return S{msbCleared.value() << 1}; - }; - auto shifted = S{multiplier.value() << (NB - ActualBits)}; return associativeOperatorIterated_regressive( multiplicand, S{0}, shifted, S{S::MostSignificantBit}, operation, - ActualBits, halver + ActualBits, count_halving::ConsumeMsbLaneWise ); } @@ -511,7 +526,7 @@ constexpr auto exponentiation_OverflowUnsafe_SpecificBitCount( exponent = S{static_cast(exponent.value() << (NB - ActualBits))}; return associativeOperatorIterated_regressive( x, - S{meta::BitmaskMaker().value}, // neutral is lane wise.. + S{S::LeastSignificantBit}, exponent, S{S::MostSignificantBit}, operation, @@ -545,10 +560,11 @@ constexpr SWAR doublingMask() { } template + constexpr auto doublePrecision(SWAR input) { using S = SWAR; static_assert( - 0 == S::NSlots % 2, + 0 == S::Lanes % 2, "Precision can only be doubled for SWARs of even element count" ); using RV = SWAR; @@ -559,6 +575,25 @@ constexpr auto doublePrecision(SWAR input) { }; } +template +constexpr +std::enable_if_t= 2 && (S::Lanes % 2) == 0, typename S::type> +horizontalSum_lanes(S s) { + using Next = SWAR; + constexpr auto + Ones = Next::LeastSignificantBit, + ShiftBackAmount = Next::NBits * (Next::Lanes - 1); + + constexpr auto sum = [](auto a) { + return (a.value() * Ones) >> ShiftBackAmount; + }; + + auto [even, odd] = doublePrecision(s); + return sum(even) + sum(odd); +} + +static_assert(horizontalSum_lanes(SWAR { Literals<32, u64>, {2, 1} }) == 3, "Test failed"); + template constexpr auto halvePrecision(SWAR even, SWAR odd) { using S = SWAR; @@ -571,40 +606,12 @@ constexpr auto halvePrecision(SWAR even, SWAR odd) { return evenHalf | oddHalf; } -namespace associative { - -constexpr auto sum_via_popcount = [](auto result, auto base, auto count, auto msbs) { -}; - - - -} - -namespace count_halving { - -constexpr auto ConsumeMsb = [](auto counts) { - return counts << 1; -}; - -constexpr auto ConsumeLsb = [](auto counts) { - return counts << 1; -}; - -} - template auto multiply_and_double_p(S a, S b) { auto product = a * b; return doublePrecision(product); } -template -constexpr auto horizontalSum_lanes(S input) { - auto result = typename S::type {0}; - for (int i = 0; i < S::Lanes; i++) { - } -} - template constexpr auto horizontalSum_bits(S input) { constexpr auto @@ -672,14 +679,10 @@ constexpr auto horizontalSum_reg(S x) { return result; }; - auto halver = [](auto counts) { - return counts << 1; - }; - for (auto log2Count = NBits;;) { result = operation(result, count); if (!--log2Count) { break; } - count = halver(count); + count = count_halving::ConsumeMsb(count); } return result; @@ -700,17 +703,22 @@ constexpr auto horizontalSum_reg(S x) { #define HORIZONTAL_SUM_TESTS(fn) \ Y(fn, (32, u64), (2, 1), 3) \ - Y(fn, (31, u64), (1, 2), 3) \ - Y(fn, (5, u32), (1, 1, 1, 1, 1, 1), 6) \ - Y(fn, (5, u32), (1, 2, 3, 4, 5, 6), 21) \ - Y(fn, (5, u32), (6, 5, 4, 3, 2, 1), 21) \ Y(fn, (8, u32), (255, 255, 255, 255), 1020) \ Y(fn, (8, u32), (255, 254, 255, 255), 1019) \ - Y(fn, (8, u32), (255, 255, 255, 255), 1020) + Y(fn, (8, u32), (255, 255, 255, 255), 1020) \ + Y(fn, (8, u64), (255, 255, 255, 255, 255, 255, 255, 255), 2040) \ + Y(fn, (4, u64), (15, 15, 15, 15, 15, 15, 15, 15, \ + 15, 15, 15, 15, 15, 15, 15, 15), (15 * 16)) \ + Y(fn, (31, u64), (1, 2), 3) \ + Y(fn, (15, u32), (1, 1), 2) \ + Y(fn, (11, u32), (1, 1), 2) // hmm ok some lanes sizes don't work yet, when they are too small + // Y(fn, (5, u32), (1, 2, 3, 4, 5, 6), 21) \ + // Y(fn, (5, u32), (6, 5, 4, 3, 2, 1), 21) \ #define HORIZONTAL_SUM_TESTS_ALL \ HORIZONTAL_SUM_TESTS(horizontalSum_bits) \ + HORIZONTAL_SUM_TESTS(horizontalSum_lanes) \ HORIZONTAL_SUM_TESTS(experimental::horizontalSum_prog) \ HORIZONTAL_SUM_TESTS(experimental::horizontalSum_reg) diff --git a/test/swar/BasicOperations.cpp b/test/swar/BasicOperations.cpp index bc351a06..c4b80c7f 100644 --- a/test/swar/BasicOperations.cpp +++ b/test/swar/BasicOperations.cpp @@ -53,6 +53,9 @@ static_assert(\ ); static_assert(SWAR{Literals<16, u32>, {1, 2}}.value() == 0x0001'0002); +static_assert(SWAR<5, u32>::Lanes == 6); +static_assert(SWAR<8, u32>::Lanes == 4); +static_assert(SWAR<9, u32>::Lanes == 3); /* Preserved to illustrate a technique, remove in a few revisions static_assert(SWAR{Literals<32, u64>, {2, 1}}.value() == 0x00000002'00000001); @@ -122,12 +125,20 @@ static_assert(BS{Literals<4, u16>, {T, F, F, F}}.value() == 0b1000'0000'0000'000 static_assert(SWAR{Literals<8, u16>, {2, 1}}.value() == 0x0201); static_assert(SWAR{Literals<8, u16>, {1, 2}}.value() == 0x0102); */ + +static_assert(SWAR{Literals<5, u32>, {1, 1, 1, 1, 1, 1}}.value() == 0b00001'00001'00001'00001'00001'00001); + #define LITERALS_TESTS \ X(\ (32, u64),\ (2, 1),\ 0x00000002'00000001\ );\ +X(\ + (5, u32),\ + (1, 1, 1, 1, 1, 1),\ + 0b00001'00001'00001'00001'00001'00001\ +);\ X(\ (8, u32),\ (255, 255, 255, 255),\ From 1f6f610e726bbe2105598d16c200a9b5392c86d4 Mon Sep 17 00:00:00 2001 From: Jamie Pond <73431532+jamierpond@users.noreply.github.com> Date: Fri, 6 Sep 2024 11:24:06 -0700 Subject: [PATCH 11/11] wip --- inc/zoo/swar/associative_iteration.h | 52 +++++++++++++++++----------- 1 file changed, 32 insertions(+), 20 deletions(-) diff --git a/inc/zoo/swar/associative_iteration.h b/inc/zoo/swar/associative_iteration.h index 48f3fec1..f78838f5 100644 --- a/inc/zoo/swar/associative_iteration.h +++ b/inc/zoo/swar/associative_iteration.h @@ -1,6 +1,7 @@ #ifndef ZOO_SWAR_ASSOCIATIVE_ITERATION_H #define ZOO_SWAR_ASSOCIATIVE_ITERATION_H +#include "Operations.h" #include "zoo/meta/popcount.h" #include "zoo/swar/SWAR.h" @@ -388,19 +389,13 @@ constexpr auto negate(SWAR input) { /// \param log2Count is to potentially reduce the number of iterations if the caller a-priori knows /// there are fewer iterations than what the type of exponent would allow template< - typename Base, typename IterationCount, typename Operator, - // the critical use of associativity is that it allows halving the - // iteration count - typename CountHalver + typename Base, typename IterationCount, + typename Operator, typename CountHalver > constexpr auto associativeOperatorIterated_regressive( - Base base, - Base neutral, - IterationCount count, - IterationCount forSquaring, - Operator op, - unsigned log2Count, - CountHalver ch + Base base, Base neutral, IterationCount count, + IterationCount forSquaring, Operator op, + unsigned log2Count, CountHalver ch ) { auto result = neutral; // sum = 0 if(!log2Count) { return result; } // NBits per lane @@ -579,10 +574,10 @@ template constexpr std::enable_if_t= 2 && (S::Lanes % 2) == 0, typename S::type> horizontalSum_lanes(S s) { - using Next = SWAR; + using STwiceWider = SWAR; constexpr auto - Ones = Next::LeastSignificantBit, - ShiftBackAmount = Next::NBits * (Next::Lanes - 1); + Ones = STwiceWider::LeastSignificantBit, + ShiftBackAmount = STwiceWider::NBits * (STwiceWider::Lanes - 1); constexpr auto sum = [](auto a) { return (a.value() * Ones) >> ShiftBackAmount; @@ -639,6 +634,20 @@ constexpr auto horizontalSum_bits(S input) { ); } +template +constexpr auto is_odd(T input) { + return input & T{1}; +} + +template +constexpr auto horizontalSum(S input) { + if constexpr (is_odd(S::NBits)) { + return horizontalSum_bits(input); + } else { + return horizontalSum_lanes(input); + } +} + namespace experimental { template @@ -653,7 +662,7 @@ constexpr auto horizontalSum_prog(S x) { for (int i = 0; i < NBits; i++) { auto msb_masked = value & Ones; - auto popcount = meta::basic_popcount(msb_masked); + auto popcount = zoo::meta::basic_popcount(msb_masked); auto value_at_square = popcount * square; sum += value_at_square; square <<= 1; @@ -673,7 +682,7 @@ constexpr auto horizontalSum_reg(S x) { auto operation = [](auto result, auto count) { auto msb_masked = count & MSBs; - auto popcount = meta::basic_popcount(msb_masked); + auto popcount = zoo::meta::basic_popcount(msb_masked); result <<= 1; result += popcount; return result; @@ -711,14 +720,15 @@ constexpr auto horizontalSum_reg(S x) { 15, 15, 15, 15, 15, 15, 15, 15), (15 * 16)) \ Y(fn, (31, u64), (1, 2), 3) \ Y(fn, (15, u32), (1, 1), 2) \ - Y(fn, (11, u32), (1, 1), 2) // hmm ok some lanes sizes don't work yet, when they are too small - // Y(fn, (5, u32), (1, 2, 3, 4, 5, 6), 21) \ - // Y(fn, (5, u32), (6, 5, 4, 3, 2, 1), 21) \ + Y(fn, (11, u32), (1, 1), 2) \ + Y(fn, (5, u32), (1, 2, 3, 4, 5, 6), 21) \ + Y(fn, (5, u32), (6, 5, 4, 3, 2, 1), 21) \ #define HORIZONTAL_SUM_TESTS_ALL \ + HORIZONTAL_SUM_TESTS(horizontalSum) \ HORIZONTAL_SUM_TESTS(horizontalSum_bits) \ - HORIZONTAL_SUM_TESTS(horizontalSum_lanes) \ + // HORIZONTAL_SUM_TESTS(horizontalSum_lanes) /* doesn't work in all by itself */ \ HORIZONTAL_SUM_TESTS(experimental::horizontalSum_prog) \ HORIZONTAL_SUM_TESTS(experimental::horizontalSum_reg) @@ -734,4 +744,6 @@ static_assert(((0x01'01 * 0x05'01) & 0xFF'00) == 0x06'00, "Test failed"); } + #endif +