diff --git a/benchmark/c_str-functions/c_str.cpp b/benchmark/c_str-functions/c_str.cpp index 645f2753..d3c1e37b 100644 --- a/benchmark/c_str-functions/c_str.cpp +++ b/benchmark/c_str-functions/c_str.cpp @@ -554,3 +554,132 @@ std::size_t neon_strlen(const char *str) { } #endif + +namespace zoo { + +//template +int c_strCmp(const char *a, const char *b) { + using S = swar::SWAR<8>; + S as, bs; + auto [aB, aM] = blockAlignedLoad(a, &as); + auto [bB, bM] = blockAlignedLoad(b, &bs); + + auto misalignmentDifference = aM - bM; + // to establish the loop invariant there is the need to fill the + // bytes of the blocks that do not belong to the inputs. + // the bytes that do not belong are those up to the misalignment. + // let's say: + // **NOTE: THE DIAGRAMS ARE IN LITTLE ENDIAN!** + // [ a0, a1, a2, a3, a4, a5, a6, a7 ] + // ^ ^ misalignment = 3 + // | base of A + // [ b0, b1, b2, b3, b4, b5, b6, b7 ] + // ^ ^ misalignment of b = 2 + // | base of B + // The bytes that really belong to A are + // [ ?, ?, ?, a3, a4, a5, a6, a7 ] + // To avoid the first three bytes interfering in the comparison, we + // fill them with lanes of all ones: + // [ ~0, ~0, ~0, a3, a4, a5, a6, a7 ], for this, we do this: + // [ 0, 0, 0, ~0, ..., ~0 ] = S{S::AllOnes}.shiftLanesLeft(3) = SLL + // [ ~0, ~0, ~0, 0, ..., 0 ] = ~SLL + // [ ~0, ~0, ~0, a3, a4, a5, a6, a7 ] = ASL | ~SLL + // now, we can use all the bytes in as. 
+ // We need to do something similar for bs, but because bs is less misaligned + // we will process the bytes we can in this iteration, but we have to + // leave a remainder: + // [ 0, b0, b1, b2, b3, b4, b5, b6 ] = bs.shiftLanesLeft(3 - 2) = BSL + // [~0, ~0, ~0, b2, b3, b4, b5, b6 ] = BSL | ~SLL + // [ ?, ?, ?, ?, ?, ?, ?, b6 ] = remainder for the next iteration + + // The prefix mma means "more mis-aligned", lma "less mis-aligned" + const char *mmaBase, *lmaBase; + S mmaBytes, lmaBytes, lmaRemainder; + int returnMultiplier; + auto loopInvariantMaker = + [&]( + auto largerMisalignment, + auto mmaBa, auto mmaBy, auto lmaBa, auto lmaBy, + int reM + ) { + // a is more misaligned than b, a provides less bytes + auto initialFiller = + ~S{S::AllOnes}.shiftLanesLeft(largerMisalignment); + mmaBase = mmaBa; + mmaBytes = mmaBy | initialFiller; + lmaBase = lmaBa; + auto lmaAdjusted = lmaBy.shiftLanesLeft(misalignmentDifference); + lmaBytes = lmaAdjusted | initialFiller; + lmaRemainder = + lmaBy | + S{S::AllOnes}.shiftLanesRight(S::Lanes - misalignmentDifference); + returnMultiplier = reM; + }; + if(0 <= misalignmentDifference) { + loopInvariantMaker(aM, aB, as, bB, bs, 1); + } else { + misalignmentDifference = -misalignmentDifference; + loopInvariantMaker(bM, bB, bs, aB, as, -1); + } + auto nulls = [](S bytes) { + return swar::constantIsGreaterEqual<0>(bytes); + }; + for(;;) { + // invariant: + // 1. ready to compare mmaBytes with lmaBytes + // 2. there is at least one byte of input in both mmaBytes and lmaBytes + // 3. mmaBytes and lmaBytes are equal + // 4. There is no null in the bytes + // 5. There is no null in the significant bytes in the remainder + // Step 1: determine if the swars are different + auto exor = mmaBytes ^ lmaBytes; + if(exor.value()) { + // There is a difference. Will terminate + // There are several cases. + // Is any string terminated? 
+ auto + mNulls = nulls(mmaBytes), + lNulls = nulls(lmaBytes); + auto thereIsANull = mNulls | lNulls; + auto returner = + [&](S s) { + auto firstNullIndex = s.lsbIndex(); + auto + comparison = mmaBytes - lmaBytes, + inLeast = comparison.shiftLanesRight(firstNullIndex), + onlyLeast = inLeast & S{S::LeastSignificantLaneMask}; + return returnMultiplier * int8_t(onlyLeast.value()); + }; + if(thereIsANull) { + return returner(thereIsANull); + } + auto diffs = swar::constantIsGreaterEqual<0>(exor); + return returner(diffs); + } + // despite equality, we might have reached the end of the strings, + // this needs to be tested explicitly + if(nulls(mmaBytes)) { return 0; } + // preparation of the next iteration, grab a block from mmaBase + mmaBase += sizeof(S); + memcpy(&mmaBytes.m_v, mmaBase, sizeof(S)); + // there can be a null in the lmaRemainder, thus we can't just + // load more bytes + if(nulls(lmaRemainder)) { + // prepare the next iteration knowing it will terminate: + lmaBytes = + lmaRemainder.shiftLanesRight(S::Lanes - misalignmentDifference); + continue; + } + lmaBase += sizeof(S); + auto remShifted = + lmaRemainder.shiftLanesRight(S::Lanes - misalignmentDifference); + memcpy(&lmaRemainder.m_v, lmaBase, sizeof(S)); + auto newBytes = lmaRemainder.shiftLanesLeft(misalignmentDifference); + lmaBytes = remShifted | newBytes; + // note: if there are nulls in the lmaRemainder part that was + // copied to newBytes, they will be compared against mmaBytes + // and thus taken into account + } +} + +} diff --git a/inc/zoo/pp/platform.h b/inc/zoo/pp/platform.h index e6120667..886a4de2 100644 --- a/inc/zoo/pp/platform.h +++ b/inc/zoo/pp/platform.h @@ -20,8 +20,10 @@ #endif #ifdef _MSC_VER +#define ZOO_WINDOWS_BUILD() 1 #define MSVC_EMPTY_BASES __declspec(empty_bases) #else +#define ZOO_WINDOWS_BUILD() 0 #define MSVC_EMPTY_BASES #endif diff --git a/inc/zoo/swar/SWAR.h b/inc/zoo/swar/SWAR.h index 60ba9540..e41c8876 100644 --- a/inc/zoo/swar/SWAR.h +++ b/inc/zoo/swar/SWAR.h @@ 
-3,6 +3,7 @@ /// \file Swar.h SWAR operations #include "zoo/meta/log.h" +#include "zoo/pp/platform.h" #include #include @@ -39,9 +40,36 @@ constexpr uint64_t popcount(uint64_t a) noexcept { >::execute(a); } +template +struct ToUnsigned_impl { + using type = T; +}; + +template +struct ToUnsigned_impl>> { + using type = std::make_unsigned_t; +}; + +#if !ZOO_WINDOWS_BUILD() +template<> +struct ToUnsigned_impl<__int128_t, void> { + using type = __uint128_t; +}; + +template<> +struct ToUnsigned_impl<__uint128_t, void> { + using type = __uint128_t; +}; +#endif + +static_assert(std::is_same_v::type, unsigned>); + +template +using ToUnsigned = typename ToUnsigned_impl::type; + /// Index into the bits of the type T that contains the MSB. template -constexpr std::make_unsigned_t msbIndex(T v) noexcept { +constexpr ToUnsigned msbIndex(T v) noexcept { return meta::logFloor(v); } @@ -49,12 +77,12 @@ constexpr std::make_unsigned_t msbIndex(T v) noexcept { /// /// \todo incorporate __builtin_ctzg when it is more widely available template -constexpr std::make_unsigned_t lsbIndex(T v) noexcept { +constexpr ToUnsigned lsbIndex(T v) noexcept { // This check should be SFINAE, but supporting all sorts // of base types is an ongoing task, we put a bare-minimum // temporary preventive measure with static_assert static_assert(sizeof(T) <= 8, "Unsupported"); - #ifdef _MSC_VER + #if ZOO_WINDOWS_BUILD() // ~v & (v - 1) turns on all trailing zeroes, zeroes the rest return meta::logFloor(1 + (~v & (v - 1))); #else @@ -62,7 +90,7 @@ constexpr std::make_unsigned_t lsbIndex(T v) noexcept { #endif } -#ifndef _MSC_VER +#if !ZOO_WINDOWS_BUILD() constexpr __uint128_t lsbIndex(__uint128_t v) noexcept { auto low = (v << 64) >> 64; if(low) { return __builtin_ctzll(low); } @@ -77,7 +105,9 @@ constexpr __uint128_t lsbIndex(__uint128_t v) noexcept { /// Certain computational workloads can be materially sped up using SWAR techniques. 
template struct SWAR { - using type = std::make_unsigned_t; + using type = + // std::make_unsigned_t; + ToUnsigned; constexpr static auto Literal = Literals; constexpr static inline type NBits = NBits_,