Skip to content

Commit 0ea0b91

Browse files
Nicoshev and facebook-github-bot
authored and committed
Add writeVarintSve for aarch64 - retry
Summary: Implemented an explicit SVE version of writeVarint. Throughput for 64-bit types shows a ~15% improvement. 16-bit and 32-bit cases seem to show a small improvement as well. All three functions are branch-free; their disassembly can be seen here: https://godbolt.org/z/jG5d8Wfe8

before:
bench_write(u16_any_branch_free) 110.66% 2.00us 500.10K
bench_write(u32_any_branch_free) 126.90% 2.00us 499.37K
bench_write(u64_any_branch_free) 193.56% 2.33us 429.37K
bench_write(u16_1b_branch_free) 99.562% 1.91us 522.97K
bench_write(u16_2b_branch_free) 114.92% 2.00us 500.59K
bench_write(u16_3b_branch_free) 111.66% 2.00us 500.99K
bench_write(u32_1b_branch_free) 97.918% 1.93us 518.38K
bench_write(u32_2b_branch_free) 113.76% 1.99us 502.29K
bench_write(u32_3b_branch_free) 111.14% 1.99us 503.03K
bench_write(u32_4b_branch_free) 115.72% 1.97us 507.52K
bench_write(u32_5b_branch_free) 122.05% 2.00us 498.82K
bench_write(u64_1b_branch_free) 99.089% 1.95us 511.71K
bench_write(u64_2b_branch_free) 90.484% 2.53us 396.00K
bench_write(u64_3b_branch_free) 93.335% 2.38us 419.63K
bench_write(u64_4b_branch_free) 100.61% 2.24us 446.86K
bench_write(u64_5b_branch_free) 123.18% 2.37us 421.24K
bench_write(u64_6b_branch_free) 120.10% 2.33us 429.84K
bench_write(u64_7b_branch_free) 144.69% 2.36us 423.79K
bench_write(u64_8b_branch_free) 149.44% 2.25us 443.92K
bench_write(u64_9b_branch_free) 174.37% 2.31us 433.60K
bench_write(u64_10b_branch_free) 176.81% 2.28us 438.61K
bench_write(exponential_1b_branch_free) 108.05% 1.91us 522.52K
bench_write(exponential_2b_branch_free) 118.34% 1.98us 504.37K
bench_write(exponential_3b_branch_free) 114.22% 1.99us 501.87K

after:
bench_write(u16_any_branch_free) 115.30% 1.97us 507.43K
bench_write(u32_any_branch_free) 130.06% 1.97us 508.40K
bench_write(u64_any_branch_free) 226.45% 1.96us 509.18K
bench_write(u16_1b_branch_free) 101.37% 1.84us 543.01K
bench_write(u16_2b_branch_free) 116.65% 1.97us 508.51K
bench_write(u16_3b_branch_free) 111.17% 1.96us 510.12K
bench_write(u32_1b_branch_free) 99.679% 1.93us 519.42K
bench_write(u32_2b_branch_free) 115.98% 1.98us 506.04K
bench_write(u32_3b_branch_free) 111.45% 1.98us 503.85K
bench_write(u32_4b_branch_free) 116.04% 1.95us 513.18K
bench_write(u32_5b_branch_free) 124.59% 1.97us 508.35K
bench_write(u64_1b_branch_free) 99.669% 1.91us 522.26K
bench_write(u64_2b_branch_free) 117.53% 1.93us 518.86K
bench_write(u64_3b_branch_free) 111.95% 1.95us 511.77K
bench_write(u64_4b_branch_free) 111.29% 1.98us 504.98K
bench_write(u64_5b_branch_free) 124.53% 1.96us 510.52K
bench_write(u64_6b_branch_free) 145.48% 1.90us 526.18K
bench_write(u64_7b_branch_free) 172.51% 1.97us 506.83K
bench_write(u64_8b_branch_free) 174.92% 1.95us 514.13K
bench_write(u64_9b_branch_free) 202.27% 1.97us 508.08K
bench_write(u64_10b_branch_free) 205.43% 1.96us 510.44K
bench_write(exponential_1b_branch_free) 105.67% 1.91us 523.63K
bench_write(exponential_2b_branch_free) 116.10% 1.95us 512.64K
bench_write(exponential_3b_branch_free) 119.08% 1.95us 513.34K

Reviewed By: embg
Differential Revision: D73513003
1 parent ea2855e commit 0ea0b91

File tree

2 files changed

+106
-12
lines changed

2 files changed

+106
-12
lines changed

third-party/thrift/src/thrift/lib/cpp/util/VarintUtils-inl.h

Lines changed: 104 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -50,13 +50,14 @@
5050
// apple silicon can run most x86-64 instructions, but not necessarily all
5151
#define THRIFT_UTIL_VARINTUTILS_BRANCH_FREE_ENCODER 1
5252
#elif defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_SVE2_BITPERM) && \
53-
__has_include(<arm_neon_sve_bridge.h>)
53+
__has_include(<arm_neon_sve_bridge.h>) && !FOLLY_MOBILE
5454
#define THRIFT_UTIL_VARINTUTILS_BRANCH_FREE_ENCODER 1
5555
#else
5656
#define THRIFT_UTIL_VARINTUTILS_BRANCH_FREE_ENCODER 0
5757
#endif
5858

5959
#if THRIFT_UTIL_VARINTUTILS_BRANCH_FREE_ENCODER && FOLLY_AARCH64
60+
#include <arm_neon.h>
6061
#include <arm_neon_sve_bridge.h> // @manual
6162
#include <arm_sve.h>
6263
#endif
@@ -430,20 +431,102 @@ uint8_t writeVarintUnrolled(Cursor& c, T value) {
430431

431432
#if THRIFT_UTIL_VARINTUTILS_BRANCH_FREE_ENCODER
432433

434+
#if FOLLY_AARCH64
435+
436+
// Encodes `valueS` as a varint into cursor `c` using NEON and SVE2 (BDEP)
// intrinsics, and returns the number of bytes appended.
// After the single-byte fast path there is no data-dependent branching on the
// encoded length: the remaining branches are `if constexpr` on sizeof(T).
template <class Cursor, class T>
437+
uint8_t writeVarintSve(Cursor& c, T valueS) {
438+
auto value = folly::to_unsigned(valueS);
439+
// Fast path: the value fits in 7 bits, so it is its own one-byte encoding.
if (FOLLY_LIKELY((value & ~0x7f) == 0)) {
440+
c.template write<uint8_t>(static_cast<uint8_t>(value));
441+
return 1;
442+
}
443+
444+
// A 1-byte type that reaches this point has bit 7 set, so the encoding is
// always two bytes: the value itself followed by 0x01. Both are emitted with
// one 16-bit write (presumably little-endian byte order; confirm against
// Cursor::write).
if constexpr (sizeof(T) == 1) {
445+
c.template write<uint16_t>(static_cast<uint16_t>(value | 0x100));
446+
return 2;
447+
}
448+
449+
// Longest possible encoding for T: ceil(bits / 7) bytes. The stores below
// always write exactly maxSize bytes (3, 5 or 10), regardless of the value.
enum { maxSize = (8 * sizeof(T) + 6) / 7 };
450+
// Presumably guarantees maxSize writable bytes at writableData(); confirm
// against the Cursor implementation.
c.ensure(maxSize);
451+
452+
// 0x7f in every byte: used as the BDEP mask so that consecutive value bits
// land in the low 7 bits of each byte, leaving every top bit clear.
svuint8_t bdepMask = svset_neonq_u8(svundef_u8(), vdupq_n_u8(0x7f));
453+
// All-ones vector; shifted right by the leading-zero count further down.
uint64x2_t clzMask = vreinterpretq_u64_u8(vdupq_n_u8(0xff));
454+
uint64x2_t vec;
455+
// Only lane 0 is assigned here; lane 1 is not initialized by this function.
vec[0] = value;
456+
457+
// Spread the value into 7-bit groups, one group per byte.
vec = svget_neonq_u64(svbdep_u64(
458+
svset_neonq_u64(svundef_u64(), vec), svreinterpret_u64_u8(bdepMask)));
459+
460+
svuint64_t clzV;
461+
uint64x2_t clzMaskV;
462+
// Count leading zeros of the spread value and shift the all-ones mask right
// by that count, so the mask covers bits from the highest set bit downwards.
// The 16-bit path works on 32-bit lanes, all other widths on 64-bit lanes.
if constexpr (sizeof(T) == 2) {
463+
clzV = svset_neonq_u64(
464+
svundef_u64(),
465+
vreinterpretq_u64_u32(vclzq_u32(vreinterpretq_u32_u64(vec))));
466+
clzMaskV = vreinterpretq_u64_u32(svget_neonq_u32(svlsr_u32_x(
467+
svptrue_b32(),
468+
svset_neonq_u32(svundef_u32(), vreinterpretq_u32_u64(clzMask)),
469+
svreinterpret_u32_u64(clzV))));
470+
} else {
471+
clzV = svclz_u64_x(svptrue_b64(), svset_neonq_u64(svundef_u64(), vec));
472+
clzMaskV = svget_neonq_u64(svlsr_u64_x(
473+
svptrue_b64(), svset_neonq_u64(svundef_u64(), clzMask), clzV));
474+
}
475+
476+
// clz / 8 is the number of whole leading zero bytes in the lane.
svuint64_t sizeSV = svlsr_n_u64_x(svptrue_b64(), clzV, 3);
477+
478+
// Reversed subtract: lane width in bytes (4 or 8) minus the leading zero
// bytes gives the encoded length. Only byte 0 of the result is read later.
if constexpr (sizeof(T) == 2) {
479+
sizeSV = svsubr_n_u64_x(svptrue_b64(), sizeSV, 4);
480+
} else {
481+
sizeSV = svsubr_n_u64_x(svptrue_b64(), sizeSV, 8);
482+
}
483+
484+
// Set the continuation bit (0x80) in every byte...
vec = vreinterpretq_u64_u8(svget_neonq_u8(svorr_n_u8_x(
485+
svptrue_b8(),
486+
svset_neonq_u8(svundef_u8(), vreinterpretq_u8_u64(vec)),
487+
0x80)));
488+
489+
// ...then clear everything above the highest payload bit. That removes the
// continuation bit from the final byte and zeroes all bytes after it.
vec = vandq_u64(vec, clzMaskV);
490+
491+
// 64-bit values that need more than 56 bits spill into bytes 8/9, so all
// eight low bytes must carry the continuation bit; OR it back in.
if constexpr (sizeof(T) == 8) {
492+
uint8_t orMask = value < (1ull << 56) ? 0 : 0x80;
493+
uint64x2_t orMaskV = vreinterpretq_u64_u8(vdupq_n_u8(orMask));
494+
vec = vorrq_u64(vec, orMaskV);
495+
}
496+
497+
uint8_t* p = c.writableData();
498+
499+
// Store a fixed number of bytes per type (3, 5 or 10); only `size` of them
// are committed by c.append() below.
if constexpr (sizeof(T) == sizeof(uint16_t)) {
500+
vst1q_lane_u16(
501+
reinterpret_cast<uint16_t*>(p), vreinterpretq_u16_u64(vec), 0);
502+
vst1q_lane_u8(p + 2, vreinterpretq_u8_u64(vec), 2);
503+
} else if constexpr (sizeof(T) == sizeof(uint32_t)) {
504+
vst1q_lane_u32(
505+
reinterpret_cast<uint32_t*>(p), vreinterpretq_u32_u64(vec), 0);
506+
vst1q_lane_u8(p + 4, vreinterpretq_u8_u64(vec), 4);
507+
} else {
508+
vst1q_lane_u64(reinterpret_cast<uint64_t*>(p), vec, 0);
509+
// Byte 8 holds value bits 56..63: bits 56..62 are payload and bit 63 falls
// in the continuation-bit position, which is set exactly when byte 9 is
// needed.
p[8] = value >> 56;
510+
// Byte 9 holds bit 63 alone.
p[9] = value >> 63;
511+
}
512+
513+
// The encoded length is byte 0 of the size vector.
uint8_t size = vreinterpretq_u8_u64(svget_neonq_u64(sizeSV))[0];
514+
// For 64-bit values of 2^56 or more the length is 9, or 10 if bit 63 is set.
if constexpr (sizeof(T) == 8) {
515+
size = value < (1ull << 56) ? size : (value >> 63) + 9;
516+
}
517+
518+
c.append(size);
519+
return size;
520+
}
521+
522+
#else
523+
433524
inline uint64_t compressBits(uint64_t value, uint64_t mask) {
434-
#if FOLLY_X64
435525
return _pdep_u64(value, mask);
436-
#elif FOLLY_AARCH64
437-
// See https://godbolt.org/z/nhc443acd
438-
const auto vec = svbdep_u64(svdup_n_u64(value), svdup_n_u64(mask));
439-
return vgetq_lane_u64(svget_neonq_u64(vec), 0);
440-
#else
441-
static_assert(0, "no pdep-equivalent instruction is available");
442-
#endif // __BMI2__, __ARM_FEATURE_SVE2_BITPERM
443526
}
444527

445528
template <class Cursor, class T>
446-
uint8_t writeVarintBranchFree(Cursor& c, T valueS) {
529+
uint8_t writeVarintBranchFreeX86(Cursor& c, T valueS) {
447530
auto value = folly::to_unsigned(valueS);
448531
if (FOLLY_LIKELY((value & ~0x7f) == 0)) {
449532
c.template write<uint8_t>(static_cast<uint8_t>(value));
@@ -494,6 +577,17 @@ uint8_t writeVarintBranchFree(Cursor& c, T valueS) {
494577
return size;
495578
}
496579

580+
#endif
581+
582+
// Architecture dispatcher for the branch-free varint encoder: forwards to
// writeVarintSve when FOLLY_AARCH64 is set and to writeVarintBranchFreeX86
// otherwise. Returns whatever the selected encoder returns.
template <class Cursor, class T>
583+
uint8_t writeVarintBranchFree(Cursor& c, T valueS) {
584+
#if FOLLY_AARCH64
585+
return writeVarintSve(c, valueS);
586+
#else
587+
return writeVarintBranchFreeX86(c, valueS);
588+
#endif
589+
}
590+
497591
template <class Cursor, class T>
498592
uint8_t writeVarint(Cursor& c, T value) {
499593
return writeVarintBranchFree(c, value);

third-party/thrift/src/thrift/lib/cpp/util/test/VarintUtilsBench.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -235,8 +235,8 @@ BENCHMARK_NAMED_PARAM(bench_read, u64_9b, u64_9b())
235235
BENCHMARK_NAMED_PARAM(bench_read, u64_10b, u64_10b())
236236

237237
BENCHMARK_NAMED_PARAM(bench_read, exponential_1b, exponential_1b())
238-
BENCHMARK_NAMED_PARAM(bench_read, exponential_2b, exponential_1b())
239-
BENCHMARK_NAMED_PARAM(bench_read, exponential_3b, exponential_1b())
238+
BENCHMARK_NAMED_PARAM(bench_read, exponential_2b, exponential_2b())
239+
BENCHMARK_NAMED_PARAM(bench_read, exponential_3b, exponential_3b())
240240

241241
int main(int argc, char** argv) {
242242
folly::Init init(&argc, &argv, true);

0 commit comments

Comments
 (0)