|
50 | 50 | // apple silicon can run most x86-64 instructions, but not necessarily all
|
51 | 51 | #define THRIFT_UTIL_VARINTUTILS_BRANCH_FREE_ENCODER 1
|
52 | 52 | #elif defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_SVE2_BITPERM) && \
|
53 |
| - __has_include(<arm_neon_sve_bridge.h>) |
| 53 | + __has_include(<arm_neon_sve_bridge.h>) && !FOLLY_MOBILE |
54 | 54 | #define THRIFT_UTIL_VARINTUTILS_BRANCH_FREE_ENCODER 1
|
55 | 55 | #else
|
56 | 56 | #define THRIFT_UTIL_VARINTUTILS_BRANCH_FREE_ENCODER 0
|
57 | 57 | #endif
|
58 | 58 |
|
59 | 59 | #if THRIFT_UTIL_VARINTUTILS_BRANCH_FREE_ENCODER && FOLLY_AARCH64
|
| 60 | +#include <arm_neon.h> |
60 | 61 | #include <arm_neon_sve_bridge.h> // @manual
|
61 | 62 | #include <arm_sve.h>
|
62 | 63 | #endif
|
@@ -430,20 +431,102 @@ uint8_t writeVarintUnrolled(Cursor& c, T value) {
|
430 | 431 |
|
431 | 432 | #if THRIFT_UTIL_VARINTUTILS_BRANCH_FREE_ENCODER
|
432 | 433 |
|
| 434 | +#if FOLLY_AARCH64 |
| 435 | + |
| 436 | +template <class Cursor, class T> |
| 437 | +uint8_t writeVarintSve(Cursor& c, T valueS) { |
| 438 | + auto value = folly::to_unsigned(valueS); |
| 439 | + if (FOLLY_LIKELY((value & ~0x7f) == 0)) { |
| 440 | + c.template write<uint8_t>(static_cast<uint8_t>(value)); |
| 441 | + return 1; |
| 442 | + } |
| 443 | + |
| 444 | + if constexpr (sizeof(T) == 1) { |
| 445 | + c.template write<uint16_t>(static_cast<uint16_t>(value | 0x100)); |
| 446 | + return 2; |
| 447 | + } |
| 448 | + |
| 449 | + enum { maxSize = (8 * sizeof(T) + 6) / 7 }; |
| 450 | + c.ensure(maxSize); |
| 451 | + |
| 452 | + svuint8_t bdepMask = svset_neonq_u8(svundef_u8(), vdupq_n_u8(0x7f)); |
| 453 | + uint64x2_t clzMask = vreinterpretq_u64_u8(vdupq_n_u8(0xff)); |
| 454 | + uint64x2_t vec; |
| 455 | + vec[0] = value; |
| 456 | + |
| 457 | + vec = svget_neonq_u64(svbdep_u64( |
| 458 | + svset_neonq_u64(svundef_u64(), vec), svreinterpret_u64_u8(bdepMask))); |
| 459 | + |
| 460 | + svuint64_t clzV; |
| 461 | + uint64x2_t clzMaskV; |
| 462 | + if constexpr (sizeof(T) == 2) { |
| 463 | + clzV = svset_neonq_u64( |
| 464 | + svundef_u64(), |
| 465 | + vreinterpretq_u64_u32(vclzq_u32(vreinterpretq_u32_u64(vec)))); |
| 466 | + clzMaskV = vreinterpretq_u64_u32(svget_neonq_u32(svlsr_u32_x( |
| 467 | + svptrue_b32(), |
| 468 | + svset_neonq_u32(svundef_u32(), vreinterpretq_u32_u64(clzMask)), |
| 469 | + svreinterpret_u32_u64(clzV)))); |
| 470 | + } else { |
| 471 | + clzV = svclz_u64_x(svptrue_b64(), svset_neonq_u64(svundef_u64(), vec)); |
| 472 | + clzMaskV = svget_neonq_u64(svlsr_u64_x( |
| 473 | + svptrue_b64(), svset_neonq_u64(svundef_u64(), clzMask), clzV)); |
| 474 | + } |
| 475 | + |
| 476 | + svuint64_t sizeSV = svlsr_n_u64_x(svptrue_b64(), clzV, 3); |
| 477 | + |
| 478 | + if constexpr (sizeof(T) == 2) { |
| 479 | + sizeSV = svsubr_n_u64_x(svptrue_b64(), sizeSV, 4); |
| 480 | + } else { |
| 481 | + sizeSV = svsubr_n_u64_x(svptrue_b64(), sizeSV, 8); |
| 482 | + } |
| 483 | + |
| 484 | + vec = vreinterpretq_u64_u8(svget_neonq_u8(svorr_n_u8_x( |
| 485 | + svptrue_b8(), |
| 486 | + svset_neonq_u8(svundef_u8(), vreinterpretq_u8_u64(vec)), |
| 487 | + 0x80))); |
| 488 | + |
| 489 | + vec = vandq_u64(vec, clzMaskV); |
| 490 | + |
| 491 | + if constexpr (sizeof(T) == 8) { |
| 492 | + uint8_t orMask = value < (1ull << 56) ? 0 : 0x80; |
| 493 | + uint64x2_t orMaskV = vreinterpretq_u64_u8(vdupq_n_u8(orMask)); |
| 494 | + vec = vorrq_u64(vec, orMaskV); |
| 495 | + } |
| 496 | + |
| 497 | + uint8_t* p = c.writableData(); |
| 498 | + |
| 499 | + if constexpr (sizeof(T) == sizeof(uint16_t)) { |
| 500 | + vst1q_lane_u16( |
| 501 | + reinterpret_cast<uint16_t*>(p), vreinterpretq_u16_u64(vec), 0); |
| 502 | + vst1q_lane_u8(p + 2, vreinterpretq_u8_u64(vec), 2); |
| 503 | + } else if constexpr (sizeof(T) == sizeof(uint32_t)) { |
| 504 | + vst1q_lane_u32( |
| 505 | + reinterpret_cast<uint32_t*>(p), vreinterpretq_u32_u64(vec), 0); |
| 506 | + vst1q_lane_u8(p + 4, vreinterpretq_u8_u64(vec), 4); |
| 507 | + } else { |
| 508 | + vst1q_lane_u64(reinterpret_cast<uint64_t*>(p), vec, 0); |
| 509 | + p[8] = value >> 56; |
| 510 | + p[9] = value >> 63; |
| 511 | + } |
| 512 | + |
| 513 | + uint8_t size = vreinterpretq_u8_u64(svget_neonq_u64(sizeSV))[0]; |
| 514 | + if constexpr (sizeof(T) == 8) { |
| 515 | + size = value < (1ull << 56) ? size : (value >> 63) + 9; |
| 516 | + } |
| 517 | + |
| 518 | + c.append(size); |
| 519 | + return size; |
| 520 | +} |
| 521 | + |
| 522 | +#else |
| 523 | + |
// Deposits the low-order bits of `value` into the bit positions selected by
// `mask`, using the x86 BMI2 PDEP instruction. Used by the branch-free
// encoder to spread 7-bit varint groups into separate bytes.
// NOTE(review): PDEP is the bit-*deposit* (expand) direction; PEXT is the
// compress direction — the name predates this and is kept for callers.
inline uint64_t compressBits(uint64_t value, uint64_t mask) {
  return _pdep_u64(value, mask);
}
|
444 | 527 |
|
445 | 528 | template <class Cursor, class T>
|
446 |
| -uint8_t writeVarintBranchFree(Cursor& c, T valueS) { |
| 529 | +uint8_t writeVarintBranchFreeX86(Cursor& c, T valueS) { |
447 | 530 | auto value = folly::to_unsigned(valueS);
|
448 | 531 | if (FOLLY_LIKELY((value & ~0x7f) == 0)) {
|
449 | 532 | c.template write<uint8_t>(static_cast<uint8_t>(value));
|
@@ -494,6 +577,17 @@ uint8_t writeVarintBranchFree(Cursor& c, T valueS) {
|
494 | 577 | return size;
|
495 | 578 | }
|
496 | 579 |
|
| 580 | +#endif |
| 581 | + |
// Dispatches to the per-architecture branch-free varint encoder:
// the SVE2-based implementation on aarch64, otherwise the BMI2/PDEP-based
// x86 implementation. Returns the number of bytes written.
template <class Cursor, class T>
uint8_t writeVarintBranchFree(Cursor& c, T valueS) {
#if FOLLY_AARCH64
  return writeVarintSve(c, valueS);
#else
  return writeVarintBranchFreeX86(c, valueS);
#endif
}
| 590 | + |
497 | 591 | template <class Cursor, class T>
|
498 | 592 | uint8_t writeVarint(Cursor& c, T value) {
|
499 | 593 | return writeVarintBranchFree(c, value);
|
|
0 commit comments