Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions docs/source/api/reducer_index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@ Reduction operators
+---------------------------------------+----------------------------------------------------+
| :cpp:func:`reduce_min` | min of the batch elements |
+---------------------------------------+----------------------------------------------------+
| :cpp:func:`reduce_mul` | product of the batch elements |
+---------------------------------------+----------------------------------------------------+
| :cpp:func:`haddp` | horizontal sum across batches |
+---------------------------------------+----------------------------------------------------+

Expand Down
14 changes: 0 additions & 14 deletions include/xsimd/arch/common/xsimd_common_arithmetic.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -139,20 +139,6 @@ namespace xsimd
return fma(x, y, select(mask, neg(z), z));
}

// hadd
// Architecture-independent horizontal add: the batch is spilled to an aligned
// scratch array and its lanes are summed one after another in storage order.
// The unnamed third template parameter carries no default here; the
// commented-out enable_if documents the integral-only constraint it stands for.
template <class A, class T, class /*=typename std::enable_if<std::is_integral<T>::value, void>::type*/>
XSIMD_INLINE T hadd(batch<T, A> const& self, requires_arch<common>) noexcept
{
    alignas(A::alignment()) T lanes[batch<T, A>::size];
    self.store_aligned(lanes);

    T total = 0;
    const T* const lanes_end = lanes + batch<T, A>::size;
    for (const T* lane = lanes; lane != lanes_end; ++lane)
    {
        total += *lane;
    }
    return total;
}

// incr
template <class A, class T>
XSIMD_INLINE batch<T, A> incr(batch<T, A> const& self, requires_arch<common>) noexcept
Expand Down
2 changes: 2 additions & 0 deletions include/xsimd/arch/common/xsimd_common_details.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,8 @@ namespace xsimd
// Forward declaration: reduces a batch to a single value of type T
// (the common kernel visible in this change sums the lanes).
template <class T, class A>
XSIMD_INLINE T reduce_add(batch<T, A> const&) noexcept;
// Forward declaration: reduces a batch to a single value of type T
// (documented in the reducer index as the product of the batch elements).
template <class T, class A>
XSIMD_INLINE T reduce_mul(batch<T, A> const&) noexcept;
// Forward declaration of select for real-valued batches.
// NOTE(review): presumably picks lanes from the 2nd or 3rd argument according
// to the mask in the 1st - confirm against the definition.
template <class T, class A>
XSIMD_INLINE batch<T, A> select(batch_bool<T, A> const&, batch<T, A> const&, batch<T, A> const&) noexcept;
// Forward declaration of select for complex batches; the mask is a
// batch_bool over the underlying real type T.
template <class T, class A>
XSIMD_INLINE batch<std::complex<T>, A> select(batch_bool<T, A> const&, batch<std::complex<T>, A> const&, batch<std::complex<T>, A> const&) noexcept;
Expand Down
41 changes: 41 additions & 0 deletions include/xsimd/arch/common/xsimd_common_math.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -2103,6 +2103,19 @@ namespace xsimd
return { reduce_add(self.real()), reduce_add(self.imag()) };
}

// Architecture-independent sum of all lanes of a scalar batch.
// The batch is written to an aligned scratch array, then accumulated from the
// first lane to the last, starting from 0. The unnamed third template
// parameter has no default here; the commented-out enable_if records the
// scalar-only constraint it stands for.
template <class A, class T, class /*=typename std::enable_if<std::is_scalar<T>::value, void>::type*/>
XSIMD_INLINE T reduce_add(batch<T, A> const& self, requires_arch<common>) noexcept
{
    alignas(A::alignment()) T scratch[batch<T, A>::size];
    self.store_aligned(scratch);

    T sum = 0;
    const T* cursor = scratch;
    const T* const stop = scratch + batch<T, A>::size;
    while (cursor != stop)
    {
        sum += *cursor;
        ++cursor;
    }
    return sum;
}

namespace detail
{
template <class T, T N>
Expand Down Expand Up @@ -2147,6 +2160,34 @@ namespace xsimd
self, std::integral_constant<unsigned, batch<T, A>::size>());
}

// reduce_mul
// Architecture-independent product of all lanes of a complex batch.
// Returns a std::complex<T> obtained by multiplying the lanes in storage
// order, starting from 1.
template <class A, class T>
XSIMD_INLINE std::complex<T> reduce_mul(batch<std::complex<T>, A> const& self, requires_arch<common>) noexcept
{
    // FIXME: could do better
    // Scalar fallback: spill the batch to memory and multiply lane by lane.
    alignas(A::alignment()) std::complex<T> lanes[batch<std::complex<T>, A>::size];
    self.store_aligned(lanes);

    std::complex<T> product = 1;
    const std::complex<T>* const lanes_end = lanes + batch<std::complex<T>, A>::size;
    for (const std::complex<T>* lane = lanes; lane != lanes_end; ++lane)
    {
        product *= *lane;
    }
    return product;
}

// Architecture-independent product of all lanes of a scalar batch.
// The batch is written to an aligned scratch array and the lanes are
// multiplied together in storage order, starting from 1. The unnamed third
// template parameter has no default here; the commented-out enable_if records
// the scalar-only constraint it stands for.
template <class A, class T, class /*=typename std::enable_if<std::is_scalar<T>::value, void>::type*/>
XSIMD_INLINE T reduce_mul(batch<T, A> const& self, requires_arch<common>) noexcept
{
    alignas(A::alignment()) T scratch[batch<T, A>::size];
    self.store_aligned(scratch);

    T product = 1;
    const T* const stop = scratch + batch<T, A>::size;
    for (const T* cursor = scratch; cursor != stop; ++cursor)
    {
        product *= *cursor;
    }
    return product;
}

// remainder
template <class A>
XSIMD_INLINE batch<float, A> remainder(batch<float, A> const& self, batch<float, A> const& other, requires_arch<common>) noexcept
Expand Down
14 changes: 12 additions & 2 deletions include/xsimd/arch/xsimd_avx.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1046,7 +1046,7 @@ namespace xsimd
}

// reduce_add
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value || std::is_same<T, float>::value || std::is_same<T, double>::value, void>::type>
template <class A, class T, class = typename std::enable_if<std::is_scalar<T>::value, void>::type>
XSIMD_INLINE T reduce_add(batch<T, A> const& self, requires_arch<avx>) noexcept
{
typename batch<T, sse4_2>::register_type low, high;
Expand Down Expand Up @@ -1077,6 +1077,16 @@ namespace xsimd
return reduce_min(batch<T, sse4_2>(low));
}

// reduce_mul
// AVX product reduction for scalar lane types.
// The 256-bit register is split into two halves, the halves are multiplied
// lane-wise as sse4_2 batches, and the half-width result is reduced by the
// reduce_mul overload selected for that batch type.
template <class A, class T, class = typename std::enable_if<std::is_scalar<T>::value, void>::type>
XSIMD_INLINE T reduce_mul(batch<T, A> const& self, requires_arch<avx>) noexcept
{
    using half_batch = batch<T, sse4_2>;

    typename half_batch::register_type lower_half, upper_half;
    detail::split_avx(self, lower_half, upper_half);

    const half_batch folded = half_batch(lower_half) * half_batch(upper_half);
    return reduce_mul(folded);
}

// rsqrt
template <class A>
XSIMD_INLINE batch<float, A> rsqrt(batch<float, A> const& val, requires_arch<avx>) noexcept
Expand Down Expand Up @@ -1911,4 +1921,4 @@ namespace xsimd
}
}

#endif
#endif
10 changes: 10 additions & 0 deletions include/xsimd/arch/xsimd_avx512dq.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -188,6 +188,16 @@ namespace xsimd
return reduce_add(batch<float, avx2>(res1), avx2 {});
}

// reduce_mul
// AVX-512DQ product reduction for float lanes.
// Extracts the two 256-bit halves of the register, multiplies them lane-wise
// and hands the 8-lane result to the avx2 reduce_mul kernel.
template <class A>
XSIMD_INLINE float reduce_mul(batch<float, A> const& rhs, requires_arch<avx512dq>) noexcept
{
    const __m256 upper_half = _mm512_extractf32x8_ps(rhs, 1);
    const __m256 lower_half = _mm512_extractf32x8_ps(rhs, 0);
    const batch<float, avx2> folded(_mm256_mul_ps(upper_half, lower_half));
    return reduce_mul(folded, avx2 {});
}

// swizzle constant mask
template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3, uint32_t V4, uint32_t V5, uint32_t V6, uint32_t V7,
uint32_t V8, uint32_t V9, uint32_t V10, uint32_t V11, uint32_t V12, uint32_t V13, uint32_t V14, uint32_t V15>
Expand Down
31 changes: 31 additions & 0 deletions include/xsimd/arch/xsimd_avx512f.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1558,6 +1558,37 @@ namespace xsimd
return reduce_min(batch<T, avx2>(low));
}

// reduce_mul
// AVX-512F product reduction for float lanes: forwards directly to the
// _mm512_reduce_mul_ps intrinsic.
template <class A>
XSIMD_INLINE float reduce_mul(batch<float, A> const& rhs, requires_arch<avx512f>) noexcept
{
    const float product = _mm512_reduce_mul_ps(rhs);
    return product;
}
// AVX-512F product reduction for double lanes: forwards directly to the
// _mm512_reduce_mul_pd intrinsic.
template <class A>
XSIMD_INLINE double reduce_mul(batch<double, A> const& rhs, requires_arch<avx512f>) noexcept
{
    const double product = _mm512_reduce_mul_pd(rhs);
    return product;
}
// AVX-512F product reduction for integral lanes.
// 64-bit and 32-bit lanes use the matching _mm512_reduce_mul_epi* intrinsic.
// Any other lane width splits the register into two 256-bit halves, reduces
// each half with the avx2 kernel and multiplies the two partial products.
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
XSIMD_INLINE T reduce_mul(batch<T, A> const& self, requires_arch<avx512f>) noexcept
{
    XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
    {
        return _mm512_reduce_mul_epi64(self);
    }
    else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
    {
        return _mm512_reduce_mul_epi32(self);
    }
    else
    {
        __m256i lower_half, upper_half;
        detail::split_avx512(self, lower_half, upper_half);

        const T lower_product = reduce_mul(batch<T, avx2>(lower_half), avx2 {});
        const T upper_product = reduce_mul(batch<T, avx2>(upper_half), avx2 {});
        return lower_product * upper_product;
    }
}

// rsqrt
template <class A>
XSIMD_INLINE batch<float, A> rsqrt(batch<float, A> const& val, requires_arch<avx512f>) noexcept
Expand Down
6 changes: 4 additions & 2 deletions include/xsimd/arch/xsimd_common_fwd.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,10 @@ namespace xsimd
XSIMD_INLINE batch<T, A> sadd(batch<T, A> const& self, batch<T, A> const& other, requires_arch<common>) noexcept;
// Forward declaration of the common-arch ssub kernel; the defaulted third
// template parameter restricts it to integral T.
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
XSIMD_INLINE batch<T, A> ssub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<common>) noexcept;
// Forward declaration of the common-arch hadd kernel (batch -> single T);
// restricted to integral T.
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
XSIMD_INLINE T hadd(batch<T, A> const& self, requires_arch<common>) noexcept;
// Forward declaration of the common-arch reduce_add kernel (batch -> single T);
// accepts any scalar T.
template <class A, class T, class = typename std::enable_if<std::is_scalar<T>::value, void>::type>
XSIMD_INLINE T reduce_add(batch<T, A> const& self, requires_arch<common>) noexcept;
// Forward declaration of the common-arch reduce_mul kernel (batch -> single T);
// accepts any scalar T.
template <class A, class T, class = typename std::enable_if<std::is_scalar<T>::value, void>::type>
XSIMD_INLINE T reduce_mul(batch<T, A> const& self, requires_arch<common>) noexcept;
// Forward declarations for pack-level helpers
namespace detail
{
Expand Down
10 changes: 10 additions & 0 deletions include/xsimd/arch/xsimd_emulated.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -601,6 +601,16 @@ namespace xsimd
{ return xsimd::min(x, y); });
}

// reduce_mul
// Emulated-arch product reduction.
// Copies the lanes into a std::array and multiplies them left to right,
// seeding the running product with the first lane.
template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
XSIMD_INLINE T reduce_mul(batch<T, A> const& self, requires_arch<emulated<N>>) noexcept
{
    constexpr size_t lane_count = batch<T, A>::size;
    std::array<T, lane_count> lanes;
    self.store_unaligned(lanes.data());

    T product = lanes[0];
    for (size_t i = 1; i < lane_count; ++i)
    {
        // Convert back to T after every step, as a T-returning functor would.
        product = static_cast<T>(product * lanes[i]);
    }
    return product;
}

// rsqrt
template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
XSIMD_INLINE batch<T, A> rsqrt(batch<T, A> const& self, requires_arch<emulated<N>>) noexcept
Expand Down
11 changes: 9 additions & 2 deletions include/xsimd/arch/xsimd_neon.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1705,14 +1705,21 @@ namespace xsimd
* reduce_max *
**************/

// Using common implementation because ARM does not provide intrinsics
// for this operation

/**************
* reduce_min *
**************/

// Using common implementation because ARM does not provide intrinsics
// for this operation

/**************
* reduce_mul *
**************/

// Using common implementation because ARM does not provide intrinsics
// for this operation

/**********
Expand Down
48 changes: 47 additions & 1 deletion include/xsimd/arch/xsimd_sse2.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1290,7 +1290,7 @@ namespace xsimd
}
else
{
return hadd(self, common {});
return reduce_add(self, common {});
}
}

Expand Down Expand Up @@ -1344,6 +1344,52 @@ namespace xsimd
return first(acc3, A {});
}

// reduce_mul
template <class A>
XSIMD_INLINE float reduce_mul(batch<float, A> const& self, requires_arch<sse2>) noexcept
{
__m128 tmp0 = _mm_mul_ps(self, _mm_movehl_ps(self, self));
__m128 tmp1 = _mm_mul_ss(tmp0, _mm_shuffle_ps(tmp0, tmp0, 1));
return _mm_cvtss_f32(tmp1);
}

template <class A>
XSIMD_INLINE double reduce_mul(batch<double, A> const& self, requires_arch<sse2>) noexcept
{
return _mm_cvtsd_f64(_mm_mul_sd(self, _mm_unpackhi_pd(self, self)));
}

// SSE2 product reduction for integral lanes.
// 32-bit and 64-bit lanes are folded in-register using the batch operator*;
// every other lane width is delegated to the common kernel.
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
XSIMD_INLINE T reduce_mul(batch<T, A> const& self, requires_arch<sse2>) noexcept
{
    XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
    {
        // (v3, v2, v1, v0)
        const batch<T, A> reversed = _mm_shuffle_epi32(self, _MM_SHUFFLE(0, 1, 2, 3));
        // (v3 * v0, v2 * v1, v1 * v2, v0 * v3)
        const batch<T, A> pairwise = reversed * self;
        // Upper two pairwise products, each duplicated.
        const batch<T, A> upper_pairs = _mm_unpackhi_epi32(pairwise, pairwise);
        // Lane 0 now holds the product of all four lanes.
        const batch<T, A> folded = upper_pairs * pairwise;
        return _mm_cvtsi128_si32(folded);
    }
    else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
    {
        // (v1, v1)
        const batch<T, A> high_lane = _mm_unpackhi_epi64(self, self);
        // Lane 0 holds v1 * v0.
        const batch<T, A> folded = high_lane * self;
#if defined(__x86_64__)
        return _mm_cvtsi128_si64(folded);
#else
        // No 64-bit extract on this target: store the low 64 bits and copy
        // them out through memcpy.
        __m128i spill;
        _mm_storel_epi64(&spill, folded);
        int64_t low_lane;
        std::memcpy(&low_lane, &spill, sizeof(low_lane));
        return low_lane;
#endif
    }
    else
    {
        return reduce_mul(self, common {});
    }
}

// rsqrt
template <class A>
XSIMD_INLINE batch<float, A> rsqrt(batch<float, A> const& val, requires_arch<sse2>) noexcept
Expand Down
9 changes: 9 additions & 0 deletions include/xsimd/arch/xsimd_sse3.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,15 @@ namespace xsimd
return _mm_cvtss_f32(tmp1);
}

// reduce_mul
template <class A>
XSIMD_INLINE float reduce_mul(batch<float, A> const& self, requires_arch<sse3>) noexcept
{
__m128 tmp1 = _mm_mul_ps(self, _mm_movehl_ps(self, self));
__m128 tmp2 = _mm_mul_ps(tmp1, _mm_movehdup_ps(tmp1));
return _mm_cvtss_f32(tmp2);
}

}

}
Expand Down
44 changes: 43 additions & 1 deletion include/xsimd/arch/xsimd_vsx.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -559,7 +559,49 @@ namespace xsimd
// VSX sum reduction for any scalar lane type.
// Forwards to the architecture-independent reduce_add kernel, whose
// constraint (any scalar T) matches this overload's. The integral-only hadd
// kernel is not used because this overload also accepts non-integral T.
template <class A, class T, class = typename std::enable_if<std::is_scalar<T>::value, void>::type>
XSIMD_INLINE T reduce_add(batch<T, A> const& self, requires_arch<vsx>) noexcept
{
    return reduce_add(self, common {});
}

// reduce_mul
// VSX product reduction for `signed` lanes (v0, v1, v2, v3).
template <class A>
XSIMD_INLINE signed reduce_mul(batch<signed, A> const& self, requires_arch<vsx>) noexcept
{
    // v3, v2, v1, v0
    auto reversed = vec_reve(self.data);
    // v0 * v3, v1 * v2, v2 * v1, v3 * v0
    auto pairwise = vec_mul(self.data, reversed);
    // v2 * v1, v2 * v1, v3 * v0, v3 * v0
    auto merged = vec_mergel(pairwise, pairwise);
    // Lane 0 holds the product of all four lanes.
    auto folded = vec_mul(pairwise, merged);
    return vec_extract(folded, 0);
}
template <class A>
XSIMD_INLINE unsigned reduce_mul(batch<unsigned, A> const& self, requires_arch<vsx>) noexcept
{
auto tmp0 = vec_reve(self.data); // v3, v2, v1, v0
auto tmp1 = vec_mul(self.data, tmp0); // v0 * v3, v1 * v2, v2 * v1, v3 * v0
auto tmp2 = vec_mergel(tmp1, tmp1); // v2 * v1, v2 * v1, v3 * v0, v3 * v0
auto tmp3 = vec_mul(tmp1, tmp2);
return vec_extract(tmp3, 0);
}
template <class A>
XSIMD_INLINE float reduce_mul(batch<float, A> const& self, requires_arch<vsx>) noexcept
{
// FIXME: find an in-order approach
auto tmp0 = vec_reve(self.data); // v3, v2, v1, v0
auto tmp1 = vec_mul(self.data, tmp0); // v0 * v3, v1 * v2, v2 * v1, v3 * v0
auto tmp2 = vec_mergel(tmp1, tmp1); // v2 * v1, v2 * v1, v3 * v0, v3 * v0
auto tmp3 = vec_mul(tmp1, tmp2);
return vec_extract(tmp3, 0);
}
template <class A>
XSIMD_INLINE double reduce_mul(batch<double, A> const& self, requires_arch<vsx>) noexcept
{
auto tmp0 = vec_reve(self.data); // v1, v0
auto tmp1 = vec_mul(self.data, tmp0); // v0 * v1, v1 * v0
return vec_extract(tmp1, 0);
}
// VSX product reduction for any scalar lane type without a dedicated overload:
// delegates to the architecture-independent kernel.
template <class A, class T, class = typename std::enable_if<std::is_scalar<T>::value, void>::type>
XSIMD_INLINE T reduce_mul(batch<T, A> const& self, requires_arch<vsx>) noexcept
{
    const T product = reduce_mul(self, common {});
    return product;
}

// round
Expand Down
Loading