Skip to content

Commit 263b4e8

Browse files
Provide generic and specialized implementations of reduce_mul
This is a generalization of #1132 by @emrys53. Part of the Intel code is strongly inspired by the work from #1132, with some minor nits.
1 parent deed07a commit 263b4e8

16 files changed

+262
-4
lines changed

docs/source/api/reducer_index.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,8 @@ Reduction operators
3838
+---------------------------------------+----------------------------------------------------+
3939
| :cpp:func:`reduce_min` | min of the batch elements |
4040
+---------------------------------------+----------------------------------------------------+
41+
| :cpp:func:`reduce_mul` | product of the batch elements |
42+
+---------------------------------------+----------------------------------------------------+
4143
| :cpp:func:`haddp` | horizontal sum across batches |
4244
+---------------------------------------+----------------------------------------------------+
4345

include/xsimd/arch/common/xsimd_common_details.hpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,8 @@ namespace xsimd
7777
template <class T, class A>
7878
XSIMD_INLINE T reduce_add(batch<T, A> const&) noexcept;
7979
// Forward declaration of reduce_mul for a batch; no definition is visible in this hunk.
template <class T, class A>
80+
XSIMD_INLINE T reduce_mul(batch<T, A> const&) noexcept;
81+
template <class T, class A>
8082
XSIMD_INLINE batch<T, A> select(batch_bool<T, A> const&, batch<T, A> const&, batch<T, A> const&) noexcept;
8183
template <class T, class A>
8284
XSIMD_INLINE batch<std::complex<T>, A> select(batch_bool<T, A> const&, batch<std::complex<T>, A> const&, batch<std::complex<T>, A> const&) noexcept;

include/xsimd/arch/common/xsimd_common_math.hpp

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2116,7 +2116,6 @@ namespace xsimd
21162116
return res;
21172117
}
21182118

2119-
21202119
namespace detail
21212120
{
21222121
template <class T, T N>
@@ -2161,6 +2160,34 @@ namespace xsimd
21612160
self, std::integral_constant<unsigned, batch<T, A>::size>());
21622161
}
21632162

2163+
// reduce_mul
2164+
template <class A, class T>
2165+
XSIMD_INLINE std::complex<T> reduce_mul(batch<std::complex<T>, A> const& self, requires_arch<common>) noexcept
2166+
{
2167+
// FIXME: could do better
2168+
alignas(A::alignment()) std::complex<T> buffer[batch<std::complex<T>, A>::size];
2169+
self.store_aligned(buffer);
2170+
std::complex<T> res = 1;
2171+
for (auto val : buffer)
2172+
{
2173+
res *= val;
2174+
}
2175+
return res;
2176+
}
2177+
2178+
template <class A, class T, class /*=typename std::enable_if<std::is_scalar<T>::value, void>::type*/>
2179+
XSIMD_INLINE T reduce_mul(batch<T, A> const& self, requires_arch<common>) noexcept
2180+
{
2181+
alignas(A::alignment()) T buffer[batch<T, A>::size];
2182+
self.store_aligned(buffer);
2183+
T res = 1;
2184+
for (T val : buffer)
2185+
{
2186+
res *= val;
2187+
}
2188+
return res;
2189+
}
2190+
21642191
// remainder
21652192
template <class A>
21662193
XSIMD_INLINE batch<float, A> remainder(batch<float, A> const& self, batch<float, A> const& other, requires_arch<common>) noexcept

include/xsimd/arch/xsimd_avx.hpp

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1077,6 +1077,16 @@ namespace xsimd
10771077
return reduce_min(batch<T, sse4_2>(low));
10781078
}
10791079

1080+
// reduce_mul
1081+
template <class A, class T, class = typename std::enable_if<std::is_scalar<T>::value, void>::type>
1082+
XSIMD_INLINE T reduce_mul(batch<T, A> const& self, requires_arch<avx>) noexcept
1083+
{
1084+
typename batch<T, sse4_2>::register_type low, high;
1085+
detail::split_avx(self, low, high);
1086+
batch<T, sse4_2> blow(low), bhigh(high);
1087+
return reduce_mul(blow * bhigh);
1088+
}
1089+
10801090
// rsqrt
10811091
template <class A>
10821092
XSIMD_INLINE batch<float, A> rsqrt(batch<float, A> const& val, requires_arch<avx>) noexcept
@@ -1911,4 +1921,4 @@ namespace xsimd
19111921
}
19121922
}
19131923

1914-
#endif
1924+
#endif

include/xsimd/arch/xsimd_avx512dq.hpp

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -188,6 +188,16 @@ namespace xsimd
188188
return reduce_add(batch<float, avx2>(res1), avx2 {});
189189
}
190190

191+
// reduce_mul
192+
template <class A>
193+
XSIMD_INLINE float reduce_mul(batch<float, A> const& rhs, requires_arch<avx512dq>) noexcept
194+
{
195+
__m256 tmp1 = _mm512_extractf32x8_ps(rhs, 1);
196+
__m256 tmp2 = _mm512_extractf32x8_ps(rhs, 0);
197+
__m256 res1 = _mm256_mul_ps(tmp1, tmp2);
198+
return reduce_mul(batch<float, avx2>(res1), avx2 {});
199+
}
200+
191201
// swizzle constant mask
192202
template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3, uint32_t V4, uint32_t V5, uint32_t V6, uint32_t V7,
193203
uint32_t V8, uint32_t V9, uint32_t V10, uint32_t V11, uint32_t V12, uint32_t V13, uint32_t V14, uint32_t V15>

include/xsimd/arch/xsimd_avx512f.hpp

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1558,6 +1558,37 @@ namespace xsimd
15581558
return reduce_min(batch<T, avx2>(low));
15591559
}
15601560

1561+
// reduce_mul
1562+
template <class A>
1563+
XSIMD_INLINE float reduce_mul(batch<float, A> const& rhs, requires_arch<avx512f>) noexcept
1564+
{
1565+
return _mm512_reduce_mul_ps(rhs);
1566+
}
1567+
template <class A>
1568+
XSIMD_INLINE double reduce_mul(batch<double, A> const& rhs, requires_arch<avx512f>) noexcept
1569+
{
1570+
return _mm512_reduce_mul_pd(rhs);
1571+
}
1572+
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1573+
XSIMD_INLINE T reduce_mul(batch<T, A> const& self, requires_arch<avx512f>) noexcept
1574+
{
1575+
XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
1576+
{
1577+
return _mm512_reduce_mul_epi32(self);
1578+
}
1579+
else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
1580+
{
1581+
return _mm512_reduce_mul_epi64(self);
1582+
}
1583+
else
1584+
{
1585+
__m256i low, high;
1586+
detail::split_avx512(self, low, high);
1587+
batch<T, avx2> blow(low), bhigh(high);
1588+
return reduce_mul(blow, avx2 {}) * reduce_mul(bhigh, avx2 {});
1589+
}
1590+
}
1591+
15611592
// rsqrt
15621593
template <class A>
15631594
XSIMD_INLINE batch<float, A> rsqrt(batch<float, A> const& val, requires_arch<avx512f>) noexcept

include/xsimd/arch/xsimd_common_fwd.hpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,8 @@ namespace xsimd
3838
XSIMD_INLINE batch<T, A> ssub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<common>) noexcept;
3939
template <class A, class T, class = typename std::enable_if<std::is_scalar<T>::value, void>::type>
4040
XSIMD_INLINE T reduce_add(batch<T, A> const& self, requires_arch<common>) noexcept;
41+
// Forward declaration of the common-arch reduce_mul overload, constrained to scalar T.
template <class A, class T, class = typename std::enable_if<std::is_scalar<T>::value, void>::type>
42+
XSIMD_INLINE T reduce_mul(batch<T, A> const& self, requires_arch<common>) noexcept;
4143
// Forward declarations for pack-level helpers
4244
namespace detail
4345
{

include/xsimd/arch/xsimd_emulated.hpp

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -601,6 +601,16 @@ namespace xsimd
601601
{ return xsimd::min(x, y); });
602602
}
603603

604+
// reduce_mul
605+
// Emulated product reduction: copy the lanes into a std::array, then fold
// lanes 1..size-1 into lane 0 with std::multiplies<T>, in lane order.
template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
XSIMD_INLINE T reduce_mul(batch<T, A> const& self, requires_arch<emulated<N>>) noexcept
{
    constexpr size_t lane_count = batch<T, A>::size;
    std::array<T, lane_count> lanes;
    self.store_unaligned(lanes.data());

    T product = lanes[0];
    for (size_t i = 1; i < lane_count; ++i)
    {
        product = std::multiplies<T>()(product, lanes[i]);
    }
    return product;
}
613+
604614
// rsqrt
605615
template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
606616
XSIMD_INLINE batch<T, A> rsqrt(batch<T, A> const& self, requires_arch<emulated<N>>) noexcept

include/xsimd/arch/xsimd_neon.hpp

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1705,14 +1705,21 @@ namespace xsimd
17051705
* reduce_max *
17061706
**************/
17071707

1708-
// Using common implementation because ARM doe snot provide intrinsics
1708+
// Using common implementation because ARM does not provide intrinsics
17091709
// for this operation
17101710

17111711
/**************
17121712
* reduce_min *
17131713
**************/
17141714

1715-
// Using common implementation because ARM doe snot provide intrinsics
1715+
// Using common implementation because ARM does not provide intrinsics
1716+
// for this operation
1717+
1718+
/**************
1719+
* reduce_mul *
1720+
**************/
1721+
1722+
// Using common implementation because ARM does not provide intrinsics
17161723
// for this operation
17171724

17181725
/**********

include/xsimd/arch/xsimd_sse2.hpp

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1344,6 +1344,44 @@ namespace xsimd
13441344
return first(acc3, A {});
13451345
}
13461346

1347+
// reduce_mul
1348+
template <class A>
1349+
XSIMD_INLINE float reduce_mul(batch<float, A> const& self, requires_arch<sse2>) noexcept
1350+
{
1351+
__m128 tmp0 = _mm_mul_ps(self, _mm_movehl_ps(self, self));
1352+
__m128 tmp1 = _mm_mul_ss(tmp0, _mm_shuffle_ps(tmp0, tmp0, 1));
1353+
return _mm_cvtss_f32(tmp1);
1354+
}
1355+
1356+
template <class A>
1357+
XSIMD_INLINE double reduce_mul(batch<double, A> const& self, requires_arch<sse2>) noexcept
1358+
{
1359+
return _mm_cvtsd_f64(_mm_mul_sd(self, _mm_unpackhi_pd(self, self)));
1360+
}
1361+
1362+
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1363+
XSIMD_INLINE T reduce_mul(batch<T, A> const& self, requires_arch<sse2>) noexcept
1364+
{
1365+
XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
1366+
{
1367+
batch<T, A> tmp1 = _mm_shuffle_epi32(self, _MM_SHUFFLE(0, 1, 2, 3));
1368+
tmp1 = tmp1 * self;
1369+
batch<T, A> tmp2 = _mm_unpackhi_epi32(tmp1, tmp1);
1370+
tmp2 = tmp2 * tmp1;
1371+
return _mm_cvtsi128_si32(tmp2);
1372+
}
1373+
else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
1374+
{
1375+
batch<T, A> tmp1 = _mm_unpackhi_epi64(self, self);
1376+
tmp1 = tmp1 * self;
1377+
return _mm_cvtsi128_si64(tmp1);
1378+
}
1379+
else
1380+
{
1381+
return reduce_mul(self, common {});
1382+
}
1383+
}
1384+
13471385
// rsqrt
13481386
template <class A>
13491387
XSIMD_INLINE batch<float, A> rsqrt(batch<float, A> const& val, requires_arch<sse2>) noexcept

0 commit comments

Comments
 (0)