TR-mbed 1.0
Loading...
Searching...
No Matches
PacketMath.h
Go to the documentation of this file.
1// This file is part of Eigen, a lightweight C++ template library
2// for linear algebra.
3//
4// Copyright (C) 2016 Benoit Steiner (benoit.steiner.goog@gmail.com)
5//
6// This Source Code Form is subject to the terms of the Mozilla
7// Public License v. 2.0. If a copy of the MPL was not distributed
8// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
9
10#ifndef EIGEN_PACKET_MATH_AVX512_H
11#define EIGEN_PACKET_MATH_AVX512_H
12
13namespace Eigen {
14
15namespace internal {
16
// Architecture defaults for this backend. Each value is only defined when the
// including code has not already provided its own definition.

// Default cache-friendly product threshold: 8.
17#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
18#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
19#endif
20
// Default number of registers assumed for this architecture: 32
// (presumably the 32 ZMM registers of AVX-512 -- TODO confirm).
21#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
22#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32
23#endif
24
// When FMA vectorization is enabled, advertise that a multiply-add is
// available as a single instruction (see the pmadd specializations that are
// guarded by EIGEN_VECTORIZE_FMA further down in this file).
25#ifdef EIGEN_VECTORIZE_FMA
26#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
27#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
28#endif
29#endif
30
36
37template <>
39 enum { value = true };
40};
41template <>
43 enum { value = true };
44};
45template <>
47 enum { value = true };
48};
49
// Flags Packet16h (the packet type used for the Eigen::half routines later in
// this file) as an arithmetic type by setting value = true.
50template<> struct is_arithmetic<Packet16h> { enum { value = true }; };
51
52template <>
94
// Packet traits for float: names the packet types used for float and lists
// which packet operations are advertised as available.
// NOTE(review): the embedded line numbers jump inside the enum (107 -> 109,
// 109 -> 112, 118 -> 123), so some enumerators are not visible in this listing.
95template<> struct packet_traits<float> : default_packet_traits
96{
// Full-width packet type for float.
97 typedef Packet16f type;
// Half-width packet type for float.
98 typedef Packet8f half;
99 enum {
100 Vectorizable = 1,
101 AlignedOnScalar = 1,
// Number of float coefficients per packet.
102 size = 16,
103 HasHalfPacket = 1,
104
105 HasAbs = 1,
106 HasMin = 1,
107 HasMax = 1,
// Blend is reported as unsupported.
109 HasBlend = 0,
// The math-function flags below are only advertised when
// EIGEN_GNUC_AT_LEAST(5, 3) holds or the compiler is not strict GCC.
112#if EIGEN_GNUC_AT_LEAST(5, 3) || (!EIGEN_COMP_GNUC_STRICT)
113 HasLog = 1,
114 HasLog1p = 1,
115 HasExpm1 = 1,
116 HasNdtri = 1,
117 HasBessel = 1,
118 HasExp = 1,
123#endif
124 HasCmp = 1,
125 HasDiv = 1,
126 HasRound = 1,
127 HasFloor = 1,
128 HasCeil = 1,
129 HasRint = 1
130 };
131 };
// Packet traits for double: names the packet types used for double and lists
// which packet operations are advertised as available.
// NOTE(review): the embedded line numbers jump inside the enum (143 -> 146),
// so some enumerators are not visible in this listing.
132template<> struct packet_traits<double> : default_packet_traits
133{
// Full-width packet type for double.
134 typedef Packet8d type;
// Half-width packet type for double.
135 typedef Packet4d half;
136 enum {
137 Vectorizable = 1,
138 AlignedOnScalar = 1,
// Number of double coefficients per packet.
139 size = 8,
140 HasHalfPacket = 1,
// HasLog/HasExp are only advertised when EIGEN_GNUC_AT_LEAST(5, 3) holds or
// the compiler is not strict GCC.
141#if EIGEN_GNUC_AT_LEAST(5, 3) || (!EIGEN_COMP_GNUC_STRICT)
142 HasLog = 1,
143 HasExp = 1,
146#endif
147 HasCmp = 1,
148 HasDiv = 1,
149 HasRound = 1,
150 HasFloor = 1,
151 HasCeil = 1,
152 HasRint = 1
153 };
154};
155
156/* TODO Implement AVX512 for integers
157template<> struct packet_traits<int> : default_packet_traits
158{
159 typedef Packet16i type;
160 enum {
161 Vectorizable = 1,
162 AlignedOnScalar = 1,
163 size=16
164 };
165};
166*/
167
168template <>
170 typedef float type;
171 typedef Packet8f half;
173 typedef uint16_t mask_t;
175};
176template <>
182template <>
188
189template<>
195
196template <>
200template <>
204template <>
208
209template <>
213
214template <>
218
// Returns a Packet16f with every lane set to zero (_mm512_setzero_ps).
// The argument is unnamed and ignored.
219template<> EIGEN_STRONG_INLINE Packet16f pzero(const Packet16f& /*a*/) { return _mm512_setzero_ps(); }
// Returns a Packet8d with every lane set to zero (_mm512_setzero_pd).
// The argument is unnamed and ignored.
220template<> EIGEN_STRONG_INLINE Packet8d pzero(const Packet8d& /*a*/) { return _mm512_setzero_pd(); }
222
224 return _mm512_castsi512_ps(_mm512_set_epi32(0, -1, 0, -1, 0, -1, 0, -1,
225 0, -1, 0, -1, 0, -1, 0, -1));
226}
228 return _mm512_set_epi32(0, -1, 0, -1, 0, -1, 0, -1,
229 0, -1, 0, -1, 0, -1, 0, -1);
230}
232 return _mm512_castsi512_pd(_mm512_set_epi32(0, 0, -1, -1, 0, 0, -1, -1,
233 0, 0, -1, -1, 0, 0, -1, -1));
234}
235
236template <>
240template <>
244
245template <>
247 return _mm512_add_ps(
249 _mm512_set_ps(15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f,
250 4.0f, 3.0f, 2.0f, 1.0f, 0.0f));
251}
252template <>
255 _mm512_set_pd(7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, 0.0));
256}
257
258template <>
263template <>
268template <>
273
274template <>
279template <>
284template <>
289
290template <>
294template <>
298
299template <>
301 return a;
302}
303template <>
305 return a;
306}
307template <>
309 return a;
310}
311
312template <>
317template <>
322template <>
327
328template <>
333template <>
338
339#ifdef EIGEN_VECTORIZE_FMA
340template <>
342 const Packet16f& c) {
343 return _mm512_fmadd_ps(a, b, c);
344}
345template <>
347 const Packet8d& c) {
348 return _mm512_fmadd_pd(a, b, c);
349}
350#endif
351
352template <>
360
361template <>
369
370template <>
372 const Packet16f& b) {
373 // Arguments are reversed to match NaN propagation behavior of std::min.
374 return _mm512_min_ps(b, a);
375}
376template <>
378 const Packet8d& b) {
379 // Arguments are reversed to match NaN propagation behavior of std::min.
380 return _mm512_min_pd(b, a);
381}
382
383template <>
385 const Packet16f& b) {
386 // Arguments are reversed to match NaN propagation behavior of std::max.
387 return _mm512_max_ps(b, a);
388}
389template <>
391 const Packet8d& b) {
392 // Arguments are reversed to match NaN propagation behavior of std::max.
393 return _mm512_max_pd(b, a);
394}
395
396// Add specializations for min/max with prescribed NaN propagation.
397template<>
401template<>
405template<>
409template<>
413template<>
417template<>
421template<>
425template<>
429
430
431#ifdef EIGEN_VECTORIZE_AVX512DQ
// AVX512DQ path: returns the I_-th 256-bit part (8 floats) of x via
// _mm512_extractf32x8_ps. I_ is a compile-time index passed straight to the
// intrinsic.
432template<int I_> EIGEN_STRONG_INLINE Packet8f extract256(Packet16f x) { return _mm512_extractf32x8_ps(x,I_); }
// AVX512DQ path: returns the I_-th 128-bit part (2 doubles) of x via
// _mm512_extractf64x2_pd. I_ is a compile-time index passed straight to the
// intrinsic.
433template<int I_> EIGEN_STRONG_INLINE Packet2d extract128(Packet8d x) { return _mm512_extractf64x2_pd(x,I_); }
// AVX512DQ path: concatenates two 8-float packets into one 16-float packet.
// `a` is widened to 512 bits and forms the lower half; `b` is inserted at
// 256-bit position 1, i.e. the upper half.
434EIGEN_STRONG_INLINE Packet16f cat256(Packet8f a, Packet8f b) { return _mm512_insertf32x8(_mm512_castps256_ps512(a),b,1); }
435#else
436// AVX512F does not define _mm512_extractf32x8_ps to extract __m256 from __m512
440
441// AVX512F does not define _mm512_extractf64x2_pd to extract __m128d from __m512d
445
450#endif
451
452// Helper function for bit packing snippet of low precision comparison.
453// It packs the flags from 32x16 to 16x16.
455 // Split data into small pieces and handle with AVX instructions
456 // to guarantee internal order of vector.
457 // Operation:
458 // dst[15:0] := Saturate16(rf[31:0])
459 // dst[31:16] := Saturate16(rf[63:32])
460 // ...
461 // dst[255:240] := Saturate16(rf[255:224])
469}
470
471template <>
479 return _mm512_castsi512_ps(
480 _mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffffu));
481}
482
485 return _mm512_castsi512_ps(
486 _mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffffu));
487}
488
494
497 return _mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffffu);
498}
499
500
501template <>
504 return _mm512_castsi512_pd(
505 _mm512_mask_set1_epi64(_mm512_set1_epi64(0), mask, 0xffffffffffffffffu));
506}
507template <>
510 return _mm512_castsi512_pd(
511 _mm512_mask_set1_epi64(_mm512_set1_epi64(0), mask, 0xffffffffffffffffu));
512}
513template <>
516 return _mm512_castsi512_pd(
517 _mm512_mask_set1_epi64(_mm512_set1_epi64(0), mask, 0xffffffffffffffffu));
518}
519template <>
525
528
531
534
535template <>
537 return _mm512_set1_epi32(0xffffffffu);
538}
539
540template <>
544
545template <>
549
550template <>
555
556template <>
558 const Packet16f& b) {
559#ifdef EIGEN_VECTORIZE_AVX512DQ
560 return _mm512_and_ps(a, b);
561#else
563#endif
564}
565template <>
581
582template <>
586
587template <>
589#ifdef EIGEN_VECTORIZE_AVX512DQ
590 return _mm512_or_ps(a, b);
591#else
593#endif
594}
595
596template <>
598 const Packet8d& b) {
599#ifdef EIGEN_VECTORIZE_AVX512DQ
600 return _mm512_or_pd(a, b);
601#else
603#endif
604}
605
606template <>
610
611template <>
613#ifdef EIGEN_VECTORIZE_AVX512DQ
614 return _mm512_xor_ps(a, b);
615#else
617#endif
618}
619
620template <>
622#ifdef EIGEN_VECTORIZE_AVX512DQ
623 return _mm512_xor_pd(a, b);
624#else
626#endif
627}
628
629template <>
633
634template <>
636#ifdef EIGEN_VECTORIZE_AVX512DQ
637 return _mm512_andnot_ps(b, a);
638#else
640#endif
641}
642template <>
644#ifdef EIGEN_VECTORIZE_AVX512DQ
645 return _mm512_andnot_pd(b, a);
646#else
648#endif
649}
650
652{
653 // Work-around for default std::round rounding mode.
654 const Packet16f mask = pset1frombits<Packet16f>(static_cast<numext::uint32_t>(0x80000000u));
655 const Packet16f prev0dot5 = pset1frombits<Packet16f>(static_cast<numext::uint32_t>(0x3EFFFFFFu));
657}
659{
660 // Work-around for default std::round rounding mode.
661 const Packet8d mask = pset1frombits<Packet8d>(static_cast<numext::uint64_t>(0x8000000000000000ull));
662 const Packet8d prev0dot5 = pset1frombits<Packet8d>(static_cast<numext::uint64_t>(0x3FDFFFFFFFFFFFFFull));
664}
665
669
673
677
678template <>
682template <>
686template <>
689 reinterpret_cast<const __m512i*>(from));
690}
691
692template <>
696template <>
700template <>
705
706template <>
711
712// Loads 8 floats from memory and returns the packet
713// {a0, a0, a1, a1, a2, a2, a3, a3, a4, a4, a5, a5, a6, a6, a7, a7}
714template <>
716 // an unaligned load is required here as there is no requirement
717 // on the alignment of input pointer 'from'
718 __m256i low_half = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(from));
721 return pairs;
722}
723
724#ifdef EIGEN_VECTORIZE_AVX512DQ
725// FIXME: this does not look optimal; it would be better to load a Packet4d and shuffle...
726// Loads 4 doubles from memory and returns the packet {a0, a0, a1, a1, a2, a2, a3,
727// a3}
728template <>
730 __m512d x = _mm512_setzero_pd();
731 x = _mm512_insertf64x2(x, _mm_loaddup_pd(&from[0]), 0);
732 x = _mm512_insertf64x2(x, _mm_loaddup_pd(&from[1]), 1);
733 x = _mm512_insertf64x2(x, _mm_loaddup_pd(&from[2]), 2);
734 x = _mm512_insertf64x2(x, _mm_loaddup_pd(&from[3]), 3);
735 return x;
736}
737#else
738template <>
747#endif
748
749// Loads 4 floats from memory and returns the packet
750// {a0, a0, a0, a0, a1, a1, a1, a1, a2, a2, a2, a2, a3, a3, a3, a3}
751template <>
757
758// Loads 2 doubles from memory and returns the packet
759// {a0, a0, a0, a0, a1, a1, a1, a1}
760template <>
768
769template <>
773template <>
777template <>
782
783template <>
787template <>
791template <>
796template <>
797EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet16f& from, uint16_t umask) {
798 __mmask16 mask = static_cast<__mmask16>(umask);
800}
801
802template <>
804 Index stride) {
807 _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
809
810 return _mm512_i32gather_ps(indices, from, 4);
811}
812template <>
821
822template <>
824 const Packet16f& from,
825 Index stride) {
828 _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
830 _mm512_i32scatter_ps(to, indices, from, 4);
831}
832template <>
841
842template <>
843EIGEN_STRONG_INLINE void pstore1<Packet16f>(float* to, const float& a) {
845 pstore(to, pa);
846}
847template <>
848EIGEN_STRONG_INLINE void pstore1<Packet8d>(double* to, const double& a) {
850 pstore(to, pa);
851}
852template <>
855 pstore(to, pa);
856}
857
861
862template <>
866template <>
870template <>
874
876{
877 return _mm512_permutexvar_ps(_mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15), a);
878}
879
881{
882 return _mm512_permutexvar_pd(_mm512_set_epi32(0, 0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7), a);
883}
884
886{
887 // _mm512_abs_ps intrinsic not found, so hack around it
889}
890template <>
892 // _mm512_abs_ps intrinsic not found, so hack around it
894 _mm512_set1_epi64(0x7fffffffffffffff)));
895}
896
897template<>
901
902// Extract exponent without existence of Packet8l.
903template<>
906 const Packet8d cst_exp_mask = pset1frombits<Packet8d>(static_cast<uint64_t>(0x7ff0000000000000ull));
907 #ifdef EIGEN_VECTORIZE_AVX512DQ
909 #else
911 #endif
912}
913
914template<>
918
922
924 // Clamp exponent to [-2099, 2099]
925 const Packet8d max_exponent = pset1<Packet8d>(2099.0);
926 const Packet8i e = _mm512_cvtpd_epi32(pmin(pmax(exponent, pnegate(max_exponent)), max_exponent));
927
928 // Split 2^e into four factors and multiply.
929 const Packet8i bias = pset1<Packet8i>(1023);
930 Packet8i b = parithmetic_shift_right<2>(e); // floor(e/4)
931
932 // 2^b
933 const Packet8i permute_idx = _mm256_setr_epi32(0, 4, 1, 5, 2, 6, 3, 7);
938 Packet8d out = pmul(pmul(pmul(a, c), c), c); // a * 2^(3b)
939
940 // 2^(e - 3b)
941 b = psub(psub(psub(e, b), b), b); // e - 3b
943 lo = _mm256_slli_epi64(hi, 52);
946 out = pmul(out, c); // a * 2^e
947 return out;
948}
949
// EIGEN_EXTRACT_8f_FROM_16f(INPUT, OUTPUT)
// Declares two __m256 locals named OUTPUT_0 and OUTPUT_1 (token-pasted from
// OUTPUT) holding the lower and upper 8 floats of the __m512 value INPUT.
// NOTE(review): the AVX512DQ variant does not end with ';' while the fallback
// variant does, so the expansion differs by one trailing semicolon.
950#ifdef EIGEN_VECTORIZE_AVX512DQ
951// AVX512F alone does not define _mm512_extractf32x8_ps (used here when AVX512DQ is enabled); the #else branch assembles each __m256 from 128-bit pieces instead.
952#define EIGEN_EXTRACT_8f_FROM_16f(INPUT, OUTPUT) \
953 __m256 OUTPUT##_0 = _mm512_extractf32x8_ps(INPUT, 0); \
954 __m256 OUTPUT##_1 = _mm512_extractf32x8_ps(INPUT, 1)
955#else
// Fallback: OUTPUT_0 is built from 128-bit parts 0 and 1 of INPUT, OUTPUT_1
// from parts 2 and 3, using _mm512_extractf32x4_ps + _mm256_insertf128_ps.
956#define EIGEN_EXTRACT_8f_FROM_16f(INPUT, OUTPUT) \
957 __m256 OUTPUT##_0 = _mm256_insertf128_ps( \
958 _mm256_castps128_ps256(_mm512_extractf32x4_ps(INPUT, 0)), \
959 _mm512_extractf32x4_ps(INPUT, 1), 1); \
960 __m256 OUTPUT##_1 = _mm256_insertf128_ps( \
961 _mm256_castps128_ps256(_mm512_extractf32x4_ps(INPUT, 2)), \
962 _mm512_extractf32x4_ps(INPUT, 3), 1);
963#endif
964
// EIGEN_INSERT_8f_INTO_16f(OUTPUT, INPUTA, INPUTB)
// Assigns to OUTPUT a __m512 whose lower 8 floats come from INPUTA and whose
// upper 8 floats come from INPUTB.
// NOTE(review): both variants already end with ';', and the fallback expands
// to several statements, so the macro must not be used as the sole body of an
// unbraced if/for.
965#ifdef EIGEN_VECTORIZE_AVX512DQ
// AVX512DQ: widen INPUTA to 512 bits and insert INPUTB at 256-bit position 1.
966#define EIGEN_INSERT_8f_INTO_16f(OUTPUT, INPUTA, INPUTB) \
967 OUTPUT = _mm512_insertf32x8(_mm512_castps256_ps512(INPUTA), INPUTB, 1);
968#else
// Fallback: start from an undefined vector and fill all four 128-bit parts
// (two from INPUTA, then two from INPUTB), so no undefined lane remains.
969#define EIGEN_INSERT_8f_INTO_16f(OUTPUT, INPUTA, INPUTB) \
970 OUTPUT = _mm512_undefined_ps(); \
971 OUTPUT = _mm512_insertf32x4(OUTPUT, _mm256_extractf128_ps(INPUTA, 0), 0); \
972 OUTPUT = _mm512_insertf32x4(OUTPUT, _mm256_extractf128_ps(INPUTA, 1), 1); \
973 OUTPUT = _mm512_insertf32x4(OUTPUT, _mm256_extractf128_ps(INPUTB, 0), 2); \
974 OUTPUT = _mm512_insertf32x4(OUTPUT, _mm256_extractf128_ps(INPUTB, 1), 3);
975#endif
976
977template <>
979#ifdef EIGEN_VECTORIZE_AVX512DQ
983 return predux<Packet8f>(x);
984#else
990 sum = _mm_hadd_ps(sum, sum);
991 sum = _mm_hadd_ps(sum, _mm_permute_ps(sum, 1));
992 return _mm_cvtss_f32(sum);
993#endif
994}
995template <>
1003
1004template <>
1020template <>
1026
1027template <>
1029//#ifdef EIGEN_VECTORIZE_AVX512DQ
1030#if 0
1035 res = pmul(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 3, 2)));
1036 return pfirst(pmul(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 0, 1))));
1037#else
1043 res = pmul(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 3, 2)));
1044 return pfirst(pmul(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 0, 1))));
1045#endif
1046}
1047template <>
1055
1056template <>
1066template <>
1074
1075template <>
1085
1086template <>
1094
1101
1102
1103
// PACK_OUTPUT(OUTPUT, INPUT, INDEX, STRIDE)
// Writes OUTPUT[INDEX] as the 16-float combination of INPUT[INDEX] (lower
// half) and INPUT[INDEX + STRIDE] (upper half), via EIGEN_INSERT_8f_INTO_16f.
// The expansion already carries its own trailing ';'.
1104#define PACK_OUTPUT(OUTPUT, INPUT, INDEX, STRIDE) \
1105 EIGEN_INSERT_8f_INTO_16f(OUTPUT[INDEX], INPUT[INDEX], INPUT[INDEX + STRIDE]);
1106
1108 __m512 T0 = _mm512_unpacklo_ps(kernel.packet[0], kernel.packet[1]);
1109 __m512 T1 = _mm512_unpackhi_ps(kernel.packet[0], kernel.packet[1]);
1110 __m512 T2 = _mm512_unpacklo_ps(kernel.packet[2], kernel.packet[3]);
1111 __m512 T3 = _mm512_unpackhi_ps(kernel.packet[2], kernel.packet[3]);
1112 __m512 T4 = _mm512_unpacklo_ps(kernel.packet[4], kernel.packet[5]);
1113 __m512 T5 = _mm512_unpackhi_ps(kernel.packet[4], kernel.packet[5]);
1114 __m512 T6 = _mm512_unpacklo_ps(kernel.packet[6], kernel.packet[7]);
1115 __m512 T7 = _mm512_unpackhi_ps(kernel.packet[6], kernel.packet[7]);
1116 __m512 T8 = _mm512_unpacklo_ps(kernel.packet[8], kernel.packet[9]);
1117 __m512 T9 = _mm512_unpackhi_ps(kernel.packet[8], kernel.packet[9]);
1118 __m512 T10 = _mm512_unpacklo_ps(kernel.packet[10], kernel.packet[11]);
1119 __m512 T11 = _mm512_unpackhi_ps(kernel.packet[10], kernel.packet[11]);
1120 __m512 T12 = _mm512_unpacklo_ps(kernel.packet[12], kernel.packet[13]);
1121 __m512 T13 = _mm512_unpackhi_ps(kernel.packet[12], kernel.packet[13]);
1122 __m512 T14 = _mm512_unpacklo_ps(kernel.packet[14], kernel.packet[15]);
1123 __m512 T15 = _mm512_unpackhi_ps(kernel.packet[14], kernel.packet[15]);
1124 __m512 S0 = _mm512_shuffle_ps(T0, T2, _MM_SHUFFLE(1, 0, 1, 0));
1125 __m512 S1 = _mm512_shuffle_ps(T0, T2, _MM_SHUFFLE(3, 2, 3, 2));
1126 __m512 S2 = _mm512_shuffle_ps(T1, T3, _MM_SHUFFLE(1, 0, 1, 0));
1127 __m512 S3 = _mm512_shuffle_ps(T1, T3, _MM_SHUFFLE(3, 2, 3, 2));
1128 __m512 S4 = _mm512_shuffle_ps(T4, T6, _MM_SHUFFLE(1, 0, 1, 0));
1129 __m512 S5 = _mm512_shuffle_ps(T4, T6, _MM_SHUFFLE(3, 2, 3, 2));
1130 __m512 S6 = _mm512_shuffle_ps(T5, T7, _MM_SHUFFLE(1, 0, 1, 0));
1131 __m512 S7 = _mm512_shuffle_ps(T5, T7, _MM_SHUFFLE(3, 2, 3, 2));
1132 __m512 S8 = _mm512_shuffle_ps(T8, T10, _MM_SHUFFLE(1, 0, 1, 0));
1133 __m512 S9 = _mm512_shuffle_ps(T8, T10, _MM_SHUFFLE(3, 2, 3, 2));
1134 __m512 S10 = _mm512_shuffle_ps(T9, T11, _MM_SHUFFLE(1, 0, 1, 0));
1135 __m512 S11 = _mm512_shuffle_ps(T9, T11, _MM_SHUFFLE(3, 2, 3, 2));
1136 __m512 S12 = _mm512_shuffle_ps(T12, T14, _MM_SHUFFLE(1, 0, 1, 0));
1137 __m512 S13 = _mm512_shuffle_ps(T12, T14, _MM_SHUFFLE(3, 2, 3, 2));
1138 __m512 S14 = _mm512_shuffle_ps(T13, T15, _MM_SHUFFLE(1, 0, 1, 0));
1139 __m512 S15 = _mm512_shuffle_ps(T13, T15, _MM_SHUFFLE(3, 2, 3, 2));
1140
1157
1159
1160 tmp.packet[0] = _mm256_permute2f128_ps(S0_0, S4_0, 0x20);
1161 tmp.packet[1] = _mm256_permute2f128_ps(S1_0, S5_0, 0x20);
1162 tmp.packet[2] = _mm256_permute2f128_ps(S2_0, S6_0, 0x20);
1163 tmp.packet[3] = _mm256_permute2f128_ps(S3_0, S7_0, 0x20);
1164 tmp.packet[4] = _mm256_permute2f128_ps(S0_0, S4_0, 0x31);
1165 tmp.packet[5] = _mm256_permute2f128_ps(S1_0, S5_0, 0x31);
1166 tmp.packet[6] = _mm256_permute2f128_ps(S2_0, S6_0, 0x31);
1167 tmp.packet[7] = _mm256_permute2f128_ps(S3_0, S7_0, 0x31);
1168
1169 tmp.packet[8] = _mm256_permute2f128_ps(S0_1, S4_1, 0x20);
1170 tmp.packet[9] = _mm256_permute2f128_ps(S1_1, S5_1, 0x20);
1171 tmp.packet[10] = _mm256_permute2f128_ps(S2_1, S6_1, 0x20);
1172 tmp.packet[11] = _mm256_permute2f128_ps(S3_1, S7_1, 0x20);
1173 tmp.packet[12] = _mm256_permute2f128_ps(S0_1, S4_1, 0x31);
1174 tmp.packet[13] = _mm256_permute2f128_ps(S1_1, S5_1, 0x31);
1175 tmp.packet[14] = _mm256_permute2f128_ps(S2_1, S6_1, 0x31);
1176 tmp.packet[15] = _mm256_permute2f128_ps(S3_1, S7_1, 0x31);
1177
1178 // Second set of _m256 outputs
1179 tmp.packet[16] = _mm256_permute2f128_ps(S8_0, S12_0, 0x20);
1180 tmp.packet[17] = _mm256_permute2f128_ps(S9_0, S13_0, 0x20);
1181 tmp.packet[18] = _mm256_permute2f128_ps(S10_0, S14_0, 0x20);
1182 tmp.packet[19] = _mm256_permute2f128_ps(S11_0, S15_0, 0x20);
1183 tmp.packet[20] = _mm256_permute2f128_ps(S8_0, S12_0, 0x31);
1184 tmp.packet[21] = _mm256_permute2f128_ps(S9_0, S13_0, 0x31);
1185 tmp.packet[22] = _mm256_permute2f128_ps(S10_0, S14_0, 0x31);
1186 tmp.packet[23] = _mm256_permute2f128_ps(S11_0, S15_0, 0x31);
1187
1188 tmp.packet[24] = _mm256_permute2f128_ps(S8_1, S12_1, 0x20);
1189 tmp.packet[25] = _mm256_permute2f128_ps(S9_1, S13_1, 0x20);
1190 tmp.packet[26] = _mm256_permute2f128_ps(S10_1, S14_1, 0x20);
1191 tmp.packet[27] = _mm256_permute2f128_ps(S11_1, S15_1, 0x20);
1192 tmp.packet[28] = _mm256_permute2f128_ps(S8_1, S12_1, 0x31);
1193 tmp.packet[29] = _mm256_permute2f128_ps(S9_1, S13_1, 0x31);
1194 tmp.packet[30] = _mm256_permute2f128_ps(S10_1, S14_1, 0x31);
1195 tmp.packet[31] = _mm256_permute2f128_ps(S11_1, S15_1, 0x31);
1196
1197 // Pack them into the output
1198 PACK_OUTPUT(kernel.packet, tmp.packet, 0, 16);
1199 PACK_OUTPUT(kernel.packet, tmp.packet, 1, 16);
1200 PACK_OUTPUT(kernel.packet, tmp.packet, 2, 16);
1201 PACK_OUTPUT(kernel.packet, tmp.packet, 3, 16);
1202
1203 PACK_OUTPUT(kernel.packet, tmp.packet, 4, 16);
1204 PACK_OUTPUT(kernel.packet, tmp.packet, 5, 16);
1205 PACK_OUTPUT(kernel.packet, tmp.packet, 6, 16);
1206 PACK_OUTPUT(kernel.packet, tmp.packet, 7, 16);
1207
1208 PACK_OUTPUT(kernel.packet, tmp.packet, 8, 16);
1209 PACK_OUTPUT(kernel.packet, tmp.packet, 9, 16);
1210 PACK_OUTPUT(kernel.packet, tmp.packet, 10, 16);
1211 PACK_OUTPUT(kernel.packet, tmp.packet, 11, 16);
1212
1213 PACK_OUTPUT(kernel.packet, tmp.packet, 12, 16);
1214 PACK_OUTPUT(kernel.packet, tmp.packet, 13, 16);
1215 PACK_OUTPUT(kernel.packet, tmp.packet, 14, 16);
1216 PACK_OUTPUT(kernel.packet, tmp.packet, 15, 16);
1217}
// PACK_OUTPUT_2(OUTPUT, INPUT, INDEX, STRIDE)
// Like PACK_OUTPUT, but the source index is doubled: OUTPUT[INDEX] is built
// from INPUT[2 * INDEX] (lower half) and INPUT[2 * INDEX + STRIDE] (upper
// half), via EIGEN_INSERT_8f_INTO_16f.
1218#define PACK_OUTPUT_2(OUTPUT, INPUT, INDEX, STRIDE) \
1219 EIGEN_INSERT_8f_INTO_16f(OUTPUT[INDEX], INPUT[2 * INDEX], \
1220 INPUT[2 * INDEX + STRIDE]);
1221
1223 __m512 T0 = _mm512_unpacklo_ps(kernel.packet[0], kernel.packet[1]);
1224 __m512 T1 = _mm512_unpackhi_ps(kernel.packet[0], kernel.packet[1]);
1225 __m512 T2 = _mm512_unpacklo_ps(kernel.packet[2], kernel.packet[3]);
1226 __m512 T3 = _mm512_unpackhi_ps(kernel.packet[2], kernel.packet[3]);
1227
1228 __m512 S0 = _mm512_shuffle_ps(T0, T2, _MM_SHUFFLE(1, 0, 1, 0));
1229 __m512 S1 = _mm512_shuffle_ps(T0, T2, _MM_SHUFFLE(3, 2, 3, 2));
1230 __m512 S2 = _mm512_shuffle_ps(T1, T3, _MM_SHUFFLE(1, 0, 1, 0));
1231 __m512 S3 = _mm512_shuffle_ps(T1, T3, _MM_SHUFFLE(3, 2, 3, 2));
1232
1237
1239
1240 tmp.packet[0] = _mm256_permute2f128_ps(S0_0, S1_0, 0x20);
1241 tmp.packet[1] = _mm256_permute2f128_ps(S2_0, S3_0, 0x20);
1242 tmp.packet[2] = _mm256_permute2f128_ps(S0_0, S1_0, 0x31);
1243 tmp.packet[3] = _mm256_permute2f128_ps(S2_0, S3_0, 0x31);
1244
1245 tmp.packet[4] = _mm256_permute2f128_ps(S0_1, S1_1, 0x20);
1246 tmp.packet[5] = _mm256_permute2f128_ps(S2_1, S3_1, 0x20);
1247 tmp.packet[6] = _mm256_permute2f128_ps(S0_1, S1_1, 0x31);
1248 tmp.packet[7] = _mm256_permute2f128_ps(S2_1, S3_1, 0x31);
1249
1250 PACK_OUTPUT_2(kernel.packet, tmp.packet, 0, 1);
1251 PACK_OUTPUT_2(kernel.packet, tmp.packet, 1, 1);
1252 PACK_OUTPUT_2(kernel.packet, tmp.packet, 2, 1);
1253 PACK_OUTPUT_2(kernel.packet, tmp.packet, 3, 1);
1254}
1255
// PACK_OUTPUT_SQ_D(OUTPUT, INPUT, INDEX, STRIDE)
// Double-precision packing: inserts INPUT[INDEX] into 256-bit position 0 of
// OUTPUT[INDEX] and INPUT[INDEX + STRIDE] into position 1, using
// _mm512_insertf64x4. Expands to two statements, each ending with ';'.
1256#define PACK_OUTPUT_SQ_D(OUTPUT, INPUT, INDEX, STRIDE) \
1257 OUTPUT[INDEX] = _mm512_insertf64x4(OUTPUT[INDEX], INPUT[INDEX], 0); \
1258 OUTPUT[INDEX] = _mm512_insertf64x4(OUTPUT[INDEX], INPUT[INDEX + STRIDE], 1);
1259
// PACK_OUTPUT_D(OUTPUT, INPUT, INDEX, STRIDE)
// Same as PACK_OUTPUT_SQ_D but with a doubled source index: inserts
// INPUT[2 * INDEX] into 256-bit position 0 of OUTPUT[INDEX] and
// INPUT[2 * INDEX + STRIDE] into position 1. Expands to two statements.
1260#define PACK_OUTPUT_D(OUTPUT, INPUT, INDEX, STRIDE) \
1261 OUTPUT[INDEX] = _mm512_insertf64x4(OUTPUT[INDEX], INPUT[(2 * INDEX)], 0); \
1262 OUTPUT[INDEX] = \
1263 _mm512_insertf64x4(OUTPUT[INDEX], INPUT[(2 * INDEX) + STRIDE], 1);
1264
1266 __m512d T0 = _mm512_shuffle_pd(kernel.packet[0], kernel.packet[1], 0);
1267 __m512d T1 = _mm512_shuffle_pd(kernel.packet[0], kernel.packet[1], 0xff);
1268 __m512d T2 = _mm512_shuffle_pd(kernel.packet[2], kernel.packet[3], 0);
1269 __m512d T3 = _mm512_shuffle_pd(kernel.packet[2], kernel.packet[3], 0xff);
1270
1272
1274 _mm512_extractf64x4_pd(T2, 0), 0x20);
1276 _mm512_extractf64x4_pd(T3, 0), 0x20);
1278 _mm512_extractf64x4_pd(T2, 0), 0x31);
1280 _mm512_extractf64x4_pd(T3, 0), 0x31);
1281
1283 _mm512_extractf64x4_pd(T2, 1), 0x20);
1285 _mm512_extractf64x4_pd(T3, 1), 0x20);
1287 _mm512_extractf64x4_pd(T2, 1), 0x31);
1289 _mm512_extractf64x4_pd(T3, 1), 0x31);
1290
1291 PACK_OUTPUT_D(kernel.packet, tmp.packet, 0, 1);
1292 PACK_OUTPUT_D(kernel.packet, tmp.packet, 1, 1);
1293 PACK_OUTPUT_D(kernel.packet, tmp.packet, 2, 1);
1294 PACK_OUTPUT_D(kernel.packet, tmp.packet, 3, 1);
1295}
1296
1298 __m512d T0 = _mm512_unpacklo_pd(kernel.packet[0], kernel.packet[1]);
1299 __m512d T1 = _mm512_unpackhi_pd(kernel.packet[0], kernel.packet[1]);
1300 __m512d T2 = _mm512_unpacklo_pd(kernel.packet[2], kernel.packet[3]);
1301 __m512d T3 = _mm512_unpackhi_pd(kernel.packet[2], kernel.packet[3]);
1302 __m512d T4 = _mm512_unpacklo_pd(kernel.packet[4], kernel.packet[5]);
1303 __m512d T5 = _mm512_unpackhi_pd(kernel.packet[4], kernel.packet[5]);
1304 __m512d T6 = _mm512_unpacklo_pd(kernel.packet[6], kernel.packet[7]);
1305 __m512d T7 = _mm512_unpackhi_pd(kernel.packet[6], kernel.packet[7]);
1306
1308
1310 _mm512_extractf64x4_pd(T2, 0), 0x20);
1312 _mm512_extractf64x4_pd(T3, 0), 0x20);
1314 _mm512_extractf64x4_pd(T2, 0), 0x31);
1316 _mm512_extractf64x4_pd(T3, 0), 0x31);
1317
1319 _mm512_extractf64x4_pd(T2, 1), 0x20);
1321 _mm512_extractf64x4_pd(T3, 1), 0x20);
1323 _mm512_extractf64x4_pd(T2, 1), 0x31);
1325 _mm512_extractf64x4_pd(T3, 1), 0x31);
1326
1328 _mm512_extractf64x4_pd(T6, 0), 0x20);
1330 _mm512_extractf64x4_pd(T7, 0), 0x20);
1332 _mm512_extractf64x4_pd(T6, 0), 0x31);
1334 _mm512_extractf64x4_pd(T7, 0), 0x31);
1335
1337 _mm512_extractf64x4_pd(T6, 1), 0x20);
1339 _mm512_extractf64x4_pd(T7, 1), 0x20);
1341 _mm512_extractf64x4_pd(T6, 1), 0x31);
1343 _mm512_extractf64x4_pd(T7, 1), 0x31);
1344
1345 PACK_OUTPUT_SQ_D(kernel.packet, tmp.packet, 0, 8);
1346 PACK_OUTPUT_SQ_D(kernel.packet, tmp.packet, 1, 8);
1347 PACK_OUTPUT_SQ_D(kernel.packet, tmp.packet, 2, 8);
1348 PACK_OUTPUT_SQ_D(kernel.packet, tmp.packet, 3, 8);
1349
1350 PACK_OUTPUT_SQ_D(kernel.packet, tmp.packet, 4, 8);
1351 PACK_OUTPUT_SQ_D(kernel.packet, tmp.packet, 5, 8);
1352 PACK_OUTPUT_SQ_D(kernel.packet, tmp.packet, 6, 8);
1353 PACK_OUTPUT_SQ_D(kernel.packet, tmp.packet, 7, 8);
1354}
1355template <>
1357 const Packet16f& /*thenPacket*/,
1358 const Packet16f& /*elsePacket*/) {
1359 assert(false && "To be implemented");
1360 return Packet16f();
1361}
1362template <>
1364 const Packet8d& thenPacket,
1365 const Packet8d& elsePacket) {
1366 __mmask8 m = (ifPacket.select[0] )
1367 | (ifPacket.select[1]<<1)
1368 | (ifPacket.select[2]<<2)
1369 | (ifPacket.select[3]<<3)
1370 | (ifPacket.select[4]<<4)
1371 | (ifPacket.select[5]<<5)
1372 | (ifPacket.select[6]<<6)
1373 | (ifPacket.select[7]<<7);
1375}
1376
1377// Packet math for Eigen::half
1381
1383 return half_impl::raw_uint16_to_half(static_cast<unsigned short>(_mm256_extract_epi16(from, 0)));
1384}
1385
1387 return _mm256_load_si256(reinterpret_cast<const __m256i*>(from));
1388}
1389
1391 return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(from));
1392}
1393
1395 // (void*) -> workaround clang warning:
1396 // cast from 'Eigen::half *' to '__m256i *' increases required alignment from 2 to 32
1397 _mm256_store_si256((__m256i*)(void*)to, from);
1398}
1399
1401 // (void*) -> workaround clang warning:
1402 // cast from 'Eigen::half *' to '__m256i *' increases required alignment from 2 to 32
1404}
1405
1408 unsigned short a = from[0].x;
1409 unsigned short b = from[1].x;
1410 unsigned short c = from[2].x;
1411 unsigned short d = from[3].x;
1412 unsigned short e = from[4].x;
1413 unsigned short f = from[5].x;
1414 unsigned short g = from[6].x;
1415 unsigned short h = from[7].x;
1416 return _mm256_set_epi16(h, h, g, g, f, f, e, e, d, d, c, c, b, b, a, a);
1417}
1418
1421 unsigned short a = from[0].x;
1422 unsigned short b = from[1].x;
1423 unsigned short c = from[2].x;
1424 unsigned short d = from[3].x;
1425 return _mm256_set_epi16(d, d, d, d, c, c, c, c, b, b, b, b, a, a, a, a);
1426}
1427
1429#ifdef EIGEN_HAS_FP16_C
1430 return _mm512_cvtph_ps(a);
1431#else
1433 pstore(aux, a);
1434 float f0(aux[0]);
1435 float f1(aux[1]);
1436 float f2(aux[2]);
1437 float f3(aux[3]);
1438 float f4(aux[4]);
1439 float f5(aux[5]);
1440 float f6(aux[6]);
1441 float f7(aux[7]);
1442 float f8(aux[8]);
1443 float f9(aux[9]);
1444 float fa(aux[10]);
1445 float fb(aux[11]);
1446 float fc(aux[12]);
1447 float fd(aux[13]);
1448 float fe(aux[14]);
1449 float ff(aux[15]);
1450
1451 return _mm512_set_ps(
1452 ff, fe, fd, fc, fb, fa, f9, f8, f7, f6, f5, f4, f3, f2, f1, f0);
1453#endif
1454}
1455
1457#ifdef EIGEN_HAS_FP16_C
1459#else
1460 EIGEN_ALIGN64 float aux[16];
1461 pstore(aux, a);
1462 half h0(aux[0]);
1463 half h1(aux[1]);
1464 half h2(aux[2]);
1465 half h3(aux[3]);
1466 half h4(aux[4]);
1467 half h5(aux[5]);
1468 half h6(aux[6]);
1469 half h7(aux[7]);
1470 half h8(aux[8]);
1471 half h9(aux[9]);
1472 half ha(aux[10]);
1473 half hb(aux[11]);
1474 half hc(aux[12]);
1475 half hd(aux[13]);
1476 half he(aux[14]);
1477 half hf(aux[15]);
1478
1479 return _mm256_set_epi16(
1480 hf.x, he.x, hd.x, hc.x, hb.x, ha.x, h9.x, h8.x,
1481 h7.x, h6.x, h5.x, h4.x, h3.x, h2.x, h1.x, h0.x);
1482#endif
1483}
1484
1486 return ptrue(Packet8i(a));
1487}
1488
1489template <>
1494
1495template <>
1500
1501template <>
1506
1507template <>
1509 return float2half(plset<Packet16f>(static_cast<float>(a)));
1510}
1511
1513 // in some cases Packet8i is a wrapper around __m256i, so we need to
1514 // cast to Packet8i to call the correct overload.
1515 return por(Packet8i(a),Packet8i(b));
1516}
1518 return pxor(Packet8i(a),Packet8i(b));
1519}
1521 return pand(Packet8i(a),Packet8i(b));
1522}
1524 return pandnot(Packet8i(a),Packet8i(b));
1525}
1526
// Selects between `a` and `b` under control of `mask` using
// _mm256_blendv_epi8(b, a, mask): for each byte, the result takes the byte of
// `a` where the mask byte's most significant bit is set and the byte of `b`
// otherwise.
// NOTE(review): presumably mask lanes are all-ones or all-zeros per 16-bit
// element, so both bytes of an element select the same source -- confirm
// against the callers.
1527template<> EIGEN_STRONG_INLINE Packet16h pselect(const Packet16h& mask, const Packet16h& a, const Packet16h& b) {
1528 return _mm256_blendv_epi8(b, a, mask);
1529}
1530
1534
1538
1542
1546
1550 return Pack32To16(pcmp_eq(af, bf));
1551}
1552
1556
1560
1564
// Conjugate of a Packet16h: returns the input packet unchanged.
1565template<> EIGEN_STRONG_INLINE Packet16h pconj(const Packet16h& a) { return a; }
1566
1568 Packet16h sign_mask = _mm256_set1_epi16(static_cast<unsigned short>(0x8000));
1569 return _mm256_xor_si256(a, sign_mask);
1570}
1571
1575 Packet16f rf = padd(af, bf);
1576 return float2half(rf);
1577}
1578
1582 Packet16f rf = psub(af, bf);
1583 return float2half(rf);
1584}
1585
1589 Packet16f rf = pmul(af, bf);
1590 return float2half(rf);
1591}
1592
1596 Packet16f rf = pdiv(af, bf);
1597 return float2half(rf);
1598}
1599
1604
1605template <>
1611
1617
1623
1628
1630{
1631 __m128i m = _mm_setr_epi8(14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1);
1635}
1636
1638{
1639 return _mm256_set_epi16(
1640 from[15*stride].x, from[14*stride].x, from[13*stride].x, from[12*stride].x,
1641 from[11*stride].x, from[10*stride].x, from[9*stride].x, from[8*stride].x,
1642 from[7*stride].x, from[6*stride].x, from[5*stride].x, from[4*stride].x,
1643 from[3*stride].x, from[2*stride].x, from[1*stride].x, from[0*stride].x);
1644}
1645
1647{
1649 pstore(aux, from);
1650 to[stride*0] = aux[0];
1651 to[stride*1] = aux[1];
1652 to[stride*2] = aux[2];
1653 to[stride*3] = aux[3];
1654 to[stride*4] = aux[4];
1655 to[stride*5] = aux[5];
1656 to[stride*6] = aux[6];
1657 to[stride*7] = aux[7];
1658 to[stride*8] = aux[8];
1659 to[stride*9] = aux[9];
1660 to[stride*10] = aux[10];
1661 to[stride*11] = aux[11];
1662 to[stride*12] = aux[12];
1663 to[stride*13] = aux[13];
1664 to[stride*14] = aux[14];
1665 to[stride*15] = aux[15];
1666}
1667
1670 __m256i a = kernel.packet[0];
1671 __m256i b = kernel.packet[1];
1672 __m256i c = kernel.packet[2];
1673 __m256i d = kernel.packet[3];
1674 __m256i e = kernel.packet[4];
1675 __m256i f = kernel.packet[5];
1676 __m256i g = kernel.packet[6];
1677 __m256i h = kernel.packet[7];
1678 __m256i i = kernel.packet[8];
1679 __m256i j = kernel.packet[9];
1680 __m256i k = kernel.packet[10];
1681 __m256i l = kernel.packet[11];
1682 __m256i m = kernel.packet[12];
1683 __m256i n = kernel.packet[13];
1684 __m256i o = kernel.packet[14];
1685 __m256i p = kernel.packet[15];
1686
1695
1704
1713
1722
1739
1740 // NOTE: no unpacklo/hi instr in this case, so using permute instr.
1757
1758 kernel.packet[0] = a_p_0;
1759 kernel.packet[1] = a_p_1;
1760 kernel.packet[2] = a_p_2;
1761 kernel.packet[3] = a_p_3;
1762 kernel.packet[4] = a_p_4;
1763 kernel.packet[5] = a_p_5;
1764 kernel.packet[6] = a_p_6;
1765 kernel.packet[7] = a_p_7;
1766 kernel.packet[8] = a_p_8;
1767 kernel.packet[9] = a_p_9;
1768 kernel.packet[10] = a_p_a;
1769 kernel.packet[11] = a_p_b;
1770 kernel.packet[12] = a_p_c;
1771 kernel.packet[13] = a_p_d;
1772 kernel.packet[14] = a_p_e;
1773 kernel.packet[15] = a_p_f;
1774}
1775
1778 EIGEN_ALIGN64 half in[8][16];
1779 pstore<half>(in[0], kernel.packet[0]);
1780 pstore<half>(in[1], kernel.packet[1]);
1781 pstore<half>(in[2], kernel.packet[2]);
1782 pstore<half>(in[3], kernel.packet[3]);
1783 pstore<half>(in[4], kernel.packet[4]);
1784 pstore<half>(in[5], kernel.packet[5]);
1785 pstore<half>(in[6], kernel.packet[6]);
1786 pstore<half>(in[7], kernel.packet[7]);
1787
1788 EIGEN_ALIGN64 half out[8][16];
1789
1790 for (int i = 0; i < 8; ++i) {
1791 for (int j = 0; j < 8; ++j) {
1792 out[i][j] = in[j][2*i];
1793 }
1794 for (int j = 0; j < 8; ++j) {
1795 out[i][j+8] = in[j][2*i+1];
1796 }
1797 }
1798
1799 kernel.packet[0] = pload<Packet16h>(out[0]);
1800 kernel.packet[1] = pload<Packet16h>(out[1]);
1801 kernel.packet[2] = pload<Packet16h>(out[2]);
1802 kernel.packet[3] = pload<Packet16h>(out[3]);
1803 kernel.packet[4] = pload<Packet16h>(out[4]);
1804 kernel.packet[5] = pload<Packet16h>(out[5]);
1805 kernel.packet[6] = pload<Packet16h>(out[6]);
1806 kernel.packet[7] = pload<Packet16h>(out[7]);
1807}
1808
1811 EIGEN_ALIGN64 half in[4][16];
1812 pstore<half>(in[0], kernel.packet[0]);
1813 pstore<half>(in[1], kernel.packet[1]);
1814 pstore<half>(in[2], kernel.packet[2]);
1815 pstore<half>(in[3], kernel.packet[3]);
1816
1817 EIGEN_ALIGN64 half out[4][16];
1818
1819 for (int i = 0; i < 4; ++i) {
1820 for (int j = 0; j < 4; ++j) {
1821 out[i][j] = in[j][4*i];
1822 }
1823 for (int j = 0; j < 4; ++j) {
1824 out[i][j+4] = in[j][4*i+1];
1825 }
1826 for (int j = 0; j < 4; ++j) {
1827 out[i][j+8] = in[j][4*i+2];
1828 }
1829 for (int j = 0; j < 4; ++j) {
1830 out[i][j+12] = in[j][4*i+3];
1831 }
1832 }
1833
1834 kernel.packet[0] = pload<Packet16h>(out[0]);
1835 kernel.packet[1] = pload<Packet16h>(out[1]);
1836 kernel.packet[2] = pload<Packet16h>(out[2]);
1837 kernel.packet[3] = pload<Packet16h>(out[3]);
1838}
1839
1840template <> struct is_arithmetic<Packet16bf> { enum { value = true }; };
1841
1842template <>
1846 enum {
1847 Vectorizable = 1,
1848 AlignedOnScalar = 1,
1849 size = 16,
1850 HasHalfPacket = 1,
1851 HasBlend = 0,
1852 HasInsert = 1,
1855#if EIGEN_GNUC_AT_LEAST(5, 3) || (!EIGEN_COMP_GNUC_STRICT)
1856#ifdef EIGEN_VECTORIZE_AVX512DQ
1857 HasLog = 1, // Currently fails test with bad accuracy.
1858 HasLog1p = 1,
1859 HasExpm1 = 1,
1860 HasNdtri = 1,
1861 HasBessel = 1,
1862#endif
1863 HasExp = 1,
1868#endif
1869 HasCmp = 1,
1870 HasDiv = 1
1871 };
1872};
1873
1874template <>
1881
1882template <>
1886
1887template <>
1889 bfloat16 t;
1890 t.value = static_cast<unsigned short>(_mm256_extract_epi16(from, 0));
1891 return t;
1892}
1893
1894template <>
1896 return _mm256_load_si256(reinterpret_cast<const __m256i*>(from));
1897}
1898
1899template <>
1901 return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(from));
1902}
1903
1904template <>
1906 const Packet16bf& from) {
1907 _mm256_store_si256(reinterpret_cast<__m256i*>(to), from);
1908}
1909
1910template <>
1912 const Packet16bf& from) {
1913 _mm256_storeu_si256(reinterpret_cast<__m256i*>(to), from);
1914}
1915
1918 Packet16bf r;
1919 unsigned short a = from[0].value;
1920 unsigned short b = from[1].value;
1921 unsigned short c = from[2].value;
1922 unsigned short d = from[3].value;
1923 unsigned short e = from[4].value;
1924 unsigned short f = from[5].value;
1925 unsigned short g = from[6].value;
1926 unsigned short h = from[7].value;
1927 return _mm256_set_epi16(h, h, g, g, f, f, e, e, d, d, c, c, b, b, a, a);
1928}
1929
1932 Packet16bf r;
1933 unsigned short a = from[0].value;
1934 unsigned short b = from[1].value;
1935 unsigned short c = from[2].value;
1936 unsigned short d = from[3].value;
1937 return _mm256_set_epi16(d, d, d, d, c, c, c, c, b, b, b, b, a, a, a, a);
1938}
1939
1943
1944// Convert float to bfloat16 according to round-to-nearest-even/denormals algorithm.
1946 Packet16bf r;
1947
1948#if defined(EIGEN_VECTORIZE_AVX512BF16) && EIGEN_GNUC_AT_LEAST(10, 1)
1949 // Since GCC 10.1 supports avx512bf16 and C style explicit cast
1950 // (C++ static_cast is not supported yet), do conversion via intrinsic
1951 // and register path for performance.
1953
1954#else
1955 __m512i t;
1957 __m512i nan = _mm512_set1_epi32(0x7fc0);
1958
1959 // uint32_t lsb = (input >> 16) & 1;
1961 // uint32_t rounding_bias = 0x7fff + lsb;
1962 t = _mm512_add_epi32(t, _mm512_set1_epi32(0x7fff));
1963 // input += rounding_bias;
1964 t = _mm512_add_epi32(t, input);
1965 // input = input >> 16;
1966 t = _mm512_srli_epi32(t, 16);
1967
1968 // Check NaN before converting back to bf16
1970
1971 t = _mm512_mask_blend_epi32(mask, nan, t);
1972 // output.value = static_cast<uint16_t>(input);
1973 r = _mm512_cvtepi32_epi16(t);
1974#endif // EIGEN_VECTORIZE_AVX512BF16
1975
1976 return r;
1977}
1978
1979template <>
1983
1984template <>
1988
1989template <>
1993
1994template <>
1998
1999template <>
2004
2005template <>
2007 const Packet16bf& a,
2008 const Packet16bf& b) {
2009 // Each mask element is expected to be all-zero or all-one bits, so the
2010 // byte-granular (8-bit) blend intrinsic can be used for performance.
2011 return _mm256_blendv_epi8(b, a, mask);
2012}
2013
2018
2022
2026
2030
2031template <>
2036
2037template <>
2042
2043template <>
2048
2049template <>
2054
2055template <>
2057 Packet16bf sign_mask = _mm256_set1_epi16(static_cast<unsigned short>(0x8000));
2058 return _mm256_xor_si256(a, sign_mask);
2059}
2060
2061template <>
2063 return a;
2064}
2065
2066template <>
2071
2072template <>
2077
2078template <>
2083
2084template <>
2089
2090template <>
2095
2096template <>
2101
2102template <>
2107
2108template <>
2110 return F32ToBf16(plset<Packet16f>(static_cast<float>(a)));
2111}
2112
2113template <>
2119
2120template <>
2124
2125template <>
2129
2130template <>
2134
2135template <>
2139
2140template <>
2142 __m256i m = _mm256_setr_epi8(14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1,
2143 14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1);
2144
2146 // Swap hi and lo first because shuffle is in 128-bit lanes.
2148 // Shuffle 8-bit values in src within 2*128-bit lanes.
2149 return _mm256_shuffle_epi8(res, m);
2150}
2151
2152template <>
2154 Index stride) {
2155 return _mm256_set_epi16(
2156 from[15*stride].value, from[14*stride].value, from[13*stride].value, from[12*stride].value,
2157 from[11*stride].value, from[10*stride].value, from[9*stride].value, from[8*stride].value,
2158 from[7*stride].value, from[6*stride].value, from[5*stride].value, from[4*stride].value,
2159 from[3*stride].value, from[2*stride].value, from[1*stride].value, from[0*stride].value);
2160}
2161
2162template <>
2164 const Packet16bf& from,
2165 Index stride) {
2167 pstore(aux, from);
2168 to[stride*0] = aux[0];
2169 to[stride*1] = aux[1];
2170 to[stride*2] = aux[2];
2171 to[stride*3] = aux[3];
2172 to[stride*4] = aux[4];
2173 to[stride*5] = aux[5];
2174 to[stride*6] = aux[6];
2175 to[stride*7] = aux[7];
2176 to[stride*8] = aux[8];
2177 to[stride*9] = aux[9];
2178 to[stride*10] = aux[10];
2179 to[stride*11] = aux[11];
2180 to[stride*12] = aux[12];
2181 to[stride*13] = aux[13];
2182 to[stride*14] = aux[14];
2183 to[stride*15] = aux[15];
2184}
2185
2187 __m256i a = kernel.packet[0];
2188 __m256i b = kernel.packet[1];
2189 __m256i c = kernel.packet[2];
2190 __m256i d = kernel.packet[3];
2191 __m256i e = kernel.packet[4];
2192 __m256i f = kernel.packet[5];
2193 __m256i g = kernel.packet[6];
2194 __m256i h = kernel.packet[7];
2195 __m256i i = kernel.packet[8];
2196 __m256i j = kernel.packet[9];
2197 __m256i k = kernel.packet[10];
2198 __m256i l = kernel.packet[11];
2199 __m256i m = kernel.packet[12];
2200 __m256i n = kernel.packet[13];
2201 __m256i o = kernel.packet[14];
2202 __m256i p = kernel.packet[15];
2203
2212
2221
2230
2239
2256
2257 // NOTE: no unpacklo/hi instr in this case, so using permute instr.
2258 kernel.packet[0] = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x20);
2259 kernel.packet[1] = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x20);
2260 kernel.packet[2] = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x20);
2261 kernel.packet[3] = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x20);
2262 kernel.packet[4] = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x20);
2263 kernel.packet[5] = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x20);
2264 kernel.packet[6] = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x20);
2265 kernel.packet[7] = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x20);
2266 kernel.packet[8] = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x31);
2267 kernel.packet[9] = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x31);
2268 kernel.packet[10] = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x31);
2269 kernel.packet[11] = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x31);
2270 kernel.packet[12] = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x31);
2271 kernel.packet[13] = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x31);
2272 kernel.packet[14] = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x31);
2273 kernel.packet[15] = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x31);
2274}
2275
2277 __m256i a = kernel.packet[0];
2278 __m256i b = kernel.packet[1];
2279 __m256i c = kernel.packet[2];
2280 __m256i d = kernel.packet[3];
2281
2286
2291
2292 // NOTE: no unpacklo/hi instr in this case, so using permute instr.
2293 kernel.packet[0] = _mm256_permute2x128_si256(abcd_03, abcd_47, 0x20);
2294 kernel.packet[1] = _mm256_permute2x128_si256(abcd_8b, abcd_cf, 0x20);
2295 kernel.packet[2] = _mm256_permute2x128_si256(abcd_03, abcd_47, 0x31);
2296 kernel.packet[3] = _mm256_permute2x128_si256(abcd_8b, abcd_cf, 0x31);
2297}
2298
2299} // end namespace internal
2300
2301} // end namespace Eigen
2302
2303#endif // EIGEN_PACKET_MATH_AVX512_H
#define PACK_OUTPUT_2(OUTPUT, INPUT, INDEX, STRIDE)
Definition PacketMath.h:1218
#define PACK_OUTPUT_D(OUTPUT, INPUT, INDEX, STRIDE)
Definition PacketMath.h:1260
#define PACK_OUTPUT_SQ_D(OUTPUT, INPUT, INDEX, STRIDE)
Definition PacketMath.h:1256
#define EIGEN_EXTRACT_8f_FROM_16f(INPUT, OUTPUT)
Definition PacketMath.h:956
#define PACK_OUTPUT(OUTPUT, INPUT, INDEX, STRIDE)
Definition PacketMath.h:1104
Matrix3f m
Definition AngleAxis_mimic_euler.cpp:1
ArrayXXi a
Definition Array_initializer_list_23_cxx11.cpp:1
int n
Definition BiCGSTAB_simple.cpp:1
int i
Definition BiCGSTAB_step_by_step.cpp:9
#define EIGEN_ALIGN64
Definition ConfigureVectorization.h:155
Array< double, 1, 3 > e(1./3., 0.5, 2.)
#define EIGEN_DEBUG_ALIGNED_STORE
Definition GenericPacketMath.h:35
#define EIGEN_DEBUG_ALIGNED_LOAD
Definition GenericPacketMath.h:27
#define EIGEN_DEBUG_UNALIGNED_STORE
Definition GenericPacketMath.h:39
#define EIGEN_DEBUG_UNALIGNED_LOAD
Definition GenericPacketMath.h:31
HessenbergDecomposition< MatrixXcf > hd(4)
#define EIGEN_DEVICE_FUNC
Definition Macros.h:976
#define EIGEN_FAST_MATH
Definition Macros.h:49
#define EIGEN_STRONG_INLINE
Definition Macros.h:917
cout<< "Here is the matrix m:"<< endl<< m<< endl;Matrix< ptrdiff_t, 3, 1 > res
Definition PartialRedux_count.cpp:3
Vector3d hc
Definition Tridiagonalization_householderCoefficients.cpp:5
float * p
Definition Tutorial_Map_using.cpp:9
Scalar Scalar * c
Definition benchVecAdd.cpp:17
Scalar * b
Definition benchVecAdd.cpp:17
@ N
Definition constructor.cpp:23
set noclip points set clip one set noclip two set bar set border lt lw set xdata set ydata set zdata set x2data set y2data set boxwidth set dummy x
Definition gnuplot_common_settings.hh:12
@ Aligned64
Definition Constants.h:237
@ Aligned32
Definition Constants.h:236
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR __half_raw raw_uint16_to_half(numext::uint16_t x)
Definition Half.h:495
EIGEN_STRONG_INLINE Packet8d pandnot< Packet8d >(const Packet8d &a, const Packet8d &b)
Definition PacketMath.h:643
EIGEN_STRONG_INLINE Packet16i ptrue< Packet16i >(const Packet16i &)
Definition PacketMath.h:536
v2f64 Packet2d
Definition PacketMath.h:820
EIGEN_STRONG_INLINE void pstoreu< double >(double *to, const Packet4d &from)
Definition PacketMath.h:627
EIGEN_STRONG_INLINE Packet pminmax_propagate_numbers(const Packet &a, const Packet &b, Op op)
Definition PacketMath.h:546
EIGEN_STRONG_INLINE Packet16bf ploaddup< Packet16bf >(const bfloat16 *from)
Definition PacketMath.h:1917
EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf &a)
Definition Complex.h:167
EIGEN_STRONG_INLINE Packet8d pload< Packet8d >(const double *from)
Definition PacketMath.h:683
EIGEN_STRONG_INLINE Packet8d ploadquad< Packet8d >(const double *from)
Definition PacketMath.h:761
EIGEN_STRONG_INLINE Packet8d psub< Packet8d >(const Packet8d &a, const Packet8d &b)
Definition PacketMath.h:280
EIGEN_STRONG_INLINE Packet8d ploadu< Packet8d >(const double *from)
Definition PacketMath.h:697
EIGEN_STRONG_INLINE Packet8d pset1frombits< Packet8d >(const numext::uint64_t from)
Definition PacketMath.h:215
EIGEN_STRONG_INLINE Packet16f pceil< Packet16f >(const Packet16f &a)
Definition PacketMath.h:529
EIGEN_STRONG_INLINE Packet8d plset< Packet8d >(const double &a)
Definition PacketMath.h:253
EIGEN_DEVICE_FUNC Packet padd(const Packet &a, const Packet &b)
Definition GenericPacketMath.h:215
EIGEN_STRONG_INLINE bfloat16 predux_max< Packet16bf >(const Packet16bf &from)
Definition PacketMath.h:2136
EIGEN_DEVICE_FUNC unpacket_traits< Packet >::type predux(const Packet &a)
Definition GenericPacketMath.h:875
EIGEN_STRONG_INLINE Packet8h float2half(const Packet8f &a)
Definition PacketMath.h:1007
EIGEN_STRONG_INLINE Packet8f Bf16ToF32(const Packet8bf &a)
Definition PacketMath.h:1260
EIGEN_STRONG_INLINE Packet8f pzero(const Packet8f &)
Definition PacketMath.h:247
EIGEN_STRONG_INLINE Packet8d pmul< Packet8d >(const Packet8d &a, const Packet8d &b)
Definition PacketMath.h:318
EIGEN_STRONG_INLINE Packet8f predux_half_dowto4< Packet16f >(const Packet16f &a)
Definition PacketMath.h:1005
EIGEN_STRONG_INLINE Packet8d pmin< PropagateNaN, Packet8d >(const Packet8d &a, const Packet8d &b)
Definition PacketMath.h:418
EIGEN_STRONG_INLINE Packet16bf pmax< Packet16bf >(const Packet16bf &a, const Packet16bf &b)
Definition PacketMath.h:2103
EIGEN_STRONG_INLINE Packet16f pfrexp< Packet16f >(const Packet16f &a, Packet16f &exponent)
Definition PacketMath.h:898
EIGEN_STRONG_INLINE Packet16f padd< Packet16f >(const Packet16f &a, const Packet16f &b)
Definition PacketMath.h:259
EIGEN_STRONG_INLINE Packet16f cat256(Packet8f a, Packet8f b)
Definition PacketMath.h:446
EIGEN_STRONG_INLINE Packet16bf pmul< Packet16bf >(const Packet16bf &a, const Packet16bf &b)
Definition PacketMath.h:2085
EIGEN_STRONG_INLINE Packet2d extract128(Packet8d x)
Definition PacketMath.h:442
EIGEN_STRONG_INLINE Packet16bf pset1< Packet16bf >(const bfloat16 &from)
Definition PacketMath.h:1883
EIGEN_STRONG_INLINE Packet8i pset1< Packet8i >(const int &from)
Definition PacketMath.h:242
EIGEN_STRONG_INLINE Packet16i por< Packet16i >(const Packet16i &a, const Packet16i &b)
Definition PacketMath.h:583
EIGEN_STRONG_INLINE Packet16h ploadquad(const Eigen::half *from)
Definition PacketMath.h:1420
EIGEN_STRONG_INLINE Packet16f ploaddup< Packet16f >(const float *from)
Definition PacketMath.h:715
EIGEN_STRONG_INLINE Packet16f pxor< Packet16f >(const Packet16f &a, const Packet16f &b)
Definition PacketMath.h:612
EIGEN_STRONG_INLINE Packet16f por< Packet16f >(const Packet16f &a, const Packet16f &b)
Definition PacketMath.h:588
EIGEN_STRONG_INLINE float predux_max< Packet16f >(const Packet16f &a)
Definition PacketMath.h:1076
EIGEN_STRONG_INLINE Packet16h pround< Packet16h >(const Packet16h &a)
Definition PacketMath.h:1531
EIGEN_STRONG_INLINE Eigen::half pfirst< Packet16h >(const Packet16h &from)
Definition PacketMath.h:1382
EIGEN_STRONG_INLINE Packet4d predux_half_dowto4< Packet8d >(const Packet8d &a)
Definition PacketMath.h:1021
EIGEN_STRONG_INLINE Packet8d print< Packet8d >(const Packet8d &a)
Definition PacketMath.h:527
EIGEN_STRONG_INLINE Packet16i pset1< Packet16i >(const int &from)
Definition PacketMath.h:205
EIGEN_STRONG_INLINE Packet16h pfloor< Packet16h >(const Packet16h &a)
Definition PacketMath.h:1543
EIGEN_STRONG_INLINE void ptranspose(PacketBlock< Packet2cf, 2 > &kernel)
Definition Complex.h:224
EIGEN_STRONG_INLINE Packet16f print< Packet16f >(const Packet16f &a)
Definition PacketMath.h:526
EIGEN_STRONG_INLINE Packet16h ploadu< Packet16h >(const Eigen::half *from)
Definition PacketMath.h:1390
__m512d Packet8d
Definition PacketMath.h:33
EIGEN_STRONG_INLINE bool predux_any(const Packet4f &x)
Definition PacketMath.h:1765
EIGEN_STRONG_INLINE Packet8i por< Packet8i >(const Packet8i &a, const Packet8i &b)
Definition PacketMath.h:501
EIGEN_STRONG_INLINE Packet16f ploadu< Packet16f >(const float *from)
Definition PacketMath.h:693
EIGEN_STRONG_INLINE Packet16bf pceil< Packet16bf >(const Packet16bf &a)
Definition PacketMath.h:2023
EIGEN_DEVICE_FUNC void pscatter< float, Packet16f >(float *to, const Packet16f &from, Index stride)
Definition PacketMath.h:823
EIGEN_DEVICE_FUNC Packet pdiv(const Packet &a, const Packet &b)
Definition GenericPacketMath.h:244
EIGEN_STRONG_INLINE void pstore< bfloat16 >(bfloat16 *to, const Packet8bf &from)
Definition PacketMath.h:511
EIGEN_STRONG_INLINE Packet16f psub< Packet16f >(const Packet16f &a, const Packet16f &b)
Definition PacketMath.h:275
EIGEN_STRONG_INLINE int pfirst< Packet16i >(const Packet16i &a)
Definition PacketMath.h:871
EIGEN_STRONG_INLINE Packet16h pceil< Packet16h >(const Packet16h &a)
Definition PacketMath.h:1539
EIGEN_STRONG_INLINE Packet16h pload< Packet16h >(const Eigen::half *from)
Definition PacketMath.h:1386
EIGEN_STRONG_INLINE Packet16i pandnot< Packet16i >(const Packet16i &a, const Packet16i &b)
Definition PacketMath.h:630
EIGEN_STRONG_INLINE Packet4i plogical_shift_left(const Packet4i &a)
Definition PacketMath.h:1191
EIGEN_STRONG_INLINE Eigen::half predux_min< Packet16h >(const Packet16h &a)
Definition PacketMath.h:1618
EIGEN_STRONG_INLINE Packet8f extract256(Packet16f x)
Definition PacketMath.h:437
EIGEN_STRONG_INLINE Packet16i pmul< Packet16i >(const Packet16i &a, const Packet16i &b)
Definition PacketMath.h:323
EIGEN_STRONG_INLINE Packet16bf pgather< bfloat16, Packet16bf >(const bfloat16 *from, Index stride)
Definition PacketMath.h:2153
EIGEN_STRONG_INLINE Packet16f plset< Packet16f >(const float &a)
Definition PacketMath.h:246
EIGEN_STRONG_INLINE Packet16i pxor< Packet16i >(const Packet16i &a, const Packet16i &b)
Definition PacketMath.h:607
EIGEN_STRONG_INLINE Packet16f pmax< PropagateNumbers, Packet16f >(const Packet16f &a, const Packet16f &b)
Definition PacketMath.h:406
EIGEN_DEVICE_FUNC Packet pmax(const Packet &a, const Packet &b)
Definition GenericPacketMath.h:524
EIGEN_STRONG_INLINE Packet16i psub< Packet16i >(const Packet16i &a, const Packet16i &b)
Definition PacketMath.h:285
EIGEN_STRONG_INLINE Packet4i pblend(const Selector< 4 > &ifPacket, const Packet4i &thenPacket, const Packet4i &elsePacket)
Definition PacketMath.h:2107
EIGEN_STRONG_INLINE Packet8bf padd< Packet8bf >(const Packet8bf &a, const Packet8bf &b)
Definition PacketMath.h:1324
EIGEN_STRONG_INLINE Packet4f pcmp_le(const Packet4f &a, const Packet4f &b)
Definition PacketMath.h:867
EIGEN_STRONG_INLINE Packet4i plogical_shift_right(const Packet4i &a)
Definition PacketMath.h:1189
EIGEN_STRONG_INLINE Packet16f pset1frombits< Packet16f >(unsigned int from)
Definition PacketMath.h:210
EIGEN_STRONG_INLINE Packet8d ptrue< Packet8d >(const Packet8d &a)
Definition PacketMath.h:546
EIGEN_STRONG_INLINE Packet pminmax_propagate_nan(const Packet &a, const Packet &b, Op op)
Definition PacketMath.h:555
EIGEN_STRONG_INLINE half predux_mul< Packet16h >(const Packet16h &from)
Definition PacketMath.h:1624
EIGEN_STRONG_INLINE Packet16bf plset< Packet16bf >(const bfloat16 &a)
Definition PacketMath.h:2109
eigen_packet_wrapper< __m256i, 2 > Packet16bf
Definition PacketMath.h:35
EIGEN_STRONG_INLINE void pstore1< Packet16f >(float *to, const float &a)
Definition PacketMath.h:843
EIGEN_STRONG_INLINE void pstore< int >(int *to, const Packet4i &from)
Definition PacketMath.h:496
EIGEN_STRONG_INLINE Packet8h por(const Packet8h &a, const Packet8h &b)
Definition PacketMath.h:1042
EIGEN_STRONG_INLINE Packet8d pfloor< Packet8d >(const Packet8d &a)
Definition PacketMath.h:533
EIGEN_STRONG_INLINE float predux_mul< Packet16f >(const Packet16f &a)
Definition PacketMath.h:1028
EIGEN_STRONG_INLINE Packet16i pand< Packet16i >(const Packet16i &a, const Packet16i &b)
Definition PacketMath.h:551
EIGEN_STRONG_INLINE double predux< Packet8d >(const Packet8d &a)
Definition PacketMath.h:996
EIGEN_STRONG_INLINE Packet8d pload1< Packet8d >(const double *from)
Definition PacketMath.h:241
EIGEN_STRONG_INLINE Packet16i padd< Packet16i >(const Packet16i &a, const Packet16i &b)
Definition PacketMath.h:269
EIGEN_STRONG_INLINE float predux< Packet8f >(const Packet8f &a)
Definition PacketMath.h:798
EIGEN_STRONG_INLINE Packet16i ploadu< Packet16i >(const int *from)
Definition PacketMath.h:701
EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf &a)
Definition Complex.h:184
EIGEN_STRONG_INLINE Packet16f pround< Packet16f >(const Packet16f &a)
Definition PacketMath.h:651
EIGEN_STRONG_INLINE Packet8d por< Packet8d >(const Packet8d &a, const Packet8d &b)
Definition PacketMath.h:597
EIGEN_STRONG_INLINE Packet8f half2float(const Packet8h &a)
Definition PacketMath.h:988
EIGEN_STRONG_INLINE void pstore< double >(double *to, const Packet4d &from)
Definition PacketMath.h:623
EIGEN_DEVICE_FUNC unpacket_traits< Packet >::type predux_mul(const Packet &a)
Definition GenericPacketMath.h:882
EIGEN_STRONG_INLINE float predux< Packet16f >(const Packet16f &a)
Definition PacketMath.h:978
EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f &a, const Packet4f &b, const Packet4f &c)
Definition PacketMath.h:827
EIGEN_STRONG_INLINE Packet16f pset1< Packet16f >(const float &from)
Definition PacketMath.h:197
EIGEN_STRONG_INLINE Packet8d ploaddup< Packet8d >(const double *from)
Definition PacketMath.h:739
EIGEN_STRONG_INLINE Packet16f pmax< Packet16f >(const Packet16f &a, const Packet16f &b)
Definition PacketMath.h:384
EIGEN_DEVICE_FUNC Packet pmul(const Packet &a, const Packet &b)
Definition GenericPacketMath.h:237
EIGEN_STRONG_INLINE Packet8h ptrue(const Packet8h &a)
Definition PacketMath.h:978
EIGEN_DEVICE_FUNC Packet pmin(const Packet &a, const Packet &b)
Definition GenericPacketMath.h:512
EIGEN_STRONG_INLINE Packet8h pandnot(const Packet8h &a, const Packet8h &b)
Definition PacketMath.h:1053
EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf &a)
Definition Complex.h:166
EIGEN_STRONG_INLINE Packet8h predux_half_dowto4< Packet16h >(const Packet16h &a)
Definition PacketMath.h:1606
EIGEN_STRONG_INLINE Packet4f pcmp_lt(const Packet4f &a, const Packet4f &b)
Definition PacketMath.h:868
EIGEN_STRONG_INLINE Packet4f pselect(const Packet4f &mask, const Packet4f &a, const Packet4f &b)
Definition PacketMath.h:917
EIGEN_STRONG_INLINE void prefetch< float >(const float *addr)
Definition PacketMath.h:1117
EIGEN_STRONG_INLINE void pstoreu< bfloat16 >(bfloat16 *to, const Packet8bf &from)
Definition PacketMath.h:1104
EIGEN_STRONG_INLINE Packet4i parithmetic_shift_right(const Packet4i &a)
Definition PacketMath.h:1187
EIGEN_STRONG_INLINE Packet8d pmax< Packet8d >(const Packet8d &a, const Packet8d &b)
Definition PacketMath.h:390
EIGEN_STRONG_INLINE Packet16f pmul< Packet16f >(const Packet16f &a, const Packet16f &b)
Definition PacketMath.h:313
EIGEN_STRONG_INLINE Packet4d pfrexp_generic_get_biased_exponent(const Packet4d &a)
Definition PacketMath.h:743
EIGEN_STRONG_INLINE void pscatter< half, Packet16h >(half *to, const Packet16h &from, Index stride)
Definition PacketMath.h:1646
EIGEN_STRONG_INLINE void pstoreu< half >(Eigen::half *to, const Packet16h &from)
Definition PacketMath.h:1400
EIGEN_STRONG_INLINE Packet16bf ploadu< Packet16bf >(const bfloat16 *from)
Definition PacketMath.h:1900
EIGEN_STRONG_INLINE Packet8i pxor< Packet8i >(const Packet8i &a, const Packet8i &b)
Definition PacketMath.h:511
EIGEN_STRONG_INLINE double predux_max< Packet8d >(const Packet8d &a)
Definition PacketMath.h:1087
EIGEN_STRONG_INLINE Packet16f ptrue< Packet16f >(const Packet16f &a)
Definition PacketMath.h:541
EIGEN_STRONG_INLINE Packet8d pmax< PropagateNumbers, Packet8d >(const Packet8d &a, const Packet8d &b)
Definition PacketMath.h:410
EIGEN_STRONG_INLINE double predux_min< Packet8d >(const Packet8d &a)
Definition PacketMath.h:1067
EIGEN_STRONG_INLINE bfloat16 predux_min< Packet16bf >(const Packet16bf &from)
Definition PacketMath.h:2131
EIGEN_STRONG_INLINE Packet16f pmin< PropagateNumbers, Packet16f >(const Packet16f &a, const Packet16f &b)
Definition PacketMath.h:398
EIGEN_STRONG_INLINE Packet16f pload1< Packet16f >(const float *from)
Definition PacketMath.h:237
EIGEN_STRONG_INLINE Packet16bf psub< Packet16bf >(const Packet16bf &a, const Packet16bf &b)
Definition PacketMath.h:2079
EIGEN_STRONG_INLINE Packet16f pandnot< Packet16f >(const Packet16f &a, const Packet16f &b)
Definition PacketMath.h:635
EIGEN_STRONG_INLINE Packet8d pdiv< Packet8d >(const Packet8d &a, const Packet8d &b)
Definition PacketMath.h:334
EIGEN_STRONG_INLINE Packet16f pdiv< Packet16f >(const Packet16f &a, const Packet16f &b)
Definition PacketMath.h:329
EIGEN_STRONG_INLINE Packet16f pload< Packet16f >(const float *from)
Definition PacketMath.h:679
EIGEN_STRONG_INLINE Packet16f pldexp< Packet16f >(const Packet16f &a, const Packet16f &exponent)
Definition PacketMath.h:919
EIGEN_STRONG_INLINE Packet8d pldexp< Packet8d >(const Packet8d &a, const Packet8d &exponent)
Definition PacketMath.h:923
EIGEN_STRONG_INLINE Packet16bf pfloor< Packet16bf >(const Packet16bf &a)
Definition PacketMath.h:2027
EIGEN_STRONG_INLINE void pstore< float >(float *to, const Packet4f &from)
Definition PacketMath.h:491
__m512i Packet16i
Definition PacketMath.h:32
EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f &a)
Definition PacketMath.h:1176
EIGEN_STRONG_INLINE void pstore1< Packet8d >(double *to, const double &a)
Definition PacketMath.h:848
EIGEN_STRONG_INLINE Packet8bf predux_half_dowto4< Packet16bf >(const Packet16bf &a)
Definition PacketMath.h:2114
EIGEN_STRONG_INLINE Packet16h padd< Packet16h >(const Packet16h &a, const Packet16h &b)
Definition PacketMath.h:1572
EIGEN_STRONG_INLINE Packet8f peven_mask(const Packet8f &)
Definition PacketMath.h:252
EIGEN_STRONG_INLINE Packet8i pandnot< Packet8i >(const Packet8i &a, const Packet8i &b)
Definition PacketMath.h:521
EIGEN_STRONG_INLINE bfloat16 pfirst(const Packet8bf &a)
Definition PacketMath.h:1429
EIGEN_STRONG_INLINE Packet16f pand< Packet16f >(const Packet16f &a, const Packet16f &b)
Definition PacketMath.h:557
EIGEN_STRONG_INLINE Packet16h pgather< Eigen::half, Packet16h >(const Eigen::half *from, Index stride)
Definition PacketMath.h:1637
__m256i Packet8i
Definition PacketMath.h:32
EIGEN_STRONG_INLINE __m256i Pack32To16(Packet16f rf)
Definition PacketMath.h:454
EIGEN_DEVICE_FUNC void pstore(Scalar *to, const Packet &from)
Definition GenericPacketMath.h:696
EIGEN_STRONG_INLINE Packet16h plset< Packet16h >(const half &a)
Definition PacketMath.h:1508
EIGEN_STRONG_INLINE Packet16h pset1< Packet16h >(const Eigen::half &from)
Definition PacketMath.h:1378
EIGEN_STRONG_INLINE double predux_mul< Packet8d >(const Packet8d &a)
Definition PacketMath.h:1048
EIGEN_DEVICE_FUNC void pscatter< double, Packet8d >(double *to, const Packet8d &from, Index stride)
Definition PacketMath.h:833
EIGEN_STRONG_INLINE Packet16bf print< Packet16bf >(const Packet16bf &a)
Definition PacketMath.h:2019
EIGEN_DEVICE_FUNC Packet8d pgather< double, Packet8d >(const double *from, Index stride)
Definition PacketMath.h:813
EIGEN_STRONG_INLINE Packet8d pfrexp< Packet8d >(const Packet8d &a, Packet8d &exponent)
Definition PacketMath.h:915
EIGEN_STRONG_INLINE Eigen::half predux_max< Packet16h >(const Packet16h &a)
Definition PacketMath.h:1612
EIGEN_STRONG_INLINE Packet2cf pcmp_eq(const Packet2cf &a, const Packet2cf &b)
Definition Complex.h:231
EIGEN_STRONG_INLINE bfloat16 pfirst< Packet16bf >(const Packet16bf &from)
Definition PacketMath.h:1888
EIGEN_STRONG_INLINE bfloat16 predux< Packet16bf >(const Packet16bf &p)
Definition PacketMath.h:2121
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Packet pldexp_generic(const Packet &a, const Packet &exponent)
Definition GenericPacketMathFunctions.h:85
EIGEN_STRONG_INLINE void pstoreu< int >(int *to, const Packet4i &from)
Definition PacketMath.h:1092
EIGEN_STRONG_INLINE Packet8h pand(const Packet8h &a, const Packet8h &b)
Definition PacketMath.h:1050
EIGEN_STRONG_INLINE Packet16bf pround< Packet16bf >(const Packet16bf &a)
Definition PacketMath.h:2014
EIGEN_STRONG_INLINE Packet8h pxor(const Packet8h &a, const Packet8h &b)
Definition PacketMath.h:1047
EIGEN_DEVICE_FUNC Packet16f pgather< float, Packet16f >(const float *from, Index stride)
Definition PacketMath.h:803
EIGEN_STRONG_INLINE half predux< Packet16h >(const Packet16h &from)
Definition PacketMath.h:1600
eigen_packet_wrapper< __m256i, 1 > Packet16h
Definition PacketMath.h:34
EIGEN_STRONG_INLINE Packet16h print< Packet16h >(const Packet16h &a)
Definition PacketMath.h:1535
EIGEN_STRONG_INLINE void pscatter< bfloat16, Packet16bf >(bfloat16 *to, const Packet16bf &from, Index stride)
Definition PacketMath.h:2163
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Packet pfrexp_generic(const Packet &a, Packet &exponent)
Definition GenericPacketMathFunctions.h:40
EIGEN_DEVICE_FUNC Packet psub(const Packet &a, const Packet &b)
Definition GenericPacketMath.h:222
EIGEN_STRONG_INLINE Packet16i pload< Packet16i >(const int *from)
Definition PacketMath.h:687
EIGEN_STRONG_INLINE Packet16bf pload< Packet16bf >(const bfloat16 *from)
Definition PacketMath.h:1895
EIGEN_STRONG_INLINE Packet16f pmin< PropagateNaN, Packet16f >(const Packet16f &a, const Packet16f &b)
Definition PacketMath.h:414
EIGEN_STRONG_INLINE double pfirst< Packet8d >(const Packet8d &a)
Definition PacketMath.h:867
EIGEN_STRONG_INLINE Packet8d pand< Packet8d >(const Packet8d &a, const Packet8d &b)
Definition PacketMath.h:566
EIGEN_STRONG_INLINE void pstore< half >(Eigen::half *to, const Packet16h &from)
Definition PacketMath.h:1394
EIGEN_STRONG_INLINE Packet16f pfloor< Packet16f >(const Packet16f &a)
Definition PacketMath.h:532
EIGEN_STRONG_INLINE Packet8i ptrue< Packet8i >(const Packet8i &a)
Definition PacketMath.h:459
EIGEN_STRONG_INLINE Packet8d pset1< Packet8d >(const double &from)
Definition PacketMath.h:201
EIGEN_STRONG_INLINE float predux_min< Packet16f >(const Packet16f &a)
Definition PacketMath.h:1057
EIGEN_STRONG_INLINE Packet16h pmul< Packet16h >(const Packet16h &a, const Packet16h &b)
Definition PacketMath.h:1586
EIGEN_STRONG_INLINE Packet16bf pdiv< Packet16bf >(const Packet16bf &a, const Packet16bf &b)
Definition PacketMath.h:2091
EIGEN_STRONG_INLINE Packet16f ploadquad< Packet16f >(const float *from)
Definition PacketMath.h:752
EIGEN_STRONG_INLINE Packet8d pround< Packet8d >(const Packet8d &a)
Definition PacketMath.h:658
EIGEN_STRONG_INLINE Packet16f pmin< Packet16f >(const Packet16f &a, const Packet16f &b)
Definition PacketMath.h:371
EIGEN_STRONG_INLINE Packet8h padd< Packet8h >(const Packet8h &a, const Packet8h &b)
Definition PacketMath.h:1100
EIGEN_STRONG_INLINE Packet16h pdiv< Packet16h >(const Packet16h &a, const Packet16h &b)
Definition PacketMath.h:1593
__m256 Packet8f
Definition PacketMath.h:31
EIGEN_STRONG_INLINE Packet16h pmax< Packet16h >(const Packet16h &a, const Packet16h &b)
Definition PacketMath.h:1502
EIGEN_STRONG_INLINE Packet8d pxor< Packet8d >(const Packet8d &a, const Packet8d &b)
Definition PacketMath.h:621
EIGEN_STRONG_INLINE Packet8d pceil< Packet8d >(const Packet8d &a)
Definition PacketMath.h:530
EIGEN_STRONG_INLINE Packet8bf F32ToBf16(Packet4f p4f)
Definition PacketMath.h:1252
EIGEN_STRONG_INLINE Packet8d pmin< PropagateNumbers, Packet8d >(const Packet8d &a, const Packet8d &b)
Definition PacketMath.h:402
EIGEN_STRONG_INLINE void pstoreu< float >(float *to, const Packet4f &from)
Definition PacketMath.h:1088
EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan(const Packet4f &a, const Packet4f &b)
Definition PacketMath.h:870
EIGEN_STRONG_INLINE Packet8i pand< Packet8i >(const Packet8i &a, const Packet8i &b)
Definition PacketMath.h:491
EIGEN_STRONG_INLINE Packet16f pmax< PropagateNaN, Packet16f >(const Packet16f &a, const Packet16f &b)
Definition PacketMath.h:422
EIGEN_STRONG_INLINE Packet16bf padd< Packet16bf >(const Packet16bf &a, const Packet16bf &b)
Definition PacketMath.h:2073
EIGEN_STRONG_INLINE void prefetch< int >(const int *addr)
Definition PacketMath.h:1118
__m256d Packet4d
Definition PacketMath.h:33
EIGEN_STRONG_INLINE Packet16h ploaddup< Packet16h >(const Eigen::half *from)
Definition PacketMath.h:1407
EIGEN_STRONG_INLINE Packet16bf pmin< Packet16bf >(const Packet16bf &a, const Packet16bf &b)
Definition PacketMath.h:2097
EIGEN_STRONG_INLINE bfloat16 predux_mul< Packet16bf >(const Packet16bf &from)
Definition PacketMath.h:2126
EIGEN_STRONG_INLINE Packet8d pmin< Packet8d >(const Packet8d &a, const Packet8d &b)
Definition PacketMath.h:377
__m512 Packet16f
Definition PacketMath.h:31
EIGEN_STRONG_INLINE Packet8d pmax< PropagateNaN, Packet8d >(const Packet8d &a, const Packet8d &b)
Definition PacketMath.h:426
EIGEN_STRONG_INLINE Packet16h psub< Packet16h >(const Packet16h &a, const Packet16h &b)
Definition PacketMath.h:1579
EIGEN_STRONG_INLINE void pstore1< Packet16i >(int *to, const int &a)
Definition PacketMath.h:853
EIGEN_STRONG_INLINE float pfirst< Packet16f >(const Packet16f &a)
Definition PacketMath.h:863
EIGEN_STRONG_INLINE void prefetch< double >(const double *addr)
Definition PacketMath.h:692
EIGEN_STRONG_INLINE Packet16h pmin< Packet16h >(const Packet16h &a, const Packet16h &b)
Definition PacketMath.h:1496
EIGEN_STRONG_INLINE Packet8d padd< Packet8d >(const Packet8d &a, const Packet8d &b)
Definition PacketMath.h:264
::uint64_t uint64_t
Definition Meta.h:58
::uint16_t uint16_t
Definition Meta.h:54
::uint32_t uint32_t
Definition Meta.h:56
Namespace containing all symbols from the Eigen library.
Definition bench_norm.cpp:85
EIGEN_DEFAULT_DENSE_INDEX_TYPE Index
The Index type as used for the API.
Definition Meta.h:74
Definition BandTriangularSolver.h:13
unsigned short value
Definition BFloat16.h:36
Definition BFloat16.h:58
Definition Half.h:142
Definition GenericPacketMath.h:43
@ HasRsqrt
Definition GenericPacketMath.h:67
@ HasSin
Definition GenericPacketMath.h:75
@ HasBlend
Definition GenericPacketMath.h:60
@ HasNdtri
Definition GenericPacketMath.h:90
@ HasCos
Definition GenericPacketMath.h:76
@ HasCmp
Definition GenericPacketMath.h:63
@ HasLog1p
Definition GenericPacketMath.h:71
@ HasCeil
Definition GenericPacketMath.h:101
@ HasExp
Definition GenericPacketMath.h:68
@ HasRound
Definition GenericPacketMath.h:98
@ HasRint
Definition GenericPacketMath.h:99
@ HasSqrt
Definition GenericPacketMath.h:66
@ HasErf
Definition GenericPacketMath.h:88
@ HasBessel
Definition GenericPacketMath.h:91
@ HasExpm1
Definition GenericPacketMath.h:69
@ HasLog
Definition GenericPacketMath.h:70
@ HasTanh
Definition GenericPacketMath.h:83
@ HasFloor
Definition GenericPacketMath.h:100
@ HasDiv
Definition GenericPacketMath.h:65
Definition GenericPacketMath.h:160
Definition Meta.h:133
@ value
Definition Meta.h:133
Packet8bf half
Definition PacketMath.h:1845
Packet16bf type
Definition PacketMath.h:1844
Packet4d half
Definition PacketMath.h:135
Packet8d type
Definition PacketMath.h:134
Packet8f half
Definition PacketMath.h:98
Packet16f type
Definition PacketMath.h:97
Packet16h half
Definition PacketMath.h:56
Packet16h type
Definition PacketMath.h:54
Definition GenericPacketMath.h:107
@ HasSub
Definition GenericPacketMath.h:118
@ HasMax
Definition GenericPacketMath.h:124
@ HasNegate
Definition GenericPacketMath.h:120
@ HasMul
Definition GenericPacketMath.h:119
@ HasAdd
Definition GenericPacketMath.h:117
@ HasSetLinear
Definition GenericPacketMath.h:126
@ HasMin
Definition GenericPacketMath.h:123
@ HasConj
Definition GenericPacketMath.h:125
@ HasAbs2
Definition GenericPacketMath.h:122
@ HasAbs
Definition GenericPacketMath.h:121
@ HasHalfPacket
Definition GenericPacketMath.h:114
@ size
Definition GenericPacketMath.h:112
@ AlignedOnScalar
Definition GenericPacketMath.h:113
@ Vectorizable
Definition GenericPacketMath.h:111
Definition ForwardDeclarations.h:17
Packet8bf half
Definition PacketMath.h:1879
bfloat16 type
Definition PacketMath.h:1877
uint16_t mask_t
Definition PacketMath.h:173
float type
Definition PacketMath.h:170
Packet8f half
Definition PacketMath.h:171
Packet16i integer_packet
Definition PacketMath.h:172
Packet8h half
Definition PacketMath.h:192
Eigen::half type
Definition PacketMath.h:191
Packet8i half
Definition PacketMath.h:185
int type
Definition PacketMath.h:184
Packet4d half
Definition PacketMath.h:179
double type
Definition PacketMath.h:178
Definition GenericPacketMath.h:133
@ masked_load_available
Definition GenericPacketMath.h:141
@ size
Definition GenericPacketMath.h:138
@ masked_store_available
Definition GenericPacketMath.h:142
@ vectorizable
Definition GenericPacketMath.h:140
@ alignment
Definition GenericPacketMath.h:139
std::ofstream out("Result.txt")
std::ptrdiff_t j
Definition tut_arithmetic_redux_minmax.cpp:2