GeneralBlockPanelKernel.h
1// This file is part of Eigen, a lightweight C++ template library
2// for linear algebra.
3//
4// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
5//
6// This Source Code Form is subject to the terms of the Mozilla
7// Public License v. 2.0. If a copy of the MPL was not distributed
8// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
9
10#ifndef EIGEN_GENERAL_BLOCK_PANEL_H
11#define EIGEN_GENERAL_BLOCK_PANEL_H
12
13
14namespace Eigen {
15
16namespace internal {
17
18enum GEBPPacketSizeType {
19  GEBPPacketFull = 0,
20  GEBPPacketHalf,
21  GEBPPacketQuarter
22};
23
24template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs=false, bool _ConjRhs=false, int Arch=Architecture::Target, int _PacketSize=GEBPPacketFull>
25class gebp_traits;
26
27
28/** \internal \returns b if a<=0, and returns a otherwise. */
29inline std::ptrdiff_t manage_caching_sizes_helper(std::ptrdiff_t a, std::ptrdiff_t b)
30{
31 return a<=0 ? b : a;
32}
33
34#if defined(EIGEN_DEFAULT_L1_CACHE_SIZE)
35#define EIGEN_SET_DEFAULT_L1_CACHE_SIZE(val) EIGEN_DEFAULT_L1_CACHE_SIZE
36#else
37#define EIGEN_SET_DEFAULT_L1_CACHE_SIZE(val) val
38#endif // defined(EIGEN_DEFAULT_L1_CACHE_SIZE)
39
40#if defined(EIGEN_DEFAULT_L2_CACHE_SIZE)
41#define EIGEN_SET_DEFAULT_L2_CACHE_SIZE(val) EIGEN_DEFAULT_L2_CACHE_SIZE
42#else
43#define EIGEN_SET_DEFAULT_L2_CACHE_SIZE(val) val
44#endif // defined(EIGEN_DEFAULT_L2_CACHE_SIZE)
45
46#if defined(EIGEN_DEFAULT_L3_CACHE_SIZE)
47#define EIGEN_SET_DEFAULT_L3_CACHE_SIZE(val) EIGEN_DEFAULT_L3_CACHE_SIZE
48#else
49#define EIGEN_SET_DEFAULT_L3_CACHE_SIZE(val) val
50#endif // defined(EIGEN_DEFAULT_L3_CACHE_SIZE)
51
52#if EIGEN_ARCH_i386_OR_x86_64
53const std::ptrdiff_t defaultL1CacheSize = EIGEN_SET_DEFAULT_L1_CACHE_SIZE(32*1024);
54const std::ptrdiff_t defaultL2CacheSize = EIGEN_SET_DEFAULT_L2_CACHE_SIZE(256*1024);
55const std::ptrdiff_t defaultL3CacheSize = EIGEN_SET_DEFAULT_L3_CACHE_SIZE(2*1024*1024);
56#elif EIGEN_ARCH_PPC
57const std::ptrdiff_t defaultL1CacheSize = EIGEN_SET_DEFAULT_L1_CACHE_SIZE(64*1024);
58const std::ptrdiff_t defaultL2CacheSize = EIGEN_SET_DEFAULT_L2_CACHE_SIZE(512*1024);
59const std::ptrdiff_t defaultL3CacheSize = EIGEN_SET_DEFAULT_L3_CACHE_SIZE(4*1024*1024);
60#else
61const std::ptrdiff_t defaultL1CacheSize = EIGEN_SET_DEFAULT_L1_CACHE_SIZE(16*1024);
62const std::ptrdiff_t defaultL2CacheSize = EIGEN_SET_DEFAULT_L2_CACHE_SIZE(512*1024);
63const std::ptrdiff_t defaultL3CacheSize = EIGEN_SET_DEFAULT_L3_CACHE_SIZE(512*1024);
64#endif
65
66#undef EIGEN_SET_DEFAULT_L1_CACHE_SIZE
67#undef EIGEN_SET_DEFAULT_L2_CACHE_SIZE
68#undef EIGEN_SET_DEFAULT_L3_CACHE_SIZE
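// Editorial note (not part of the original file): as the #if defined(...) tests
// above show, the defaults can be overridden at compile time by defining the
// corresponding macro before any Eigen header is included, e.g. for a CPU with
// a 48KB L1 and a 1MB L2:
//
//   #define EIGEN_DEFAULT_L1_CACHE_SIZE (48*1024)
//   #define EIGEN_DEFAULT_L2_CACHE_SIZE (1024*1024)
//   #include <Eigen/Dense>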
69
70struct CacheSizes {
71  CacheSizes(): m_l1(-1),m_l2(-1),m_l3(-1) {
72    int l1CacheSize, l2CacheSize, l3CacheSize;
73    queryCacheSizes(l1CacheSize, l2CacheSize, l3CacheSize);
74    m_l1 = manage_caching_sizes_helper(l1CacheSize, defaultL1CacheSize);
75    m_l2 = manage_caching_sizes_helper(l2CacheSize, defaultL2CacheSize);
76    m_l3 = manage_caching_sizes_helper(l3CacheSize, defaultL3CacheSize);
77  }
78
79  std::ptrdiff_t m_l1;
80  std::ptrdiff_t m_l2;
81  std::ptrdiff_t m_l3;
82};
83
84
85
86inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1, std::ptrdiff_t* l2, std::ptrdiff_t* l3)
87{
88  static CacheSizes m_cacheSizes;
89
90 if(action==SetAction)
91 {
92    // set the CPU cache sizes (in bytes), from which all block sizes are derived
93 eigen_internal_assert(l1!=0 && l2!=0);
94 m_cacheSizes.m_l1 = *l1;
95 m_cacheSizes.m_l2 = *l2;
96 m_cacheSizes.m_l3 = *l3;
97 }
98 else if(action==GetAction)
99 {
100 eigen_internal_assert(l1!=0 && l2!=0);
101 *l1 = m_cacheSizes.m_l1;
102 *l2 = m_cacheSizes.m_l2;
103 *l3 = m_cacheSizes.m_l3;
104 }
105  else
106  {
107    eigen_internal_assert(false);
108  }
109}
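
// A minimal usage sketch (hypothetical caller, not part of this file): the same
// entry point both queries and overrides the process-wide cache-size state,
// selected by the Action argument:
//
//   std::ptrdiff_t l1, l2, l3;
//   internal::manage_caching_sizes(GetAction, &l1, &l2, &l3); // read current sizes
//   l2 = 1024*1024;                                           // pretend a 1MB L2
//   internal::manage_caching_sizes(SetAction, &l1, &l2, &l3); // install overrides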
110
111/* Helper for computeProductBlockingSizes.
112 *
113 * Given a m x k times k x n matrix product of scalar types \c LhsScalar and \c RhsScalar,
114 * this function computes the blocking size parameters along the respective dimensions
115 * for matrix products and related algorithms. The blocking sizes depend on various
116 * parameters:
117 * - the L1 and L2 cache sizes,
118 * - the register level blocking sizes defined by gebp_traits,
119 * - the number of scalars that fit into a packet (when vectorization is enabled).
120 *
121 * \sa setCpuCacheSizes */
122
123template<typename LhsScalar, typename RhsScalar, int KcFactor, typename Index>
124void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index num_threads = 1)
125{
126  typedef gebp_traits<LhsScalar,RhsScalar> Traits;
127
128 // Explanations:
129 // Let's recall that the product algorithms form mc x kc vertical panels A' on the lhs and
130 // kc x nc blocks B' on the rhs. B' has to fit into L2/L3 cache. Moreover, A' is processed
131 // per mr x kc horizontal small panels where mr is the blocking size along the m dimension
132 // at the register level. This small horizontal panel has to stay within L1 cache.
133  std::ptrdiff_t l1, l2, l3;
134  manage_caching_sizes(GetAction, &l1, &l2, &l3);
135 #ifdef EIGEN_VECTORIZE_AVX512
136  // We still need to find a proper rationale for this adjustment, but without it,
137  // performance with AVX512 is pretty bad: roughly 20% slower.
138 // One reason is that with increasing packet-size, the blocking size k
139 // has to become pretty small if we want that 1 lhs panel fit within L1.
140 // For instance, with the 3pX4 kernel and double, the size of the lhs+rhs panels are:
141 // k*(3*64 + 4*8) Bytes, with l1=32kBytes, and k%8=0, we have k=144.
142 // This is quite small for a good reuse of the accumulation registers.
143 l1 *= 4;
144 #endif
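  // Worked check of the numbers quoted above (illustrative): with double and
  // the 3pX4 kernel on AVX512, one k-slice of the lhs panel costs 3*64 bytes
  // and the rhs slice 4*8 bytes, i.e. 224 bytes total. With l1 = 32*1024 we
  // get 32768/224 ~= 146, which rounded down to a multiple of 8 gives k = 144.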
145
146 if (num_threads > 1) {
147 typedef typename Traits::ResScalar ResScalar;
148 enum {
149 kdiv = KcFactor * (Traits::mr * sizeof(LhsScalar) + Traits::nr * sizeof(RhsScalar)),
150 ksub = Traits::mr * Traits::nr * sizeof(ResScalar),
151 kr = 8,
152 mr = Traits::mr,
153 nr = Traits::nr
154 };
155 // Increasing k gives us more time to prefetch the content of the "C"
156 // registers. However once the latency is hidden there is no point in
157 // increasing the value of k, so we'll cap it at 320 (value determined
158 // experimentally).
159    // To prevent k from vanishing, we make k_cache at least as big as kr
160 const Index k_cache = numext::maxi<Index>(kr, (numext::mini<Index>)((l1-ksub)/kdiv, 320));
161    if (k_cache < k) {
162      k = k_cache - (k_cache % kr);
163      eigen_internal_assert(k > 0);
164    }
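    // Worked example (illustrative): for double with mr==12 and nr==4 we get
    // kdiv == 12*8 + 4*8 == 128 and ksub == 12*4*8 == 384, so with l1 == 32768:
    // k_cache == max(8, min((32768-384)/128, 320)) == 253, and a larger k is
    // reduced to 253 - 253%8 == 248.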
165
166    const Index n_cache = (l2-l1) / (nr * sizeof(RhsScalar) * k);
167    const Index n_per_thread = numext::div_ceil(n, num_threads);
168    if (n_cache <= n_per_thread) {
169 // Don't exceed the capacity of the l2 cache.
170 eigen_internal_assert(n_cache >= static_cast<Index>(nr));
171      n = n_cache - (n_cache % nr);
172      eigen_internal_assert(n > 0);
173 } else {
174 n = (numext::mini<Index>)(n, (n_per_thread + nr - 1) - ((n_per_thread + nr - 1) % nr));
175 }
176
177 if (l3 > l2) {
178 // l3 is shared between all cores, so we'll give each thread its own chunk of l3.
179      const Index m_cache = (l3-l2) / (sizeof(LhsScalar) * k * num_threads);
180      const Index m_per_thread = numext::div_ceil(m, num_threads);
181      if(m_cache < m_per_thread && m_cache >= static_cast<Index>(mr)) {
182        m = m_cache - (m_cache % mr);
183        eigen_internal_assert(m > 0);
184 } else {
185 m = (numext::mini<Index>)(m, (m_per_thread + mr - 1) - ((m_per_thread + mr - 1) % mr));
186 }
187 }
188 }
189 else {
190    // In unit tests we do not want to use extra large matrices,
191    // so we reduce the cache sizes to check that the blocking strategy is not flawed
192#ifdef EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS
193 l1 = 9*1024;
194 l2 = 32*1024;
195 l3 = 512*1024;
196#endif
197
198    // Early return for small problems because the computations below are time-consuming.
199    // Perhaps it would make more sense to consider k*n*m?
200    // Note that for very tiny problems, this function should be bypassed anyway
201 // because we use the coefficient-based implementation for them.
202 if((numext::maxi)(k,(numext::maxi)(m,n))<48)
203 return;
204
205 typedef typename Traits::ResScalar ResScalar;
206 enum {
207 k_peeling = 8,
208 k_div = KcFactor * (Traits::mr * sizeof(LhsScalar) + Traits::nr * sizeof(RhsScalar)),
209 k_sub = Traits::mr * Traits::nr * sizeof(ResScalar)
210 };
211
212 // ---- 1st level of blocking on L1, yields kc ----
213
214    // Blocking on the third dimension (i.e., k) is chosen so that a horizontal panel
215    // of size mr x kc of the lhs plus a vertical panel of kc x nr of the rhs both fit within L1 cache.
216    // We also include a register-level block of the result (mr x nr).
217 // (In an ideal world only the lhs panel would stay in L1)
218 // Moreover, kc has to be a multiple of 8 to be compatible with loop peeling, leading to a maximum blocking size of:
219 const Index max_kc = numext::maxi<Index>(((l1-k_sub)/k_div) & (~(k_peeling-1)),1);
220 const Index old_k = k;
221 if(k>max_kc)
222 {
223 // We are really blocking on the third dimension:
224 // -> reduce blocking size to make sure the last block is as large as possible
225 // while keeping the same number of sweeps over the result.
226 k = (k%max_kc)==0 ? max_kc
227 : max_kc - k_peeling * ((max_kc-1-(k%max_kc))/(k_peeling*(k/max_kc+1)));
228
229 eigen_internal_assert(((old_k/k) == (old_k/max_kc)) && "the number of sweeps has to remain the same");
230 }
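    // Worked example (illustrative): k == 1000 and max_kc == 248 give
    // k%max_kc == 8, so k becomes 248 - 8*((247-8)/(8*5)) == 208; the number
    // of sweeps is unchanged (1000/208 == 1000/248 == 4) while the last block
    // grows from 8 to 168 depth values.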
231
232 // ---- 2nd level of blocking on max(L2,L3), yields nc ----
233
234 // TODO find a reliable way to get the actual amount of cache per core to use for 2nd level blocking, that is:
235 // actual_l2 = max(l2, l3/nb_core_sharing_l3)
236    // The number below is quite conservative: it is better to underestimate the cache size rather than overestimate it.
237 // For instance, it corresponds to 6MB of L3 shared among 4 cores.
238 #ifdef EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS
239 const Index actual_l2 = l3;
240 #else
241 const Index actual_l2 = 1572864; // == 1.5 MB
242 #endif
243
244    // Here, nc is chosen such that a block of kc x nc of the rhs fits within half of L2.
245    // The second half is implicitly reserved to access the result and lhs coefficients.
246    // When k<max_kc, then nc can grow arbitrarily. In practice, it seems fruitful
247    // to limit this growth: we bound the growth of nc to a factor of 1.5.
248    // However, if the entire lhs block fits within L1, then we are not going to block on the rows at all,
249    // and it becomes fruitful to keep the packed rhs blocks in L1 if there is enough remaining space.
250    Index max_nc;
251    const Index lhs_bytes = m * k * sizeof(LhsScalar);
252    const Index remaining_l1 = l1 - k_sub - lhs_bytes;
253    if(remaining_l1 >= Index(Traits::nr*sizeof(RhsScalar))*k)
254 {
255 // L1 blocking
256 max_nc = remaining_l1 / (k*sizeof(RhsScalar));
257 }
258 else
259 {
260 // L2 blocking
261 max_nc = (3*actual_l2)/(2*2*max_kc*sizeof(RhsScalar));
262 }
263 // WARNING Below, we assume that Traits::nr is a power of two.
264 Index nc = numext::mini<Index>(actual_l2/(2*k*sizeof(RhsScalar)), max_nc) & (~(Traits::nr-1));
265 if(n>nc)
266 {
267 // We are really blocking over the columns:
268 // -> reduce blocking size to make sure the last block is as large as possible
269 // while keeping the same number of sweeps over the packed lhs.
270 // Here we allow one more sweep if this gives us a perfect match, thus the commented "-1"
271 n = (n%nc)==0 ? nc
272 : (nc - Traits::nr * ((nc/*-1*/-(n%nc))/(Traits::nr*(n/nc+1))));
273 }
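    // Worked example (illustrative): n == 1000 and nc == 256 give n%nc == 232,
    // so n becomes 256 - 4*((256-232)/(4*4)) == 252; both choices need 4 sweeps
    // over the packed lhs, but the last block grows from 232 to 244 columns.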
274 else if(old_k==k)
275 {
276 // So far, no blocking at all, i.e., kc==k, and nc==n.
277 // In this case, let's perform a blocking over the rows such that the packed lhs data is kept in cache L1/L2
278 // TODO: part of this blocking strategy is now implemented within the kernel itself, so the L1-based heuristic here should be obsolete.
279      Index problem_size = k*n*sizeof(LhsScalar);
280      Index actual_lm = l2;
281      Index max_mc = m;
282 if(problem_size<=1024)
283 {
284 // problem is small enough to keep in L1
285 // Let's choose m such that lhs's block fit in 1/3 of L1
286 actual_lm = l1;
287 }
288 else if(l3!=0 && problem_size<=32768)
289 {
290 // we have both L2 and L3, and problem is small enough to be kept in L2
291 // Let's choose m such that lhs's block fit in 1/3 of L2
292 actual_lm = l2;
293 max_mc = (numext::mini<Index>)(576,max_mc);
294 }
295 Index mc = (numext::mini<Index>)(actual_lm/(3*k*sizeof(LhsScalar)), max_mc);
296 if (mc > Traits::mr) mc -= mc % Traits::mr;
297 else if (mc==0) return;
298 m = (m%mc)==0 ? mc
299 : (mc - Traits::mr * ((mc/*-1*/-(m%mc))/(Traits::mr*(m/mc+1))));
300 }
301 }
302}
303
304template <typename Index>
305inline bool useSpecificBlockingSizes(Index& k, Index& m, Index& n)
306{
307#ifdef EIGEN_TEST_SPECIFIC_BLOCKING_SIZES
308  if (EIGEN_TEST_SPECIFIC_BLOCKING_SIZES) {
309    k = numext::mini<Index>(k, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_K);
310    m = numext::mini<Index>(m, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M);
311    n = numext::mini<Index>(n, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N);
312    return true;
313  }
314#else
315  EIGEN_UNUSED_VARIABLE(k)
316  EIGEN_UNUSED_VARIABLE(m)
317  EIGEN_UNUSED_VARIABLE(n)
318#endif
319  return false;
320}
321
338template<typename LhsScalar, typename RhsScalar, int KcFactor, typename Index>
339void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads = 1)
340{
341  if (!useSpecificBlockingSizes(k, m, n)) {
342    evaluateProductBlockingSizesHeuristic<LhsScalar, RhsScalar, KcFactor, Index>(k, m, n, num_threads);
343  }
344}
345
346template<typename LhsScalar, typename RhsScalar, typename Index>
347void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads = 1)
348{
349  computeProductBlockingSizes<LhsScalar,RhsScalar,1,Index>(k, m, n, num_threads);
350}
351
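// A usage sketch (hypothetical caller, not part of this file): clamp the
// blocking sizes in place before packing a 2000x500 by 500x3000 product of
// doubles; kc/mc/nc then hold cache-aware panel sizes for the loops below.
//
//   Eigen::Index kc = 500, mc = 2000, nc = 3000;
//   internal::computeProductBlockingSizes<double,double>(kc, mc, nc, /*num_threads=*/1);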
352template <typename RhsPacket, typename RhsPacketx4, int registers_taken>
353struct RhsPanelHelper {
354 private:
355  static const int remaining_registers = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS - registers_taken;
356 public:
357  typedef typename conditional<(remaining_registers>=4), RhsPacketx4, RhsPacket>::type type;
358};
359
360template <typename Packet>
361struct QuadPacket
362{
363  Packet B_0, B1, B2, B3;
364  const Packet& get(const FixedInt<0>&) const { return B_0; }
365 const Packet& get(const FixedInt<1>&) const { return B1; }
366 const Packet& get(const FixedInt<2>&) const { return B2; }
367 const Packet& get(const FixedInt<3>&) const { return B3; }
368};
369
370template <int N, typename T1, typename T2, typename T3>
371struct packet_conditional { typedef T3 type; };
372
373template <typename T1, typename T2, typename T3>
374struct packet_conditional<GEBPPacketFull, T1, T2, T3> { typedef T1 type; };
375
376template <typename T1, typename T2, typename T3>
377struct packet_conditional<GEBPPacketHalf, T1, T2, T3> { typedef T2 type; };
378
379#define PACKET_DECL_COND_PREFIX(prefix, name, packet_size) \
380 typedef typename packet_conditional<packet_size, \
381 typename packet_traits<name ## Scalar>::type, \
382 typename packet_traits<name ## Scalar>::half, \
383 typename unpacket_traits<typename packet_traits<name ## Scalar>::half>::half>::type \
384 prefix ## name ## Packet
385
386#define PACKET_DECL_COND(name, packet_size) \
387 typedef typename packet_conditional<packet_size, \
388 typename packet_traits<name ## Scalar>::type, \
389 typename packet_traits<name ## Scalar>::half, \
390 typename unpacket_traits<typename packet_traits<name ## Scalar>::half>::half>::type \
391 name ## Packet
392
393#define PACKET_DECL_COND_SCALAR_PREFIX(prefix, packet_size) \
394 typedef typename packet_conditional<packet_size, \
395 typename packet_traits<Scalar>::type, \
396 typename packet_traits<Scalar>::half, \
397 typename unpacket_traits<typename packet_traits<Scalar>::half>::half>::type \
398 prefix ## ScalarPacket
399
400#define PACKET_DECL_COND_SCALAR(packet_size) \
401 typedef typename packet_conditional<packet_size, \
402 typename packet_traits<Scalar>::type, \
403 typename packet_traits<Scalar>::half, \
404 typename unpacket_traits<typename packet_traits<Scalar>::half>::half>::type \
405 ScalarPacket
406
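// For reference (editorial note), PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize)
// mechanically expands to a typedef that selects the full, half, or quarter
// packet for LhsScalar:
//
//   typedef typename packet_conditional<_PacketSize,
//       typename packet_traits<LhsScalar>::type,
//       typename packet_traits<LhsScalar>::half,
//       typename unpacket_traits<typename packet_traits<LhsScalar>::half>::half>::type
//     _LhsPacket;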
407/* Vectorization logic
408 * real*real: unpack rhs to constant packets, ...
409 *
410 * cd*cd : unpack rhs to (b_r,b_r), (b_i,b_i), mul to get (a_r b_r,a_i b_r) (a_r b_i,a_i b_i),
411 * storing each res packet into two packets (2x2),
412 * at the end combine them: swap the second and addsub them
413 * cf*cf : same but with 2x4 blocks
414 * cplx*real : unpack rhs to constant packets, ...
415 * real*cplx : load lhs as (a0,a0,a1,a1), and mul as usual
416 */
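// To make the cd*cd scheme above concrete with scalars (illustration only):
// for a = (a_r,a_i) and b = (b_r,b_i), the two partial packets hold
//   first  = a * b_r = (a_r*b_r, a_i*b_r)
//   second = a * b_i = (a_r*b_i, a_i*b_i)
// and the final combine swaps second and add/subs:
//   (a_r*b_r - a_i*b_i, a_i*b_r + a_r*b_i), i.e. exactly the complex product.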
417template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs, bool _ConjRhs, int Arch, int _PacketSize>
419{
420public:
424
428
429 enum {
436
438
439 // register block size along the N direction must be 1 or 4
440 nr = 4,
441
442 // register block size along the M direction (currently, this one cannot be modified)
445 && ((!EIGEN_COMP_MSVC) || (EIGEN_COMP_MSVC>=1914))
446 // we assume 16 registers or more
447 // See bug 992, if the scalar type is not vectorizable but that EIGEN_HAS_SINGLE_INSTRUCTION_MADD is defined,
448 // then using 3*LhsPacketSize triggers non-implemented paths in syrk.
449 // Bug 1515: MSVC prior to v19.14 yields to register spilling.
451#else
453#endif
454
456 RhsProgress = 1
457 };
458
459
464
467
472
473 template<typename RhsPacketType>
478
480 {
481 pbroadcast4(b, dest.B_0, dest.B1, dest.B2, dest.B3);
482 }
483
484 template<typename RhsPacketType>
486 {
487 loadRhs(b, dest);
488 }
489
491 {
492 }
493
498
499 template<typename LhsPacketType>
504
505 template<typename LhsPacketType>
510
511 template<typename LhsPacketType, typename RhsPacketType, typename AccPacketType, typename LaneIdType>
513 {
515 // It would be a lot cleaner to call pmadd all the time. Unfortunately if we
516 // let gcc allocate the register in which to store the result of the pmul
517 // (in the case where there is no FMA) gcc fails to figure out how to avoid
518    // register spilling.
519#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
521 c = cj.pmadd(a,b,c);
522#else
523 tmp = b; tmp = cj.pmul(a,tmp); c = padd(c,tmp);
524#endif
525 }
526
527 template<typename LhsPacketType, typename AccPacketType, typename LaneIdType>
529 {
530 madd(a, b.get(lane), c, tmp, lane);
531 }
532
534 {
535 r = pmadd(c,alpha,r);
536 }
537
538 template<typename ResPacketHalf>
539 EIGEN_STRONG_INLINE void acc(const ResPacketHalf& c, const ResPacketHalf& alpha, ResPacketHalf& r) const
540 {
541 r = pmadd(c,alpha,r);
542 }
543
544};
545
546template<typename RealScalar, bool _ConjLhs, int Arch, int _PacketSize>
548{
549public:
550 typedef std::complex<RealScalar> LhsScalar;
553
557
558 enum {
559 ConjLhs = _ConjLhs,
560 ConjRhs = false,
562 LhsPacketSize = Vectorizable ? unpacket_traits<_LhsPacket>::size : 1,
563 RhsPacketSize = Vectorizable ? unpacket_traits<_RhsPacket>::size : 1,
564 ResPacketSize = Vectorizable ? unpacket_traits<_ResPacket>::size : 1,
565
567 nr = 4,
568#if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && !defined(EIGEN_VECTORIZE_VSX)
569 // we assume 16 registers
570 mr = 3*LhsPacketSize,
571#else
572 mr = (EIGEN_PLAIN_ENUM_MIN(16,NumberOfRegisters)/2/nr)*LhsPacketSize,
573#endif
574
575 LhsProgress = LhsPacketSize,
576 RhsProgress = 1
577 };
578
583
585
587
592
593 template<typename RhsPacketType>
598
600 {
601 pbroadcast4(b, dest.B_0, dest.B1, dest.B2, dest.B3);
602 }
603
604 template<typename RhsPacketType>
606 {
607 loadRhs(b, dest);
608 }
609
612
617
619 {
620 // FIXME we can do better!
621 // what we want here is a ploadheight
622 RhsScalar tmp[4] = {b[0],b[0],b[1],b[1]};
624 }
625
627 {
628 eigen_internal_assert(RhsPacketSize<=8);
630 }
631
633 {
635 }
636
637 template<typename LhsPacketType>
642
643 template <typename LhsPacketType, typename RhsPacketType, typename AccPacketType, typename LaneIdType>
648
649 template <typename LhsPacketType, typename RhsPacketType, typename AccPacketType>
651 {
652#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
654 c.v = pmadd(a.v,b,c.v);
655#else
656 tmp = b; tmp = pmul(a.v,tmp); c.v = padd(c.v,tmp);
657#endif
658 }
659
660 EIGEN_STRONG_INLINE void madd_impl(const LhsScalar& a, const RhsScalar& b, ResScalar& c, RhsScalar& /*tmp*/, const false_type&) const
661 {
662 c += a * b;
663 }
664
665 template<typename LhsPacketType, typename AccPacketType, typename LaneIdType>
667 {
668 madd(a, b.get(lane), c, tmp, lane);
669 }
670
671 template <typename ResPacketType, typename AccPacketType>
677
678protected:
679};
680
681template<typename Packet>
682struct DoublePacket
683{
684  Packet first;
685  Packet second;
686};
687
688template<typename Packet>
690{
692 res.first = padd(a.first, b.first);
693 res.second = padd(a.second,b.second);
694 return res;
695}
696
697// note that for DoublePacket<RealPacket> the "4" in "downto4"
698// corresponds to the number of complex numbers, so it means "8"
699// in terms of real coefficients.
700
701template<typename Packet>
702const DoublePacket<Packet>&
703predux_half_dowto4(const DoublePacket<Packet> &a,
704                   typename enable_if<unpacket_traits<Packet>::size<=8>::type* = 0)
705{
706  return a;
707}
708
709template<typename Packet>
710DoublePacket<typename unpacket_traits<Packet>::half>
711predux_half_dowto4(const DoublePacket<Packet> &a,
712                   typename enable_if<unpacket_traits<Packet>::size==16>::type* = 0)
713{
714  // yes, that's pretty hackish :(
715  DoublePacket<typename unpacket_traits<Packet>::half> res;
716  typedef std::complex<typename unpacket_traits<Packet>::type> Cplx;
717 typedef typename packet_traits<Cplx>::type CplxPacket;
718 res.first = predux_half_dowto4(CplxPacket(a.first)).v;
719 res.second = predux_half_dowto4(CplxPacket(a.second)).v;
720 return res;
721}
722
723// same here, "quad" actually means "8" in terms of real coefficients
724template<typename Scalar, typename RealPacket>
725void loadQuadToDoublePacket(const Scalar* b, DoublePacket<RealPacket>& dest,
726                            typename enable_if<unpacket_traits<RealPacket>::size<=8>::type* = 0)
727{
728 dest.first = pset1<RealPacket>(numext::real(*b));
729 dest.second = pset1<RealPacket>(numext::imag(*b));
730}
731
732template<typename Scalar, typename RealPacket>
733void loadQuadToDoublePacket(const Scalar* b, DoublePacket<RealPacket>& dest,
734                            typename enable_if<unpacket_traits<RealPacket>::size==16>::type* = 0)
735{
736 // yes, that's pretty hackish too :(
737 typedef typename NumTraits<Scalar>::Real RealScalar;
738 RealScalar r[4] = {numext::real(b[0]), numext::real(b[0]), numext::real(b[1]), numext::real(b[1])};
739 RealScalar i[4] = {numext::imag(b[0]), numext::imag(b[0]), numext::imag(b[1]), numext::imag(b[1])};
740 dest.first = ploadquad<RealPacket>(r);
741 dest.second = ploadquad<RealPacket>(i);
742}
743
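// Layout illustration for the size==16 overload above (editorial note): with
// r[] = {b0_r, b0_r, b1_r, b1_r}, ploadquad replicates each loaded value four
// times, so dest.first ends up as b0_r repeated 8 times followed by b1_r
// repeated 8 times (and likewise dest.second for the imaginary parts), i.e.
// "quad" indeed covers 8 complex coefficients' worth of reals.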
744
745template<typename Packet> struct unpacket_traits<DoublePacket<Packet> > {
746  typedef DoublePacket<typename unpacket_traits<Packet>::half> half;
747};
748// template<typename Packet>
749// DoublePacket<Packet> pmadd(const DoublePacket<Packet> &a, const DoublePacket<Packet> &b)
750// {
751// DoublePacket<Packet> res;
752// res.first = padd(a.first, b.first);
753// res.second = padd(a.second,b.second);
754// return res;
755// }
756
757template<typename RealScalar, bool _ConjLhs, bool _ConjRhs, int Arch, int _PacketSize>
758class gebp_traits<std::complex<RealScalar>, std::complex<RealScalar>, _ConjLhs, _ConjRhs, Arch, _PacketSize >
759{
760public:
761 typedef std::complex<RealScalar> Scalar;
762 typedef std::complex<RealScalar> LhsScalar;
763 typedef std::complex<RealScalar> RhsScalar;
764 typedef std::complex<RealScalar> ResScalar;
765
771
772 enum {
773 ConjLhs = _ConjLhs,
774 ConjRhs = _ConjRhs,
777 ResPacketSize = Vectorizable ? unpacket_traits<_ResPacket>::size : 1,
778 LhsPacketSize = Vectorizable ? unpacket_traits<_LhsPacket>::size : 1,
779 RhsPacketSize = Vectorizable ? unpacket_traits<RhsScalar>::size : 1,
780 RealPacketSize = Vectorizable ? unpacket_traits<RealPacket>::size : 1,
781
782 // FIXME: should depend on NumberOfRegisters
783 nr = 4,
784 mr = ResPacketSize,
785
786 LhsProgress = ResPacketSize,
787 RhsProgress = 1
788 };
789
791
797
798  // this actually holds 8 packets!
799  typedef QuadPacket<RhsPacket> RhsPacketx4;
800
802
808
809 // Scalar path
814
815 // Vectorized path
816 template<typename RealPacketType>
818 {
819 dest.first = pset1<RealPacketType>(numext::real(*b));
820 dest.second = pset1<RealPacketType>(numext::imag(*b));
821 }
822
824 {
825 loadRhs(b, dest.B_0);
826 loadRhs(b + 1, dest.B1);
827 loadRhs(b + 2, dest.B2);
828 loadRhs(b + 3, dest.B3);
829 }
830
831 // Scalar path
833 {
834 loadRhs(b, dest);
835 }
836
837 // Vectorized path
838 template<typename RealPacketType>
840 {
841 loadRhs(b, dest);
842 }
843
845
847 {
848 loadRhs(b,dest);
849 }
854
855 // nothing special here
857 {
859 }
860
861 template<typename LhsPacketType>
866
867 template<typename LhsPacketType, typename RhsPacketType, typename ResPacketType, typename TmpType, typename LaneIdType>
871 {
872 c.first = padd(pmul(a,b.first), c.first);
873 c.second = padd(pmul(a,b.second),c.second);
874 }
875
876 template<typename LaneIdType>
877 EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, ResPacket& c, RhsPacket& /*tmp*/, const LaneIdType&) const
878 {
879 c = cj.pmadd(a,b,c);
880 }
881
882 template<typename LhsPacketType, typename AccPacketType, typename LaneIdType>
884 {
885 madd(a, b.get(lane), c, tmp, lane);
886 }
887
888 EIGEN_STRONG_INLINE void acc(const Scalar& c, const Scalar& alpha, Scalar& r) const { r += alpha * c; }
889
890 template<typename RealPacketType, typename ResPacketType>
892 {
893 // assemble c
895 if((!ConjLhs)&&(!ConjRhs))
896 {
897 tmp = pcplxflip(pconj(ResPacketType(c.second)));
898 tmp = padd(ResPacketType(c.first),tmp);
899 }
900 else if((!ConjLhs)&&(ConjRhs))
901 {
902 tmp = pconj(pcplxflip(ResPacketType(c.second)));
903 tmp = padd(ResPacketType(c.first),tmp);
904 }
905 else if((ConjLhs)&&(!ConjRhs))
906 {
907 tmp = pcplxflip(ResPacketType(c.second));
908 tmp = padd(pconj(ResPacketType(c.first)),tmp);
909 }
910 else if((ConjLhs)&&(ConjRhs))
911 {
912 tmp = pcplxflip(ResPacketType(c.second));
913 tmp = psub(pconj(ResPacketType(c.first)),tmp);
914 }
915
916 r = pmadd(tmp,alpha,r);
917 }
918
919protected:
921};
922
923template<typename RealScalar, bool _ConjRhs, int Arch, int _PacketSize>
925{
926public:
927 typedef std::complex<RealScalar> Scalar;
931
937
938#undef PACKET_DECL_COND_SCALAR_PREFIX
939#undef PACKET_DECL_COND_PREFIX
940#undef PACKET_DECL_COND_SCALAR
941#undef PACKET_DECL_COND
942
943 enum {
944 ConjLhs = false,
945 ConjRhs = _ConjRhs,
948 LhsPacketSize = Vectorizable ? unpacket_traits<_LhsPacket>::size : 1,
949 RhsPacketSize = Vectorizable ? unpacket_traits<_RhsPacket>::size : 1,
950 ResPacketSize = Vectorizable ? unpacket_traits<_ResPacket>::size : 1,
951
953 // FIXME: should depend on NumberOfRegisters
954 nr = 4,
955 mr = (EIGEN_PLAIN_ENUM_MIN(16,NumberOfRegisters)/2/nr)*ResPacketSize,
956
957 LhsProgress = ResPacketSize,
958 RhsProgress = 1
959 };
960
967
972
973 template<typename RhsPacketType>
978
980 {
981 pbroadcast4(b, dest.B_0, dest.B1, dest.B2, dest.B3);
982 }
983
984 template<typename RhsPacketType>
986 {
987 loadRhs(b, dest);
988 }
989
992
997
1002
1003 template<typename LhsPacketType>
1008
1009 template <typename LhsPacketType, typename RhsPacketType, typename AccPacketType, typename LaneIdType>
1014
1015 template <typename LhsPacketType, typename RhsPacketType, typename AccPacketType>
1017 {
1018#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
1020 c.v = pmadd(a,b.v,c.v);
1021#else
1022 tmp = b; tmp.v = pmul(a,tmp.v); c = padd(c,tmp);
1023#endif
1024
1025 }
1026
1027 EIGEN_STRONG_INLINE void madd_impl(const LhsScalar& a, const RhsScalar& b, ResScalar& c, RhsScalar& /*tmp*/, const false_type&) const
1028 {
1029 c += a * b;
1030 }
1031
1032 template<typename LhsPacketType, typename AccPacketType, typename LaneIdType>
1034 {
1035 madd(a, b.get(lane), c, tmp, lane);
1036 }
1037
1038 template <typename ResPacketType, typename AccPacketType>
1044
1045protected:
1046
1047};
1048
1049/* optimized General packed Block * packed Panel product kernel
1050 *
1051 * Mixing type logic: C += A * B
1052 * | A | B | comments
1053 * |real |cplx | no vectorization yet, would require packing A with duplication
1054 * |cplx |real | easy vectorization
1055 */
1056template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
1058{
1059  typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs,Architecture::Target> Traits;
1060  typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs,Architecture::Target,GEBPPacketHalf> HalfTraits;
1061  typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs,Architecture::Target,GEBPPacketQuarter> QuarterTraits;
1062
1063  typedef typename Traits::ResScalar ResScalar;
1064  typedef typename Traits::LhsPacket LhsPacket;
1065  typedef typename Traits::RhsPacket RhsPacket;
1066  typedef typename Traits::ResPacket ResPacket;
1067  typedef typename Traits::AccPacket AccPacket;
1068  typedef typename Traits::RhsPacketx4 RhsPacketx4;
1069
1070  typedef typename RhsPanelHelper<RhsPacket, RhsPacketx4, 15>::type RhsPanel15;
1071
1072  typedef gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs,Architecture::Target> SwappedTraits;
1073
1074 typedef typename SwappedTraits::ResScalar SResScalar;
1075 typedef typename SwappedTraits::LhsPacket SLhsPacket;
1076 typedef typename SwappedTraits::RhsPacket SRhsPacket;
1077 typedef typename SwappedTraits::ResPacket SResPacket;
1078 typedef typename SwappedTraits::AccPacket SAccPacket;
1079
1080 typedef typename HalfTraits::LhsPacket LhsPacketHalf;
1081 typedef typename HalfTraits::RhsPacket RhsPacketHalf;
1082 typedef typename HalfTraits::ResPacket ResPacketHalf;
1083 typedef typename HalfTraits::AccPacket AccPacketHalf;
1084
1085 typedef typename QuarterTraits::LhsPacket LhsPacketQuarter;
1086 typedef typename QuarterTraits::RhsPacket RhsPacketQuarter;
1087 typedef typename QuarterTraits::ResPacket ResPacketQuarter;
1088 typedef typename QuarterTraits::AccPacket AccPacketQuarter;
1089
1090 typedef typename DataMapper::LinearMapper LinearMapper;
1091
1092 enum {
1093 Vectorizable = Traits::Vectorizable,
1094 LhsProgress = Traits::LhsProgress,
1095 LhsProgressHalf = HalfTraits::LhsProgress,
1096 LhsProgressQuarter = QuarterTraits::LhsProgress,
1097 RhsProgress = Traits::RhsProgress,
1098 RhsProgressHalf = HalfTraits::RhsProgress,
1099 RhsProgressQuarter = QuarterTraits::RhsProgress,
1100    ResPacketSize = Traits::ResPacketSize
1101  };
1102
1103  EIGEN_DONT_INLINE
1104  void operator()(const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB,
1105                  Index rows, Index depth, Index cols, ResScalar alpha,
1106                  Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
1107};
1108
1109template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs,
1110         int SwappedLhsProgress = gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs,Architecture::Target>::LhsProgress>
1111struct last_row_process_16_packets
1112{
1113  typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs,Architecture::Target> Traits;
1114  typedef gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs,Architecture::Target> SwappedTraits;
1115
1116 typedef typename Traits::ResScalar ResScalar;
1117 typedef typename SwappedTraits::LhsPacket SLhsPacket;
1118 typedef typename SwappedTraits::RhsPacket SRhsPacket;
1119 typedef typename SwappedTraits::ResPacket SResPacket;
1120 typedef typename SwappedTraits::AccPacket SAccPacket;
1121
1137};
1138
1139
1140template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
1141struct last_row_process_16_packets<LhsScalar, RhsScalar, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs, 16> {
1142  typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs,Architecture::Target> Traits;
1143  typedef gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs,Architecture::Target> SwappedTraits;
1144
1145 typedef typename Traits::ResScalar ResScalar;
1146 typedef typename SwappedTraits::LhsPacket SLhsPacket;
1147 typedef typename SwappedTraits::RhsPacket SRhsPacket;
1148 typedef typename SwappedTraits::ResPacket SResPacket;
1149 typedef typename SwappedTraits::AccPacket SAccPacket;
1150
1151  EIGEN_STRONG_INLINE void operator()(const DataMapper& res, SwappedTraits &straits, const LhsScalar* blA,
1152                  const RhsScalar* blB, Index depth, const Index endk, Index i, Index j2,
1153                  ResScalar alpha, SAccPacket &C0)
1154  {
1159
1162
1163 if (depth - endk > 0)
1164 {
1165 // We have to handle the last row(s) of the rhs, which
1166 // correspond to a half-packet
1168
1169 for (Index kk = endk; kk < depth; kk++)
1170 {
1173 straits.loadLhsUnaligned(blB, a0);
1174 straits.loadRhs(blA, b0);
1175 straits.madd(a0,b0,c0,b0, fix<0>);
1176 blB += SwappedTraits::LhsProgress/4;
1177 blA += 1;
1178 }
1179 straits.acc(c0, alphav, R);
1180 }
1181 else
1182 {
1184 }
1185 res.scatterPacket(i, j2, R);
1186 }
1187};
1188
1189template<int nr, Index LhsProgress, Index RhsProgress, typename LhsScalar, typename RhsScalar, typename ResScalar, typename AccPacket, typename LhsPacket, typename RhsPacket, typename ResPacket, typename GEBPTraits, typename LinearMapper, typename DataMapper>
1190struct lhs_process_one_packet
1191{
1192 typedef typename GEBPTraits::RhsPacketx4 RhsPacketx4;
1193
1194 EIGEN_STRONG_INLINE void peeled_kc_onestep(Index K, const LhsScalar* blA, const RhsScalar* blB, GEBPTraits traits, LhsPacket *A0, RhsPacketx4 *rhs_panel, RhsPacket *T0, AccPacket *C0, AccPacket *C1, AccPacket *C2, AccPacket *C3)
1195 {
1196 EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1X4");
1197 EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!");
1198 traits.loadLhs(&blA[(0+1*K)*LhsProgress], *A0);
1199 traits.loadRhs(&blB[(0+4*K)*RhsProgress], *rhs_panel);
1200 traits.madd(*A0, *rhs_panel, *C0, *T0, fix<0>);
1201 traits.madd(*A0, *rhs_panel, *C1, *T0, fix<1>);
1202 traits.madd(*A0, *rhs_panel, *C2, *T0, fix<2>);
1203 traits.madd(*A0, *rhs_panel, *C3, *T0, fix<3>);
1204 #if EIGEN_GNUC_AT_LEAST(6,0) && defined(EIGEN_VECTORIZE_SSE)
1205 __asm__ ("" : "+x,m" (*A0));
1206 #endif
1207 EIGEN_ASM_COMMENT("end step of gebp micro kernel 1X4");
1208 }
1209
1210  EIGEN_STRONG_INLINE void operator()(
1211    const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB, ResScalar alpha,
1212    Index peelStart, Index peelEnd, Index strideA, Index strideB, Index offsetA, Index offsetB,
1213    int prefetch_res_offset, Index peeled_kc, Index pk, Index cols, Index depth, Index packet_cols4)
1214  {
1215    GEBPTraits traits;
1216
1217 // loops on each largest micro horizontal panel of lhs
1218 // (LhsProgress x depth)
1219 for(Index i=peelStart; i<peelEnd; i+=LhsProgress)
1220 {
1221 // loops on each largest micro vertical panel of rhs (depth * nr)
1222 for(Index j2=0; j2<packet_cols4; j2+=nr)
1223 {
1224 // We select a LhsProgress x nr micro block of res
1225 // which is entirely stored into 1 x nr registers.
1226
1227 const LhsScalar* blA = &blockA[i*strideA+offsetA*(LhsProgress)];
1228 prefetch(&blA[0]);
1229
1230 // gets res block as register
1231 AccPacket C0, C1, C2, C3;
1232 traits.initAcc(C0);
1233 traits.initAcc(C1);
1234 traits.initAcc(C2);
1235 traits.initAcc(C3);
1236 // To improve instruction pipelining, let's double the accumulation registers:
1237 // even k will accumulate in C*, while odd k will accumulate in D*.
1238          // This trick is crucial to get good performance with FMA; otherwise it is
1239          // actually faster to perform separate MUL+ADD because of the naturally
1240          // better instruction-level parallelism.
1241 AccPacket D0, D1, D2, D3;
1242 traits.initAcc(D0);
1243 traits.initAcc(D1);
1244 traits.initAcc(D2);
1245 traits.initAcc(D3);
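          // Generic illustration of the dual-accumulator trick (not Eigen API):
          // instead of one serial FMA chain,
          //   for (k)    acc  = fma(a[k], b[k], acc);
          // use two independent chains folded once at the end,
          //   for (k+=2) { acc0 = fma(a[k],b[k],acc0); acc1 = fma(a[k+1],b[k+1],acc1); }
          //   acc = acc0 + acc1;
          // which halves the critical FMA dependency chain.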
1246
1247 LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
1248 LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
1249 LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
1250 LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
1251
1252 r0.prefetch(prefetch_res_offset);
1253 r1.prefetch(prefetch_res_offset);
1254 r2.prefetch(prefetch_res_offset);
1255 r3.prefetch(prefetch_res_offset);
1256
1257 // performs "inner" products
1258 const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
1259 prefetch(&blB[0]);
1260 LhsPacket A0, A1;
1261
1262 for(Index k=0; k<peeled_kc; k+=pk)
1263 {
1264 EIGEN_ASM_COMMENT("begin gebp micro kernel 1/half/quarterX4");
1265          RhsPacketx4 rhs_panel;
1266          RhsPacket T0;
1267
1268 internal::prefetch(blB+(48+0));
1269 peeled_kc_onestep(0, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
1270 peeled_kc_onestep(1, blA, blB, traits, &A1, &rhs_panel, &T0, &D0, &D1, &D2, &D3);
1271 peeled_kc_onestep(2, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
1272 peeled_kc_onestep(3, blA, blB, traits, &A1, &rhs_panel, &T0, &D0, &D1, &D2, &D3);
1273 internal::prefetch(blB+(48+16));
1274 peeled_kc_onestep(4, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
1275 peeled_kc_onestep(5, blA, blB, traits, &A1, &rhs_panel, &T0, &D0, &D1, &D2, &D3);
1276 peeled_kc_onestep(6, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
1277 peeled_kc_onestep(7, blA, blB, traits, &A1, &rhs_panel, &T0, &D0, &D1, &D2, &D3);
1278
1279 blB += pk*4*RhsProgress;
1280 blA += pk*LhsProgress;
1281
1282 EIGEN_ASM_COMMENT("end gebp micro kernel 1/half/quarterX4");
1283 }
1284 C0 = padd(C0,D0);
1285 C1 = padd(C1,D1);
1286 C2 = padd(C2,D2);
1287 C3 = padd(C3,D3);
1288
1289 // process remaining peeled loop
1290 for(Index k=peeled_kc; k<depth; k++)
1291 {
1292          RhsPacketx4 rhs_panel;
1293          RhsPacket T0;
1294 peeled_kc_onestep(0, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
1295 blB += 4*RhsProgress;
1296 blA += LhsProgress;
1297 }
1298
1299 ResPacket R0, R1;
1300 ResPacket alphav = pset1<ResPacket>(alpha);
1301
1302 R0 = r0.template loadPacket<ResPacket>(0);
1303 R1 = r1.template loadPacket<ResPacket>(0);
1304 traits.acc(C0, alphav, R0);
1305 traits.acc(C1, alphav, R1);
1306 r0.storePacket(0, R0);
1307 r1.storePacket(0, R1);
1308
1309 R0 = r2.template loadPacket<ResPacket>(0);
1310 R1 = r3.template loadPacket<ResPacket>(0);
1311 traits.acc(C2, alphav, R0);
1312 traits.acc(C3, alphav, R1);
1313 r2.storePacket(0, R0);
1314 r3.storePacket(0, R1);
1315 }
1316
1317 // Deal with remaining columns of the rhs
1318 for(Index j2=packet_cols4; j2<cols; j2++)
1319 {
1320 // One column at a time
1321 const LhsScalar* blA = &blockA[i*strideA+offsetA*(LhsProgress)];
1322 prefetch(&blA[0]);
1323
1324 // gets res block as register
1325 AccPacket C0;
1326 traits.initAcc(C0);
1327
1328 LinearMapper r0 = res.getLinearMapper(i, j2);
1329
1330 // performs "inner" products
1331 const RhsScalar* blB = &blockB[j2*strideB+offsetB];
1332 LhsPacket A0;
1333
1334 for(Index k= 0; k<peeled_kc; k+=pk)
1335 {
1336 EIGEN_ASM_COMMENT("begin gebp micro kernel 1/half/quarterX1");
1337 RhsPacket B_0;
1338
1339#define EIGEN_GEBGP_ONESTEP(K) \
1340 do { \
1341 EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1/half/quarterX1"); \
1342 EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
1343 /* FIXME: why unaligned???? */ \
1344 traits.loadLhsUnaligned(&blA[(0+1*K)*LhsProgress], A0); \
1345 traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \
1346 traits.madd(A0, B_0, C0, B_0, fix<0>); \
1347 EIGEN_ASM_COMMENT("end step of gebp micro kernel 1/half/quarterX1"); \
1348 } while(false);
1349
1350          EIGEN_GEBGP_ONESTEP(0);
1351          EIGEN_GEBGP_ONESTEP(1);
1352          EIGEN_GEBGP_ONESTEP(2);
1353          EIGEN_GEBGP_ONESTEP(3);
1354          EIGEN_GEBGP_ONESTEP(4);
1355          EIGEN_GEBGP_ONESTEP(5);
1356          EIGEN_GEBGP_ONESTEP(6);
1357          EIGEN_GEBGP_ONESTEP(7);
1358
1359 blB += pk*RhsProgress;
1360 blA += pk*LhsProgress;
1361
1362 EIGEN_ASM_COMMENT("end gebp micro kernel 1/half/quarterX1");
1363 }
1364
1365 // process remaining peeled loop
1366 for(Index k=peeled_kc; k<depth; k++)
1367 {
1368          RhsPacket B_0;
1369          EIGEN_GEBGP_ONESTEP(0);
1370 blB += RhsProgress;
1371 blA += LhsProgress;
1372 }
1373#undef EIGEN_GEBGP_ONESTEP
1374 ResPacket R0;
1375 ResPacket alphav = pset1<ResPacket>(alpha);
1376 R0 = r0.template loadPacket<ResPacket>(0);
1377 traits.acc(C0, alphav, R0);
1378 r0.storePacket(0, R0);
1379 }
1380 }
1381 }
1382};
1383
1384template<int nr, Index LhsProgress, Index RhsProgress, typename LhsScalar, typename RhsScalar, typename ResScalar, typename AccPacket, typename LhsPacket, typename RhsPacket, typename ResPacket, typename GEBPTraits, typename LinearMapper, typename DataMapper>
1385struct lhs_process_fraction_of_packet : lhs_process_one_packet<nr, LhsProgress, RhsProgress, LhsScalar, RhsScalar, ResScalar, AccPacket, LhsPacket, RhsPacket, ResPacket, GEBPTraits, LinearMapper, DataMapper>
1386{
1387
1388EIGEN_STRONG_INLINE void peeled_kc_onestep(Index K, const LhsScalar* blA, const RhsScalar* blB, GEBPTraits traits, LhsPacket *A0, RhsPacket *B_0, RhsPacket *B1, RhsPacket *B2, RhsPacket *B3, AccPacket *C0, AccPacket *C1, AccPacket *C2, AccPacket *C3)
1389 {
1390 EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1X4");
1391 EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!");
1392 traits.loadLhsUnaligned(&blA[(0+1*K)*(LhsProgress)], *A0);
1393 traits.broadcastRhs(&blB[(0+4*K)*RhsProgress], *B_0, *B1, *B2, *B3);
1394 traits.madd(*A0, *B_0, *C0, *B_0);
1395 traits.madd(*A0, *B1, *C1, *B1);
1396 traits.madd(*A0, *B2, *C2, *B2);
1397 traits.madd(*A0, *B3, *C3, *B3);
1398 EIGEN_ASM_COMMENT("end step of gebp micro kernel 1X4");
1399 }
1400};
1401
1402template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
1403EIGEN_DONT_INLINE
1404void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,ConjugateRhs>
1405  ::operator()(const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB,
1406               Index rows, Index depth, Index cols, ResScalar alpha,
1407               Index strideA, Index strideB, Index offsetA, Index offsetB)
1408  {
1409    Traits traits;
1410    SwappedTraits straits;
1411
1412 if(strideA==-1) strideA = depth;
1413    if(strideB==-1) strideB = depth;
1414    conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
1415 Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;
1416 const Index peeled_mc3 = mr>=3*Traits::LhsProgress ? (rows/(3*LhsProgress))*(3*LhsProgress) : 0;
1417 const Index peeled_mc2 = mr>=2*Traits::LhsProgress ? peeled_mc3+((rows-peeled_mc3)/(2*LhsProgress))*(2*LhsProgress) : 0;
1418 const Index peeled_mc1 = mr>=1*Traits::LhsProgress ? peeled_mc2+((rows-peeled_mc2)/(1*LhsProgress))*(1*LhsProgress) : 0;
1419 const Index peeled_mc_half = mr>=LhsProgressHalf ? peeled_mc1+((rows-peeled_mc1)/(LhsProgressHalf))*(LhsProgressHalf) : 0;
1420 const Index peeled_mc_quarter = mr>=LhsProgressQuarter ? peeled_mc_half+((rows-peeled_mc_half)/(LhsProgressQuarter))*(LhsProgressQuarter) : 0;
1421 enum { pk = 8 }; // NOTE Such a large peeling factor is important for large matrices (~ +5% when >1000 on Haswell)
1422 const Index peeled_kc = depth & ~(pk-1);
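    // e.g. depth == 100: peeled_kc == 96 (100 & ~7), so the peeled loops below
    // run 12 iterations of pk == 8 and the scalar epilogues handle 4 more.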
1423 const int prefetch_res_offset = 32/sizeof(ResScalar);
1424// const Index depth2 = depth & ~1;
1425
1426 //---------- Process 3 * LhsProgress rows at once ----------
1427 // This corresponds to 3*LhsProgress x nr register blocks.
1428    // Usually this makes sense only with FMA
1429 if(mr>=3*Traits::LhsProgress)
1430 {
1431 // Here, the general idea is to loop on each largest micro horizontal panel of the lhs (3*Traits::LhsProgress x depth)
1432 // and on each largest micro vertical panel of the rhs (depth * nr).
1433      // Blocking sizes, i.e., 'depth', have been computed so that the micro horizontal panel of the lhs fits in L1.
1434      // However, if depth is too small, we can extend the number of rows of these horizontal panels.
1435      // This actual number of rows is computed as follows:
1436 const Index l1 = defaultL1CacheSize; // in Bytes, TODO, l1 should be passed to this function.
1437 // The max(1, ...) here is needed because we may be using blocking params larger than what our known l1 cache size
1438 // suggests we should be using: either because our known l1 cache size is inaccurate (e.g. on Android, we can only guess),
1439 // or because we are testing specific blocking sizes.
1440 const Index actual_panel_rows = (3*LhsProgress) * std::max<Index>(1,( (l1 - sizeof(ResScalar)*mr*nr - depth*nr*sizeof(RhsScalar)) / (depth * sizeof(LhsScalar) * 3*LhsProgress) ));
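      // Worked example (illustrative): float/AVX with LhsProgress == 8, mr == 24,
      // nr == 4 and depth == 200 gives (32768 - 384 - 3200)/(200*4*24) == 1, so
      // actual_panel_rows == 24 and the panels are not extended.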
1441      for(Index i1=0; i1<peeled_mc3; i1+=actual_panel_rows)
1442      {
1443        const Index actual_panel_end = (std::min)(i1+actual_panel_rows, peeled_mc3);
1444        for(Index j2=0; j2<packet_cols4; j2+=nr)
1445        {
1446          for(Index i=i1; i<actual_panel_end; i+=3*LhsProgress)
1447          {
1448
1449 // We selected a 3*Traits::LhsProgress x nr micro block of res which is entirely
1450 // stored into 3 x nr registers.
1451
1452 const LhsScalar* blA = &blockA[i*strideA+offsetA*(3*LhsProgress)];
1453 prefetch(&blA[0]);
1454
1455 // gets res block as register
1456 AccPacket C0, C1, C2, C3,
1457 C4, C5, C6, C7,
1458 C8, C9, C10, C11;
1459 traits.initAcc(C0); traits.initAcc(C1); traits.initAcc(C2); traits.initAcc(C3);
1460 traits.initAcc(C4); traits.initAcc(C5); traits.initAcc(C6); traits.initAcc(C7);
1461 traits.initAcc(C8); traits.initAcc(C9); traits.initAcc(C10); traits.initAcc(C11);
1462
1463 LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
1464 LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
1465 LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
1466 LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
1467
1468 r0.prefetch(0);
1469 r1.prefetch(0);
1470 r2.prefetch(0);
1471 r3.prefetch(0);
1472
1473 // performs "inner" products
1474 const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
1475 prefetch(&blB[0]);
1476 LhsPacket A0, A1;
1477
1478 for(Index k=0; k<peeled_kc; k+=pk)
1479 {
1480 EIGEN_ASM_COMMENT("begin gebp micro kernel 3pX4");
1481 // 15 registers are taken (12 for acc, 2 for lhs).
1482            RhsPanel15 rhs_panel;
1483            RhsPacket T0;
1484 LhsPacket A2;
1485 #if EIGEN_COMP_GNUC_STRICT && EIGEN_ARCH_ARM64 && defined(EIGEN_VECTORIZE_NEON) && !(EIGEN_GNUC_AT_LEAST(9,0))
1486 // see http://eigen.tuxfamily.org/bz/show_bug.cgi?id=1633
1487 // without this workaround A0, A1, and A2 are loaded in the same register,
1488 // which is not good for pipelining
1489 #define EIGEN_GEBP_3PX4_REGISTER_ALLOC_WORKAROUND __asm__ ("" : "+w,m" (A0), "+w,m" (A1), "+w,m" (A2));
1490 #else
1491 #define EIGEN_GEBP_3PX4_REGISTER_ALLOC_WORKAROUND
1492 #endif
1493#define EIGEN_GEBP_ONESTEP(K) \
1494 do { \
1495 EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX4"); \
1496 EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
1497 internal::prefetch(blA + (3 * K + 16) * LhsProgress); \
1498 if (EIGEN_ARCH_ARM || EIGEN_ARCH_MIPS) { \
1499 internal::prefetch(blB + (4 * K + 16) * RhsProgress); \
1500 } /* Bug 953 */ \
1501 traits.loadLhs(&blA[(0 + 3 * K) * LhsProgress], A0); \
1502 traits.loadLhs(&blA[(1 + 3 * K) * LhsProgress], A1); \
1503 traits.loadLhs(&blA[(2 + 3 * K) * LhsProgress], A2); \
1504 EIGEN_GEBP_3PX4_REGISTER_ALLOC_WORKAROUND \
1505 traits.loadRhs(blB + (0+4*K) * Traits::RhsProgress, rhs_panel); \
1506 traits.madd(A0, rhs_panel, C0, T0, fix<0>); \
1507 traits.madd(A1, rhs_panel, C4, T0, fix<0>); \
1508 traits.madd(A2, rhs_panel, C8, T0, fix<0>); \
1509 traits.updateRhs(blB + (1+4*K) * Traits::RhsProgress, rhs_panel); \
1510 traits.madd(A0, rhs_panel, C1, T0, fix<1>); \
1511 traits.madd(A1, rhs_panel, C5, T0, fix<1>); \
1512 traits.madd(A2, rhs_panel, C9, T0, fix<1>); \
1513 traits.updateRhs(blB + (2+4*K) * Traits::RhsProgress, rhs_panel); \
1514 traits.madd(A0, rhs_panel, C2, T0, fix<2>); \
1515 traits.madd(A1, rhs_panel, C6, T0, fix<2>); \
1516 traits.madd(A2, rhs_panel, C10, T0, fix<2>); \
1517 traits.updateRhs(blB + (3+4*K) * Traits::RhsProgress, rhs_panel); \
1518 traits.madd(A0, rhs_panel, C3, T0, fix<3>); \
1519 traits.madd(A1, rhs_panel, C7, T0, fix<3>); \
1520 traits.madd(A2, rhs_panel, C11, T0, fix<3>); \
1521 EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX4"); \
1522 } while (false)
1523
1524            internal::prefetch(blB);
1525            EIGEN_GEBP_ONESTEP(0);
1526            EIGEN_GEBP_ONESTEP(1);
1527            EIGEN_GEBP_ONESTEP(2);
1528            EIGEN_GEBP_ONESTEP(3);
1529            EIGEN_GEBP_ONESTEP(4);
1530            EIGEN_GEBP_ONESTEP(5);
1531            EIGEN_GEBP_ONESTEP(6);
1532            EIGEN_GEBP_ONESTEP(7);
1533
1534 blB += pk*4*RhsProgress;
1535 blA += pk*3*Traits::LhsProgress;
1536
1537 EIGEN_ASM_COMMENT("end gebp micro kernel 3pX4");
1538 }
1539 // process remaining peeled loop
1540 for(Index k=peeled_kc; k<depth; k++)
1541 {
1542              RhsPanel15 rhs_panel;
1543              RhsPacket T0;
1544              LhsPacket A2;
1545              EIGEN_GEBP_ONESTEP(0);
1546 blB += 4*RhsProgress;
1547 blA += 3*Traits::LhsProgress;
1548 }
1549
1550#undef EIGEN_GEBP_ONESTEP
1551
1552 ResPacket R0, R1, R2;
1554
1555 R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
1556 R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
1557 R2 = r0.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
1558 traits.acc(C0, alphav, R0);
1559 traits.acc(C4, alphav, R1);
1560 traits.acc(C8, alphav, R2);
1561 r0.storePacket(0 * Traits::ResPacketSize, R0);
1562 r0.storePacket(1 * Traits::ResPacketSize, R1);
1563 r0.storePacket(2 * Traits::ResPacketSize, R2);
1564
1565 R0 = r1.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
1566 R1 = r1.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
1567 R2 = r1.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
1568 traits.acc(C1, alphav, R0);
1569 traits.acc(C5, alphav, R1);
1570 traits.acc(C9, alphav, R2);
1571 r1.storePacket(0 * Traits::ResPacketSize, R0);
1572 r1.storePacket(1 * Traits::ResPacketSize, R1);
1573 r1.storePacket(2 * Traits::ResPacketSize, R2);
1574
1575 R0 = r2.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
1576 R1 = r2.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
1577 R2 = r2.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
1578 traits.acc(C2, alphav, R0);
1579 traits.acc(C6, alphav, R1);
1580 traits.acc(C10, alphav, R2);
1581 r2.storePacket(0 * Traits::ResPacketSize, R0);
1582 r2.storePacket(1 * Traits::ResPacketSize, R1);
1583 r2.storePacket(2 * Traits::ResPacketSize, R2);
1584
1585 R0 = r3.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
1586 R1 = r3.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
1587 R2 = r3.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
1588 traits.acc(C3, alphav, R0);
1589 traits.acc(C7, alphav, R1);
1590 traits.acc(C11, alphav, R2);
1591 r3.storePacket(0 * Traits::ResPacketSize, R0);
1592 r3.storePacket(1 * Traits::ResPacketSize, R1);
1593 r3.storePacket(2 * Traits::ResPacketSize, R2);
1594 }
1595 }
1596
1597 // Deal with remaining columns of the rhs
1598 for(Index j2=packet_cols4; j2<cols; j2++)
1599 {
1600        for(Index i=0; i<peeled_mc3; i+=3*Traits::LhsProgress)
1601        {
1602 // One column at a time
1603 const LhsScalar* blA = &blockA[i*strideA+offsetA*(3*Traits::LhsProgress)];
1604 prefetch(&blA[0]);
1605
1606 // gets res block as register
1607 AccPacket C0, C4, C8;
1608 traits.initAcc(C0);
1609 traits.initAcc(C4);
1610 traits.initAcc(C8);
1611
1612 LinearMapper r0 = res.getLinearMapper(i, j2);
1613 r0.prefetch(0);
1614
1615 // performs "inner" products
1616 const RhsScalar* blB = &blockB[j2*strideB+offsetB];
1617 LhsPacket A0, A1, A2;
1618
1619 for(Index k=0; k<peeled_kc; k+=pk)
1620 {
1621 EIGEN_ASM_COMMENT("begin gebp micro kernel 3pX1");
1622 RhsPacket B_0;
1623#define EIGEN_GEBGP_ONESTEP(K) \
1624 do { \
1625 EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX1"); \
1626 EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
1627 traits.loadLhs(&blA[(0 + 3 * K) * LhsProgress], A0); \
1628 traits.loadLhs(&blA[(1 + 3 * K) * LhsProgress], A1); \
1629 traits.loadLhs(&blA[(2 + 3 * K) * LhsProgress], A2); \
1630 traits.loadRhs(&blB[(0 + K) * RhsProgress], B_0); \
1631 traits.madd(A0, B_0, C0, B_0, fix<0>); \
1632 traits.madd(A1, B_0, C4, B_0, fix<0>); \
1633 traits.madd(A2, B_0, C8, B_0, fix<0>); \
1634 EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX1"); \
1635 } while (false)
1636
1637            EIGEN_GEBGP_ONESTEP(0);
1638            EIGEN_GEBGP_ONESTEP(1);
1639            EIGEN_GEBGP_ONESTEP(2);
1640            EIGEN_GEBGP_ONESTEP(3);
1641            EIGEN_GEBGP_ONESTEP(4);
1642            EIGEN_GEBGP_ONESTEP(5);
1643            EIGEN_GEBGP_ONESTEP(6);
1644            EIGEN_GEBGP_ONESTEP(7);
1645
1646 blB += int(pk) * int(RhsProgress);
1647 blA += int(pk) * 3 * int(Traits::LhsProgress);
1648
1649 EIGEN_ASM_COMMENT("end gebp micro kernel 3pX1");
1650 }
1651
1652 // process remaining peeled loop
1653 for(Index k=peeled_kc; k<depth; k++)
1654 {
1655            RhsPacket B_0;
1656            EIGEN_GEBGP_ONESTEP(0);
1657 blB += RhsProgress;
1658 blA += 3*Traits::LhsProgress;
1659 }
1660#undef EIGEN_GEBGP_ONESTEP
1661 ResPacket R0, R1, R2;
1663
1664 R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
1665 R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
1666 R2 = r0.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
1667 traits.acc(C0, alphav, R0);
1668 traits.acc(C4, alphav, R1);
1669 traits.acc(C8, alphav, R2);
1670 r0.storePacket(0 * Traits::ResPacketSize, R0);
1671 r0.storePacket(1 * Traits::ResPacketSize, R1);
1672 r0.storePacket(2 * Traits::ResPacketSize, R2);
1673 }
1674 }
1675 }
1676 }
1677
1678 //---------- Process 2 * LhsProgress rows at once ----------
1679 if(mr>=2*Traits::LhsProgress)
1680 {
1681 const Index l1 = defaultL1CacheSize; // in Bytes, TODO, l1 should be passed to this function.
1682 // The max(1, ...) here is needed because we may be using blocking params larger than what our known l1 cache size
1683 // suggests we should be using: either because our known l1 cache size is inaccurate (e.g. on Android, we can only guess),
1684 // or because we are testing specific blocking sizes.
1685 Index actual_panel_rows = (2*LhsProgress) * std::max<Index>(1,( (l1 - sizeof(ResScalar)*mr*nr - depth*nr*sizeof(RhsScalar)) / (depth * sizeof(LhsScalar) * 2*LhsProgress) ));
1686
1687      for(Index i1=peeled_mc3; i1<peeled_mc2; i1+=actual_panel_rows)
1688      {
1689        const Index actual_panel_end = (std::min)(i1+actual_panel_rows, peeled_mc2);
1690        for(Index j2=0; j2<packet_cols4; j2+=nr)
1691        {
1692          for(Index i=i1; i<actual_panel_end; i+=2*LhsProgress)
1693          {
1694
1695 // We selected a 2*Traits::LhsProgress x nr micro block of res which is entirely
1696 // stored into 2 x nr registers.
1697
1698 const LhsScalar* blA = &blockA[i*strideA+offsetA*(2*Traits::LhsProgress)];
1699 prefetch(&blA[0]);
1700
1701 // gets res block as register
1702 AccPacket C0, C1, C2, C3,
1703 C4, C5, C6, C7;
1704 traits.initAcc(C0); traits.initAcc(C1); traits.initAcc(C2); traits.initAcc(C3);
1705 traits.initAcc(C4); traits.initAcc(C5); traits.initAcc(C6); traits.initAcc(C7);
1706
1707 LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
1708 LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
1709 LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
1710 LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
1711
1712 r0.prefetch(prefetch_res_offset);
1713 r1.prefetch(prefetch_res_offset);
1714 r2.prefetch(prefetch_res_offset);
1715 r3.prefetch(prefetch_res_offset);
1716
1717 // performs "inner" products
1718 const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
1719 prefetch(&blB[0]);
1720 LhsPacket A0, A1;
1721
1722 for(Index k=0; k<peeled_kc; k+=pk)
1723 {
1724 EIGEN_ASM_COMMENT("begin gebp micro kernel 2pX4");
1725            RhsPacketx4 rhs_panel;
1726            RhsPacket T0;
1727
1728 // NOTE: the begin/end asm comments below work around bug 935!
1729 // but they are not enough for gcc>=6 without FMA (bug 1637)
1730 #if EIGEN_GNUC_AT_LEAST(6,0) && defined(EIGEN_VECTORIZE_SSE)
1731 #define EIGEN_GEBP_2PX4_SPILLING_WORKAROUND __asm__ ("" : [a0] "+x,m" (A0),[a1] "+x,m" (A1));
1732 #else
1733 #define EIGEN_GEBP_2PX4_SPILLING_WORKAROUND
1734 #endif
1735#define EIGEN_GEBGP_ONESTEP(K) \
1736 do { \
1737 EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX4"); \
1738 traits.loadLhs(&blA[(0 + 2 * K) * LhsProgress], A0); \
1739 traits.loadLhs(&blA[(1 + 2 * K) * LhsProgress], A1); \
1740 traits.loadRhs(&blB[(0 + 4 * K) * RhsProgress], rhs_panel); \
1741 traits.madd(A0, rhs_panel, C0, T0, fix<0>); \
1742 traits.madd(A1, rhs_panel, C4, T0, fix<0>); \
1743 traits.madd(A0, rhs_panel, C1, T0, fix<1>); \
1744 traits.madd(A1, rhs_panel, C5, T0, fix<1>); \
1745 traits.madd(A0, rhs_panel, C2, T0, fix<2>); \
1746 traits.madd(A1, rhs_panel, C6, T0, fix<2>); \
1747 traits.madd(A0, rhs_panel, C3, T0, fix<3>); \
1748 traits.madd(A1, rhs_panel, C7, T0, fix<3>); \
1749 EIGEN_GEBP_2PX4_SPILLING_WORKAROUND \
1750 EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX4"); \
1751 } while (false)
1752
1753          internal::prefetch(blB+(48+0));
1754          EIGEN_GEBGP_ONESTEP(0);
1755          EIGEN_GEBGP_ONESTEP(1);
1756          EIGEN_GEBGP_ONESTEP(2);
1757          EIGEN_GEBGP_ONESTEP(3);
1758          internal::prefetch(blB+(48+16));
1759          EIGEN_GEBGP_ONESTEP(4);
1760          EIGEN_GEBGP_ONESTEP(5);
1761          EIGEN_GEBGP_ONESTEP(6);
1762          EIGEN_GEBGP_ONESTEP(7);
1763
1764 blB += pk*4*RhsProgress;
1765 blA += pk*(2*Traits::LhsProgress);
1766
1767 EIGEN_ASM_COMMENT("end gebp micro kernel 2pX4");
1768 }
1769 // process remaining peeled loop
1770 for(Index k=peeled_kc; k<depth; k++)
1771 {
1772            RhsPacketx4 rhs_panel;
1773            RhsPacket T0;
1774            EIGEN_GEBGP_ONESTEP(0);
1775 blB += 4*RhsProgress;
1776 blA += 2*Traits::LhsProgress;
1777 }
1778#undef EIGEN_GEBGP_ONESTEP
1779
1780 ResPacket R0, R1, R2, R3;
1782
1783 R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
1784 R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
1785 R2 = r1.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
1786 R3 = r1.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
1787 traits.acc(C0, alphav, R0);
1788 traits.acc(C4, alphav, R1);
1789 traits.acc(C1, alphav, R2);
1790 traits.acc(C5, alphav, R3);
1791 r0.storePacket(0 * Traits::ResPacketSize, R0);
1792 r0.storePacket(1 * Traits::ResPacketSize, R1);
1793 r1.storePacket(0 * Traits::ResPacketSize, R2);
1794 r1.storePacket(1 * Traits::ResPacketSize, R3);
1795
1796 R0 = r2.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
1797 R1 = r2.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
1798 R2 = r3.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
1799 R3 = r3.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
1800 traits.acc(C2, alphav, R0);
1801 traits.acc(C6, alphav, R1);
1802 traits.acc(C3, alphav, R2);
1803 traits.acc(C7, alphav, R3);
1804 r2.storePacket(0 * Traits::ResPacketSize, R0);
1805 r2.storePacket(1 * Traits::ResPacketSize, R1);
1806 r3.storePacket(0 * Traits::ResPacketSize, R2);
1807 r3.storePacket(1 * Traits::ResPacketSize, R3);
1808 }
1809 }
1810
1811 // Deal with remaining columns of the rhs
1812 for(Index j2=packet_cols4; j2<cols; j2++)
1813 {
1814        for(Index i=peeled_mc3; i<peeled_mc2; i+=2*Traits::LhsProgress)
1815        {
1816 // One column at a time
1817 const LhsScalar* blA = &blockA[i*strideA+offsetA*(2*Traits::LhsProgress)];
1818 prefetch(&blA[0]);
1819
1820 // gets res block as register
1821 AccPacket C0, C4;
1822 traits.initAcc(C0);
1823 traits.initAcc(C4);
1824
1825 LinearMapper r0 = res.getLinearMapper(i, j2);
1826 r0.prefetch(prefetch_res_offset);
1827
1828 // performs "inner" products
1829 const RhsScalar* blB = &blockB[j2*strideB+offsetB];
1830 LhsPacket A0, A1;
1831
1832 for(Index k=0; k<peeled_kc; k+=pk)
1833 {
1834 EIGEN_ASM_COMMENT("begin gebp micro kernel 2pX1");
1835 RhsPacket B_0, B1;
1836
1837#define EIGEN_GEBGP_ONESTEP(K) \
1838 do { \
1839 EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX1"); \
1840 EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
1841 traits.loadLhs(&blA[(0+2*K)*LhsProgress], A0); \
1842 traits.loadLhs(&blA[(1+2*K)*LhsProgress], A1); \
1843 traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \
1844 traits.madd(A0, B_0, C0, B1, fix<0>); \
1845 traits.madd(A1, B_0, C4, B_0, fix<0>); \
1846 EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX1"); \
1847 } while(false)
1848
1849          EIGEN_GEBGP_ONESTEP(0);
1850          EIGEN_GEBGP_ONESTEP(1);
1851          EIGEN_GEBGP_ONESTEP(2);
1852          EIGEN_GEBGP_ONESTEP(3);
1853          EIGEN_GEBGP_ONESTEP(4);
1854          EIGEN_GEBGP_ONESTEP(5);
1855          EIGEN_GEBGP_ONESTEP(6);
1856          EIGEN_GEBGP_ONESTEP(7);
1857
1858 blB += int(pk) * int(RhsProgress);
1859 blA += int(pk) * 2 * int(Traits::LhsProgress);
1860
1861 EIGEN_ASM_COMMENT("end gebp micro kernel 2pX1");
1862 }
1863
1864 // process remaining peeled loop
1865 for(Index k=peeled_kc; k<depth; k++)
1866 {
1867            RhsPacket B_0, B1;
1868            EIGEN_GEBGP_ONESTEP(0);
1869 blB += RhsProgress;
1870 blA += 2*Traits::LhsProgress;
1871 }
1872#undef EIGEN_GEBGP_ONESTEP
1873 ResPacket R0, R1;
1875
1876 R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
1877 R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
1878 traits.acc(C0, alphav, R0);
1879 traits.acc(C4, alphav, R1);
1880 r0.storePacket(0 * Traits::ResPacketSize, R0);
1881 r0.storePacket(1 * Traits::ResPacketSize, R1);
1882 }
1883 }
1884 }
1885 }
1886 //---------- Process 1 * LhsProgress rows at once ----------
1887 if(mr>=1*Traits::LhsProgress)
1888 {
1889      lhs_process_one_packet<nr, Traits::LhsProgress, RhsProgress, LhsScalar, RhsScalar, ResScalar, AccPacket, LhsPacket, RhsPacket, ResPacket, Traits, LinearMapper, DataMapper> p;
1890      p(res, blockA, blockB, alpha, peeled_mc2, peeled_mc1, strideA, strideB, offsetA, offsetB, prefetch_res_offset, peeled_kc, pk, cols, depth, packet_cols4);
1891    }
1892 //---------- Process LhsProgressHalf rows at once ----------
1893 if((LhsProgressHalf < LhsProgress) && mr>=LhsProgressHalf)
1894 {
1895      lhs_process_fraction_of_packet<nr, LhsProgressHalf, RhsProgressHalf, LhsScalar, RhsScalar, ResScalar, AccPacketHalf, LhsPacketHalf, RhsPacketHalf, ResPacketHalf, HalfTraits, LinearMapper, DataMapper> p;
1896      p(res, blockA, blockB, alpha, peeled_mc1, peeled_mc_half, strideA, strideB, offsetA, offsetB, prefetch_res_offset, peeled_kc, pk, cols, depth, packet_cols4);
1897    }
1898 //---------- Process LhsProgressQuarter rows at once ----------
1899 if((LhsProgressQuarter < LhsProgressHalf) && mr>=LhsProgressQuarter)
1900 {
1901      lhs_process_fraction_of_packet<nr, LhsProgressQuarter, RhsProgressQuarter, LhsScalar, RhsScalar, ResScalar, AccPacketQuarter, LhsPacketQuarter, RhsPacketQuarter, ResPacketQuarter, QuarterTraits, LinearMapper, DataMapper> p;
1902      p(res, blockA, blockB, alpha, peeled_mc_half, peeled_mc_quarter, strideA, strideB, offsetA, offsetB, prefetch_res_offset, peeled_kc, pk, cols, depth, packet_cols4);
1903    }
1904 //---------- Process remaining rows, 1 at once ----------
1905    if(peeled_mc_quarter<rows)
1906    {
1907 // loop on each panel of the rhs
1908 for(Index j2=0; j2<packet_cols4; j2+=nr)
1909 {
1910 // loop on each row of the lhs (1*LhsProgress x depth)
1911 for(Index i=peeled_mc_quarter; i<rows; i+=1)
1912 {
1913 const LhsScalar* blA = &blockA[i*strideA+offsetA];
1914 prefetch(&blA[0]);
1915 const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
1916
1917 // If LhsProgress is 8 or 16, it assumes that there is a
1918 // half or quarter packet, respectively, of the same size as
1919 // nr (which is currently 4) for the return type.
1920          const int SResPacketHalfSize = unpacket_traits<typename unpacket_traits<SResPacket>::half>::size;
1921          const int SResPacketQuarterSize = unpacket_traits<typename unpacket_traits<typename unpacket_traits<SResPacket>::half>::half>::size;
1922          if ((SwappedTraits::LhsProgress % 4) == 0 &&
1923 (SwappedTraits::LhsProgress<=16) &&
1924 (SwappedTraits::LhsProgress!=8 || SResPacketHalfSize==nr) &&
1925 (SwappedTraits::LhsProgress!=16 || SResPacketQuarterSize==nr))
1926 {
1927 SAccPacket C0, C1, C2, C3;
1928 straits.initAcc(C0);
1929 straits.initAcc(C1);
1930 straits.initAcc(C2);
1931 straits.initAcc(C3);
1932
1933 const Index spk = (std::max)(1,SwappedTraits::LhsProgress/4);
1934 const Index endk = (depth/spk)*spk;
1935 const Index endk4 = (depth/(spk*4))*(spk*4);
1936
1937 Index k=0;
1938 for(; k<endk4; k+=4*spk)
1939 {
1940              SLhsPacket A0,A1;
1941              SRhsPacket B_0,B_1;
1942
1943 straits.loadLhsUnaligned(blB+0*SwappedTraits::LhsProgress, A0);
1944 straits.loadLhsUnaligned(blB+1*SwappedTraits::LhsProgress, A1);
1945
1946 straits.loadRhsQuad(blA+0*spk, B_0);
1947 straits.loadRhsQuad(blA+1*spk, B_1);
1948 straits.madd(A0,B_0,C0,B_0, fix<0>);
1949 straits.madd(A1,B_1,C1,B_1, fix<0>);
1950
1951 straits.loadLhsUnaligned(blB+2*SwappedTraits::LhsProgress, A0);
1952 straits.loadLhsUnaligned(blB+3*SwappedTraits::LhsProgress, A1);
1953 straits.loadRhsQuad(blA+2*spk, B_0);
1954 straits.loadRhsQuad(blA+3*spk, B_1);
1955 straits.madd(A0,B_0,C2,B_0, fix<0>);
1956 straits.madd(A1,B_1,C3,B_1, fix<0>);
1957
1958 blB += 4*SwappedTraits::LhsProgress;
1959 blA += 4*spk;
1960 }
1961 C0 = padd(padd(C0,C1),padd(C2,C3));
1962 for(; k<endk; k+=spk)
1963 {
1964 SLhsPacket A0;
1965 SRhsPacket B_0;
1966
1967 straits.loadLhsUnaligned(blB, A0);
1968 straits.loadRhsQuad(blA, B_0);
1969 straits.madd(A0,B_0,C0,B_0, fix<0>);
1970
1971 blB += SwappedTraits::LhsProgress;
1972 blA += spk;
1973 }
1974 if(SwappedTraits::LhsProgress==8)
1975 {
 1976 // Special case where we have to first reduce the accumulation register C0
 1977 typedef typename conditional<SwappedTraits::LhsProgress>=8,typename unpacket_traits<SResPacket>::half,SResPacket>::type SResPacketHalf;
 1978 typedef typename conditional<SwappedTraits::LhsProgress>=8,typename unpacket_traits<SLhsPacket>::half,SLhsPacket>::type SLhsPacketHalf;
 1979 typedef typename conditional<SwappedTraits::LhsProgress>=8,typename unpacket_traits<SRhsPacket>::half,SRhsPacket>::type SRhsPacketHalf;
 1980 typedef typename conditional<SwappedTraits::LhsProgress>=8,typename unpacket_traits<SAccPacket>::half,SAccPacket>::type SAccPacketHalf;
 1981
 1982 SResPacketHalf R = res.template gatherPacket<SResPacketHalf>(i, j2);
 1983 SResPacketHalf alphav = pset1<SResPacketHalf>(alpha);
 1984
 1985 if(depth-endk>0)
 1986 {
 1987 // We have to handle the last row of the rhs which corresponds to a half-packet
 1988 SLhsPacketHalf a0;
 1989 SRhsPacketHalf b0;
 1990 straits.loadLhsUnaligned(blB, a0);
 1991 straits.loadRhs(blA, b0);
 1992 SAccPacketHalf c0 = predux_half_dowto4(C0);
 1993 straits.madd(a0,b0,c0,b0, fix<0>);
 1994 straits.acc(c0, alphav, R);
 1995 }
 1996 else
 1997 {
 1998 straits.acc(predux_half_dowto4(C0), alphav, R);
 1999 }
2000 res.scatterPacket(i, j2, R);
2001 }
2002 else if (SwappedTraits::LhsProgress==16)
2003 {
2004 // Special case where we have to first reduce the
2005 // accumulation register C0. We specialize the block in
2006 // template form, so that LhsProgress < 16 paths don't
 2007 // fail to compile
 2008 last_row_process_16_packets<LhsScalar, RhsScalar, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs> p;
 2009 p(res, straits, blA, blB, depth, endk, i, j2,alpha, C0);
2010 }
2011 else
 2012 {
 2013 SResPacket R = res.template gatherPacket<SResPacket>(i, j2);
 2014 SResPacket alphav = pset1<SResPacket>(alpha);
 2015 straits.acc(C0, alphav, R);
2016 res.scatterPacket(i, j2, R);
2017 }
2018 }
2019 else // scalar path
2020 {
2021 // get a 1 x 4 res block as registers
2022 ResScalar C0(0), C1(0), C2(0), C3(0);
2023
2024 for(Index k=0; k<depth; k++)
2025 {
2026 LhsScalar A0;
2027 RhsScalar B_0, B_1;
2028
2029 A0 = blA[k];
2030
2031 B_0 = blB[0];
2032 B_1 = blB[1];
2033 C0 = cj.pmadd(A0,B_0,C0);
2034 C1 = cj.pmadd(A0,B_1,C1);
2035
2036 B_0 = blB[2];
2037 B_1 = blB[3];
2038 C2 = cj.pmadd(A0,B_0,C2);
2039 C3 = cj.pmadd(A0,B_1,C3);
2040
2041 blB += 4;
2042 }
2043 res(i, j2 + 0) += alpha * C0;
2044 res(i, j2 + 1) += alpha * C1;
2045 res(i, j2 + 2) += alpha * C2;
2046 res(i, j2 + 3) += alpha * C3;
2047 }
2048 }
2049 }
2050 // remaining columns
2051 for(Index j2=packet_cols4; j2<cols; j2++)
2052 {
2053 // loop on each row of the lhs (1*LhsProgress x depth)
2054 for(Index i=peeled_mc_quarter; i<rows; i+=1)
2055 {
2056 const LhsScalar* blA = &blockA[i*strideA+offsetA];
2057 prefetch(&blA[0]);
2058 // gets a 1 x 1 res block as registers
2059 ResScalar C0(0);
2060 const RhsScalar* blB = &blockB[j2*strideB+offsetB];
2061 for(Index k=0; k<depth; k++)
2062 {
2063 LhsScalar A0 = blA[k];
2064 RhsScalar B_0 = blB[k];
2065 C0 = cj.pmadd(A0, B_0, C0);
2066 }
2067 res(i, j2) += alpha * C0;
2068 }
2069 }
2070 }
2071 }
2072
2073
2074// pack a block of the lhs
2075// The traversal is as follows (mr==4):
2076// 0 4 8 12 ...
2077// 1 5 9 13 ...
2078// 2 6 10 14 ...
2079// 3 7 11 15 ...
2080//
2081// 16 20 24 28 ...
2082// 17 21 25 29 ...
2083// 18 22 26 30 ...
2084// 19 23 27 31 ...
2085//
2086// 32 33 34 35 ...
2087// 36 37 38 39 ...
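//
// As an illustration (a hedged sketch, not part of this header): a
// standalone analogue of this traversal for mr==4 on a column-major lhs
// could be written as
//
//   void toy_pack_lhs_mr4(const float* lhs, int rows, int depth, float* blockA)
//   {
//     int count = 0;
//     for(int i=0; i+4<=rows; i+=4)        // one 4-row strip at a time
//       for(int k=0; k<depth; ++k)         // sweep the strip column by column
//         for(int w=0; w<4; ++w)
//           blockA[count++] = lhs[(i+w) + k*rows];
//     for(int i=(rows/4)*4; i<rows; ++i)   // leftover rows, one at a time
//       for(int k=0; k<depth; ++k)
//         blockA[count++] = lhs[i + k*rows];
//   }
//
// which reproduces the 0 4 8 12 / 1 5 9 13 numbering above. toy_pack_lhs_mr4
// is a hypothetical helper for exposition, not an Eigen function.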
2088template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
2089struct gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>
2090{
2091 typedef typename DataMapper::LinearMapper LinearMapper;
2092 EIGEN_DONT_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
2093};
2094
2095template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
2096EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>
2097 ::operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
2098{
2099 typedef typename unpacket_traits<Packet>::half HalfPacket;
2100 typedef typename unpacket_traits<typename unpacket_traits<Packet>::half>::half QuarterPacket;
2101 enum { PacketSize = unpacket_traits<Packet>::size,
2102 HalfPacketSize = unpacket_traits<HalfPacket>::size,
2103 QuarterPacketSize = unpacket_traits<QuarterPacket>::size,
2104 HasHalf = (int)HalfPacketSize < (int)PacketSize,
2105 HasQuarter = (int)QuarterPacketSize < (int)HalfPacketSize};
2106
2107 EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK LHS");
 2108 EIGEN_UNUSED_VARIABLE(stride);
 2109 EIGEN_UNUSED_VARIABLE(offset);
 2110 eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
 2111 eigen_assert( ((Pack1%PacketSize)==0 && Pack1<=4*PacketSize) || (Pack1<=4) );
 2112 conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
 2113 Index count = 0;
2114
2115 const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0;
2116 const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0;
2117 const Index peeled_mc1 = Pack1>=1*PacketSize ? peeled_mc2+((rows-peeled_mc2)/(1*PacketSize))*(1*PacketSize) : 0;
2118 const Index peeled_mc_half = Pack1>=HalfPacketSize ? peeled_mc1+((rows-peeled_mc1)/(HalfPacketSize))*(HalfPacketSize) : 0;
 2119 const Index peeled_mc_quarter = Pack1>=QuarterPacketSize ? (rows/(QuarterPacketSize))*(QuarterPacketSize) : 0;
 2120 const Index last_lhs_progress = rows > peeled_mc_quarter ? (rows - peeled_mc_quarter) & ~1 : 0;
 2121 const Index peeled_mc0 = Pack2>=PacketSize ? peeled_mc_quarter
 2122 : Pack2>1 && last_lhs_progress ? (rows/last_lhs_progress)*last_lhs_progress : 0;
 2123
2124 Index i=0;
2125
2126 // Pack 3 packets
2127 if(Pack1>=3*PacketSize)
2128 {
2129 for(; i<peeled_mc3; i+=3*PacketSize)
2130 {
2131 if(PanelMode) count += (3*PacketSize) * offset;
2132
2133 for(Index k=0; k<depth; k++)
2134 {
2135 Packet A, B, C;
2136 A = lhs.template loadPacket<Packet>(i+0*PacketSize, k);
2137 B = lhs.template loadPacket<Packet>(i+1*PacketSize, k);
2138 C = lhs.template loadPacket<Packet>(i+2*PacketSize, k);
2139 pstore(blockA+count, cj.pconj(A)); count+=PacketSize;
2140 pstore(blockA+count, cj.pconj(B)); count+=PacketSize;
2141 pstore(blockA+count, cj.pconj(C)); count+=PacketSize;
2142 }
2143 if(PanelMode) count += (3*PacketSize) * (stride-offset-depth);
2144 }
2145 }
2146 // Pack 2 packets
2147 if(Pack1>=2*PacketSize)
2148 {
2149 for(; i<peeled_mc2; i+=2*PacketSize)
2150 {
2151 if(PanelMode) count += (2*PacketSize) * offset;
2152
2153 for(Index k=0; k<depth; k++)
2154 {
2155 Packet A, B;
2156 A = lhs.template loadPacket<Packet>(i+0*PacketSize, k);
2157 B = lhs.template loadPacket<Packet>(i+1*PacketSize, k);
2158 pstore(blockA+count, cj.pconj(A)); count+=PacketSize;
2159 pstore(blockA+count, cj.pconj(B)); count+=PacketSize;
2160 }
2161 if(PanelMode) count += (2*PacketSize) * (stride-offset-depth);
2162 }
2163 }
2164 // Pack 1 packets
2165 if(Pack1>=1*PacketSize)
2166 {
2167 for(; i<peeled_mc1; i+=1*PacketSize)
2168 {
2169 if(PanelMode) count += (1*PacketSize) * offset;
2170
2171 for(Index k=0; k<depth; k++)
2172 {
2173 Packet A;
2174 A = lhs.template loadPacket<Packet>(i+0*PacketSize, k);
2175 pstore(blockA+count, cj.pconj(A));
2176 count+=PacketSize;
2177 }
2178 if(PanelMode) count += (1*PacketSize) * (stride-offset-depth);
2179 }
2180 }
2181 // Pack half packets
2182 if(HasHalf && Pack1>=HalfPacketSize)
2183 {
2184 for(; i<peeled_mc_half; i+=HalfPacketSize)
2185 {
2186 if(PanelMode) count += (HalfPacketSize) * offset;
2187
2188 for(Index k=0; k<depth; k++)
2189 {
2190 HalfPacket A;
2191 A = lhs.template loadPacket<HalfPacket>(i+0*(HalfPacketSize), k);
2192 pstoreu(blockA+count, cj.pconj(A));
2193 count+=HalfPacketSize;
2194 }
2195 if(PanelMode) count += (HalfPacketSize) * (stride-offset-depth);
2196 }
2197 }
2198 // Pack quarter packets
2199 if(HasQuarter && Pack1>=QuarterPacketSize)
2200 {
2201 for(; i<peeled_mc_quarter; i+=QuarterPacketSize)
2202 {
2203 if(PanelMode) count += (QuarterPacketSize) * offset;
2204
2205 for(Index k=0; k<depth; k++)
2206 {
2207 QuarterPacket A;
2208 A = lhs.template loadPacket<QuarterPacket>(i+0*(QuarterPacketSize), k);
2209 pstoreu(blockA+count, cj.pconj(A));
2210 count+=QuarterPacketSize;
2211 }
2212 if(PanelMode) count += (QuarterPacketSize) * (stride-offset-depth);
2213 }
2214 }
2215 // Pack2 may be *smaller* than PacketSize—that happens for
2216 // products like real * complex, where we have to go half the
2217 // progress on the lhs in order to duplicate those operands to
2218 // address both real & imaginary parts on the rhs. This portion will
2219 // pack those half ones until they match the number expected on the
2220 // last peeling loop at this point (for the rhs).
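 // Concretely (a reading of the loop below, nothing new): whenever
 // 1 < Pack2 < PacketSize, last_lhs_progress is an even number of
 // leftover rows, and each iteration copies that many rows scalar by
 // scalar for every k before the PanelMode stride adjustment.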
 2221 if(Pack2<PacketSize && Pack2>1)
 2222 {
 2223 for(; i<peeled_mc0; i+=last_lhs_progress)
 2224 {
 2225 if(PanelMode) count += last_lhs_progress * offset;
 2226
2227 for(Index k=0; k<depth; k++)
2228 for(Index w=0; w<last_lhs_progress; w++)
2229 blockA[count++] = cj(lhs(i+w, k));
2230
2231 if(PanelMode) count += last_lhs_progress * (stride-offset-depth);
2232 }
2233 }
2234 // Pack scalars
2235 for(; i<rows; i++)
2236 {
2237 if(PanelMode) count += offset;
2238 for(Index k=0; k<depth; k++)
2239 blockA[count++] = cj(lhs(i, k));
2240 if(PanelMode) count += (stride-offset-depth);
2241 }
2242}
2243
2244template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
2245struct gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>
2246{
2247 typedef typename DataMapper::LinearMapper LinearMapper;
2248 EIGEN_DONT_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
2249};
2250
2251template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
2252EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>
2253 ::operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
2254{
2255 typedef typename unpacket_traits<Packet>::half HalfPacket;
2256 typedef typename unpacket_traits<typename unpacket_traits<Packet>::half>::half QuarterPacket;
2257 enum { PacketSize = unpacket_traits<Packet>::size,
2258 HalfPacketSize = unpacket_traits<HalfPacket>::size,
2259 QuarterPacketSize = unpacket_traits<QuarterPacket>::size,
2260 HasHalf = (int)HalfPacketSize < (int)PacketSize,
2261 HasQuarter = (int)QuarterPacketSize < (int)HalfPacketSize};
2262
2263 EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK LHS");
 2264 EIGEN_UNUSED_VARIABLE(stride);
 2265 EIGEN_UNUSED_VARIABLE(offset);
 2266 eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
 2267 conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
 2268 Index count = 0;
2269 bool gone_half = false, gone_quarter = false, gone_last = false;
2270
2271 Index i = 0;
2272 int pack = Pack1;
2273 int psize = PacketSize;
2274 while(pack>0)
2275 {
 2276 Index remaining_rows = rows-i;
 2277 Index peeled_mc = gone_last ? Pack2>1 ? (rows/pack)*pack : 0 : i+(remaining_rows/pack)*pack;
 2278 Index starting_pos = i;
 2279 for(; i<peeled_mc; i+=pack)
2280 {
2281 if(PanelMode) count += pack * offset;
2282
2283 Index k=0;
2284 if(pack>=psize && psize >= QuarterPacketSize)
2285 {
2286 const Index peeled_k = (depth/psize)*psize;
2287 for(; k<peeled_k; k+=psize)
2288 {
2289 for (Index m = 0; m < pack; m += psize)
2290 {
2291 if (psize == PacketSize) {
2292 PacketBlock<Packet> kernel;
2293 for (int p = 0; p < psize; ++p) kernel.packet[p] = lhs.template loadPacket<Packet>(i+p+m, k);
2294 ptranspose(kernel);
2295 for (int p = 0; p < psize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel.packet[p]));
2296 } else if (HasHalf && psize == HalfPacketSize) {
 2297 gone_half = true;
 2298 PacketBlock<HalfPacket> kernel_half;
 2299 for (int p = 0; p < psize; ++p) kernel_half.packet[p] = lhs.template loadPacket<HalfPacket>(i+p+m, k);
 2300 ptranspose(kernel_half);
 2301 for (int p = 0; p < psize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel_half.packet[p]));
2302 } else if (HasQuarter && psize == QuarterPacketSize) {
 2303 gone_quarter = true;
 2304 PacketBlock<QuarterPacket> kernel_quarter;
 2305 for (int p = 0; p < psize; ++p) kernel_quarter.packet[p] = lhs.template loadPacket<QuarterPacket>(i+p+m, k);
 2306 ptranspose(kernel_quarter);
 2307 for (int p = 0; p < psize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel_quarter.packet[p]));
2308 }
2309 }
2310 count += psize*pack;
2311 }
2312 }
2313
2314 for(; k<depth; k++)
2315 {
2316 Index w=0;
2317 for(; w<pack-3; w+=4)
2318 {
2319 Scalar a(cj(lhs(i+w+0, k))),
2320 b(cj(lhs(i+w+1, k))),
2321 c(cj(lhs(i+w+2, k))),
2322 d(cj(lhs(i+w+3, k)));
2323 blockA[count++] = a;
2324 blockA[count++] = b;
2325 blockA[count++] = c;
2326 blockA[count++] = d;
2327 }
2328 if(pack%4)
2329 for(;w<pack;++w)
2330 blockA[count++] = cj(lhs(i+w, k));
2331 }
2332
2333 if(PanelMode) count += pack * (stride-offset-depth);
2334 }
2335
2336 pack -= psize;
2337 Index left = rows - i;
2338 if (pack <= 0) {
2339 if (!gone_last &&
2340 (starting_pos == i || left >= psize/2 || left >= psize/4) &&
2341 ((psize/2 == HalfPacketSize && HasHalf && !gone_half) ||
2342 (psize/2 == QuarterPacketSize && HasQuarter && !gone_quarter))) {
2343 psize /= 2;
2344 pack = psize;
2345 continue;
2346 }
2347 // Pack2 may be *smaller* than PacketSize—that happens for
2348 // products like real * complex, where we have to go half the
2349 // progress on the lhs in order to duplicate those operands to
2350 // address both real & imaginary parts on the rhs. This portion will
2351 // pack those half ones until they match the number expected on the
2352 // last peeling loop at this point (for the rhs).
2353 if (Pack2 < PacketSize && !gone_last) {
2354 gone_last = true;
2355 psize = pack = left & ~1;
2356 }
2357 }
2358 }
2359
2360 for(; i<rows; i++)
2361 {
2362 if(PanelMode) count += offset;
2363 for(Index k=0; k<depth; k++)
2364 blockA[count++] = cj(lhs(i, k));
2365 if(PanelMode) count += (stride-offset-depth);
2366 }
2367}
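// A worked trace of the degradation logic above (hedged illustration,
// assuming PacketSize==16 with HalfPacketSize==8 and QuarterPacketSize==4,
// e.g. AVX512 floats, and Pack1==16): full 16-row strips are packed first;
// once fewer than 16 rows remain, psize is halved to 8 (gone_half) and then
// to 4 (gone_quarter), so the tail is packed with half- and quarter-packet
// transposes before the final per-row scalar loop picks up the rest.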
2368
2369// copy a complete panel of the rhs
2370// this version is optimized for column major matrices
2371// The traversal order is as follows (nr==4):
2372// 0 1 2 3 12 13 14 15 24 27
2373// 4 5 6 7 16 17 18 19 25 28
2374// 8 9 10 11 20 21 22 23 26 29
2375// . . . . . . . . . .
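// (The numbers give the order in which the coefficients of a depth x cols
// rhs panel are written into blockB: four columns are interleaved one
// k-row at a time, then the next group of four columns, and the trailing
// nr==1 leftover columns are copied whole, one after another.)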
2376template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
2377struct gemm_pack_rhs<Scalar, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
2378{
 2379 typedef typename packet_traits<Scalar>::type Packet;
 2380 typedef typename DataMapper::LinearMapper LinearMapper;
2381 enum { PacketSize = packet_traits<Scalar>::size };
2382 EIGEN_DONT_INLINE void operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
2383};
2384
2385template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
2386EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
2387 ::operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
2388{
2389 EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS COLMAJOR");
 2390 EIGEN_UNUSED_VARIABLE(stride);
 2391 EIGEN_UNUSED_VARIABLE(offset);
 2392 eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
 2393 conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
 2394 Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0;
2395 Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;
2396 Index count = 0;
2397 const Index peeled_k = (depth/PacketSize)*PacketSize;
2398// if(nr>=8)
2399// {
2400// for(Index j2=0; j2<packet_cols8; j2+=8)
2401// {
2402// // skip what we have before
2403// if(PanelMode) count += 8 * offset;
2404// const Scalar* b0 = &rhs[(j2+0)*rhsStride];
2405// const Scalar* b1 = &rhs[(j2+1)*rhsStride];
2406// const Scalar* b2 = &rhs[(j2+2)*rhsStride];
2407// const Scalar* b3 = &rhs[(j2+3)*rhsStride];
2408// const Scalar* b4 = &rhs[(j2+4)*rhsStride];
2409// const Scalar* b5 = &rhs[(j2+5)*rhsStride];
2410// const Scalar* b6 = &rhs[(j2+6)*rhsStride];
2411// const Scalar* b7 = &rhs[(j2+7)*rhsStride];
2412// Index k=0;
2413// if(PacketSize==8) // TODO enable vectorized transposition for PacketSize==4
2414// {
2415// for(; k<peeled_k; k+=PacketSize) {
2416// PacketBlock<Packet> kernel;
2417// for (int p = 0; p < PacketSize; ++p) {
2418// kernel.packet[p] = ploadu<Packet>(&rhs[(j2+p)*rhsStride+k]);
2419// }
2420// ptranspose(kernel);
2421// for (int p = 0; p < PacketSize; ++p) {
2422// pstoreu(blockB+count, cj.pconj(kernel.packet[p]));
2423// count+=PacketSize;
2424// }
2425// }
2426// }
2427// for(; k<depth; k++)
2428// {
2429// blockB[count+0] = cj(b0[k]);
2430// blockB[count+1] = cj(b1[k]);
2431// blockB[count+2] = cj(b2[k]);
2432// blockB[count+3] = cj(b3[k]);
2433// blockB[count+4] = cj(b4[k]);
2434// blockB[count+5] = cj(b5[k]);
2435// blockB[count+6] = cj(b6[k]);
2436// blockB[count+7] = cj(b7[k]);
2437// count += 8;
2438// }
2439// // skip what we have after
2440// if(PanelMode) count += 8 * (stride-offset-depth);
2441// }
2442// }
2443
2444 if(nr>=4)
 2445 {
 2446 for(Index j2=0; j2<packet_cols4; j2+=4)
 2447 {
2448 // skip what we have before
2449 if(PanelMode) count += 4 * offset;
2450 const LinearMapper dm0 = rhs.getLinearMapper(0, j2 + 0);
2451 const LinearMapper dm1 = rhs.getLinearMapper(0, j2 + 1);
2452 const LinearMapper dm2 = rhs.getLinearMapper(0, j2 + 2);
2453 const LinearMapper dm3 = rhs.getLinearMapper(0, j2 + 3);
2454
2455 Index k=0;
2456 if((PacketSize%4)==0) // TODO enable vectorized transposition for PacketSize==2 ??
2457 {
2458 for(; k<peeled_k; k+=PacketSize) {
2459 PacketBlock<Packet,(PacketSize%4)==0?4:PacketSize> kernel;
2460 kernel.packet[0 ] = dm0.template loadPacket<Packet>(k);
2461 kernel.packet[1%PacketSize] = dm1.template loadPacket<Packet>(k);
2462 kernel.packet[2%PacketSize] = dm2.template loadPacket<Packet>(k);
2463 kernel.packet[3%PacketSize] = dm3.template loadPacket<Packet>(k);
2464 ptranspose(kernel);
2465 pstoreu(blockB+count+0*PacketSize, cj.pconj(kernel.packet[0]));
2466 pstoreu(blockB+count+1*PacketSize, cj.pconj(kernel.packet[1%PacketSize]));
2467 pstoreu(blockB+count+2*PacketSize, cj.pconj(kernel.packet[2%PacketSize]));
2468 pstoreu(blockB+count+3*PacketSize, cj.pconj(kernel.packet[3%PacketSize]));
2469 count+=4*PacketSize;
2470 }
2471 }
2472 for(; k<depth; k++)
2473 {
2474 blockB[count+0] = cj(dm0(k));
2475 blockB[count+1] = cj(dm1(k));
2476 blockB[count+2] = cj(dm2(k));
2477 blockB[count+3] = cj(dm3(k));
2478 count += 4;
2479 }
2480 // skip what we have after
2481 if(PanelMode) count += 4 * (stride-offset-depth);
2482 }
2483 }
2484
2485 // copy the remaining columns one at a time (nr==1)
2486 for(Index j2=packet_cols4; j2<cols; ++j2)
2487 {
2488 if(PanelMode) count += offset;
2489 const LinearMapper dm0 = rhs.getLinearMapper(0, j2);
2490 for(Index k=0; k<depth; k++)
2491 {
2492 blockB[count] = cj(dm0(k));
2493 count += 1;
2494 }
2495 if(PanelMode) count += (stride-offset-depth);
2496 }
2497}
2498
2499// this version is optimized for row major matrices
2500template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
2501struct gemm_pack_rhs<Scalar, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
2502{
 2503 typedef typename packet_traits<Scalar>::type Packet;
 2504 typedef typename unpacket_traits<Packet>::half HalfPacket;
 2505 typedef typename unpacket_traits<typename unpacket_traits<Packet>::half>::half QuarterPacket;
 2506 typedef typename DataMapper::LinearMapper LinearMapper;
 2507 enum { PacketSize = packet_traits<Scalar>::size,
 2508 HalfPacketSize = unpacket_traits<HalfPacket>::size,
 2509 QuarterPacketSize = unpacket_traits<QuarterPacket>::size};
 2510 EIGEN_DONT_INLINE void operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0)
 2511 {
2512 EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS ROWMAJOR");
 2513 EIGEN_UNUSED_VARIABLE(stride);
 2514 EIGEN_UNUSED_VARIABLE(offset);
 2515 eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
2516 const bool HasHalf = (int)HalfPacketSize < (int)PacketSize;
 2517 const bool HasQuarter = (int)QuarterPacketSize < (int)HalfPacketSize;
 2518 conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
 2519 Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0;
2520 Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;
2521 Index count = 0;
2522
2523 // if(nr>=8)
2524 // {
2525 // for(Index j2=0; j2<packet_cols8; j2+=8)
2526 // {
2527 // // skip what we have before
2528 // if(PanelMode) count += 8 * offset;
2529 // for(Index k=0; k<depth; k++)
2530 // {
2531 // if (PacketSize==8) {
2532 // Packet A = ploadu<Packet>(&rhs[k*rhsStride + j2]);
2533 // pstoreu(blockB+count, cj.pconj(A));
2534 // } else if (PacketSize==4) {
2535 // Packet A = ploadu<Packet>(&rhs[k*rhsStride + j2]);
2536 // Packet B = ploadu<Packet>(&rhs[k*rhsStride + j2 + PacketSize]);
2537 // pstoreu(blockB+count, cj.pconj(A));
2538 // pstoreu(blockB+count+PacketSize, cj.pconj(B));
2539 // } else {
2540 // const Scalar* b0 = &rhs[k*rhsStride + j2];
2541 // blockB[count+0] = cj(b0[0]);
2542 // blockB[count+1] = cj(b0[1]);
2543 // blockB[count+2] = cj(b0[2]);
2544 // blockB[count+3] = cj(b0[3]);
2545 // blockB[count+4] = cj(b0[4]);
2546 // blockB[count+5] = cj(b0[5]);
2547 // blockB[count+6] = cj(b0[6]);
2548 // blockB[count+7] = cj(b0[7]);
2549 // }
2550 // count += 8;
2551 // }
2552 // // skip what we have after
2553 // if(PanelMode) count += 8 * (stride-offset-depth);
2554 // }
2555 // }
2556 if(nr>=4)
 2557 {
 2558 for(Index j2=0; j2<packet_cols4; j2+=4)
 2559 {
2560 // skip what we have before
2561 if(PanelMode) count += 4 * offset;
2562 for(Index k=0; k<depth; k++)
2563 {
2564 if (PacketSize==4) {
2565 Packet A = rhs.template loadPacket<Packet>(k, j2);
2566 pstoreu(blockB+count, cj.pconj(A));
2567 count += PacketSize;
2568 } else if (HasHalf && HalfPacketSize==4) {
2569 HalfPacket A = rhs.template loadPacket<HalfPacket>(k, j2);
2570 pstoreu(blockB+count, cj.pconj(A));
2571 count += HalfPacketSize;
2572 } else if (HasQuarter && QuarterPacketSize==4) {
2573 QuarterPacket A = rhs.template loadPacket<QuarterPacket>(k, j2);
2574 pstoreu(blockB+count, cj.pconj(A));
2575 count += QuarterPacketSize;
2576 } else {
2577 const LinearMapper dm0 = rhs.getLinearMapper(k, j2);
2578 blockB[count+0] = cj(dm0(0));
2579 blockB[count+1] = cj(dm0(1));
2580 blockB[count+2] = cj(dm0(2));
2581 blockB[count+3] = cj(dm0(3));
2582 count += 4;
2583 }
2584 }
2585 // skip what we have after
2586 if(PanelMode) count += 4 * (stride-offset-depth);
2587 }
2588 }
2589 // copy the remaining columns one at a time (nr==1)
2590 for(Index j2=packet_cols4; j2<cols; ++j2)
2591 {
2592 if(PanelMode) count += offset;
2593 for(Index k=0; k<depth; k++)
2594 {
2595 blockB[count] = cj(rhs(k, j2));
2596 count += 1;
2597 }
2598 if(PanelMode) count += stride-offset-depth;
2599 }
2600 }
2601};
2602
2603} // end namespace internal
2604
2607inline std::ptrdiff_t l1CacheSize()
2608{
 2609 std::ptrdiff_t l1, l2, l3;
 2610 internal::manage_caching_sizes(GetAction, &l1, &l2, &l3);
 2611 return l1;
2612}
2613
2616inline std::ptrdiff_t l2CacheSize()
2617{
 2618 std::ptrdiff_t l1, l2, l3;
 2619 internal::manage_caching_sizes(GetAction, &l1, &l2, &l3);
 2620 return l2;
2621}
2622
2626inline std::ptrdiff_t l3CacheSize()
2627{
 2628 std::ptrdiff_t l1, l2, l3;
 2629 internal::manage_caching_sizes(GetAction, &l1, &l2, &l3);
 2630 return l3;
2631}
2632
2638inline void setCpuCacheSizes(std::ptrdiff_t l1, std::ptrdiff_t l2, std::ptrdiff_t l3)
2639{
 2640 internal::manage_caching_sizes(SetAction, &l1, &l2, &l3);
2641}
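// Usage sketch (illustrative values only):
//   std::ptrdiff_t l1 = Eigen::l1CacheSize(); // query the current setting
//   Eigen::setCpuCacheSizes(32*1024, 512*1024, 8*1024*1024); // override all three levels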
2642
2643} // end namespace Eigen
2644
2645#endif // EIGEN_GENERAL_BLOCK_PANEL_H