TR-mbed 1.0
Loading...
Searching...
No Matches
MatrixProduct.h
Go to the documentation of this file.
1// This file is part of Eigen, a lightweight C++ template library
2// for linear algebra.
3//
4// Copyright (C) 2020 Everton Constantino (everton.constantino@ibm.com)
5// Copyright (C) 2021 Chip Kerchner (chip.kerchner@ibm.com)
6//
7// This Source Code Form is subject to the terms of the Mozilla
8// Public License v. 2.0. If a copy of the MPL was not distributed
9// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
10
11#ifndef EIGEN_MATRIX_PRODUCT_ALTIVEC_H
12#define EIGEN_MATRIX_PRODUCT_ALTIVEC_H
13
14#ifndef EIGEN_ALTIVEC_USE_CUSTOM_PACK
15#define EIGEN_ALTIVEC_USE_CUSTOM_PACK 1
16#endif
17
18#include "MatrixProductCommon.h"
19
20// Since LLVM doesn't support dynamic dispatching, force either always MMA or VSX
21#if EIGEN_COMP_LLVM
22#if !defined(EIGEN_ALTIVEC_DISABLE_MMA) && !defined(EIGEN_ALTIVEC_MMA_ONLY)
23#ifdef __MMA__
24#define EIGEN_ALTIVEC_MMA_ONLY
25#else
26#define EIGEN_ALTIVEC_DISABLE_MMA
27#endif
28#endif
29#endif
30
31#ifdef __has_builtin
32#if __has_builtin(__builtin_mma_assemble_acc)
33 #define ALTIVEC_MMA_SUPPORT
34#endif
35#endif
36
37#if defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA)
38 #include "MatrixProductMMA.h"
39#endif
40
41/**************************************************************************************************
42 * TODO *
43 * - Check StorageOrder on dhs_pack (the innermost second loop seems unvectorized when it could). *
44 * - Check the possibility of transposing as GETREAL and GETIMAG when needed. *
45 **************************************************************************************************/
46namespace Eigen {
47
48namespace internal {
49
50/**************************
51 * Constants and typedefs *
52 **************************/
53template<typename Scalar>
66
67template<>
80
// MatrixProduct decomposes real/imaginary vectors into a real vector and an imaginary vector; this turned out
// to be faster than Eigen's usual approach of keeping real/imaginary pairs in a single vector. The constants below
// are responsible for extracting from, and converting between, Eigen's layout and MatrixProduct's layout.
84
85const static Packet16uc p16uc_GETREAL32 = { 0, 1, 2, 3,
86 8, 9, 10, 11,
87 16, 17, 18, 19,
88 24, 25, 26, 27};
89
90const static Packet16uc p16uc_GETIMAG32 = { 4, 5, 6, 7,
91 12, 13, 14, 15,
92 20, 21, 22, 23,
93 28, 29, 30, 31};
94const static Packet16uc p16uc_GETREAL64 = { 0, 1, 2, 3, 4, 5, 6, 7,
95 16, 17, 18, 19, 20, 21, 22, 23};
96
97//[a,ai],[b,bi] = [ai,bi]
98const static Packet16uc p16uc_GETIMAG64 = { 8, 9, 10, 11, 12, 13, 14, 15,
99 24, 25, 26, 27, 28, 29, 30, 31};
100
101/*********************************************
102 * Single precision real and complex packing *
103 * *******************************************/
104
119template<typename Scalar, typename Index, int StorageOrder>
120EIGEN_ALWAYS_INLINE std::complex<Scalar> getAdjointVal(Index i, Index j, const_blas_data_mapper<std::complex<Scalar>, Index, StorageOrder>& dt)
121{
122 std::complex<Scalar> v;
123 if(i < j)
124 {
125 v.real( dt(j,i).real());
126 v.imag(-dt(j,i).imag());
127 } else if(i > j)
128 {
129 v.real( dt(i,j).real());
130 v.imag( dt(i,j).imag());
131 } else {
132 v.real( dt(i,j).real());
133 v.imag((Scalar)0.0);
134 }
135 return v;
136}
137
138template<typename Scalar, typename Index, int StorageOrder, int N>
139EIGEN_STRONG_INLINE void symm_pack_complex_rhs_helper(std::complex<Scalar>* blockB, const std::complex<Scalar>* _rhs, Index rhsStride, Index rows, Index cols, Index k2)
140{
141 const Index depth = k2 + rows;
145 Scalar* blockBf = reinterpret_cast<Scalar *>(blockB);
146
147 Index rir = 0, rii, j = 0;
148 for(; j + vectorSize <= cols; j+=vectorSize)
149 {
150 rii = rir + vectorDelta;
151
152 for(Index i = k2; i < depth; i++)
153 {
154 for(Index k = 0; k < vectorSize; k++)
155 {
156 std::complex<Scalar> v = getAdjointVal<Scalar, Index, StorageOrder>(i, j + k, rhs);
157
158 blockBf[rir + k] = v.real();
159 blockBf[rii + k] = v.imag();
160 }
161 rir += vectorSize;
162 rii += vectorSize;
163 }
164
165 rir += vectorDelta;
166 }
167 if (j < cols)
168 {
169 rii = rir + ((cols - j) * rows);
170
171 for(Index i = k2; i < depth; i++)
172 {
173 Index k = j;
174 for(; k < cols; k++)
175 {
176 std::complex<Scalar> v = getAdjointVal<Scalar, Index, StorageOrder>(i, k, rhs);
177
178 blockBf[rir] = v.real();
179 blockBf[rii] = v.imag();
180
181 rir += 1;
182 rii += 1;
183 }
184 }
185 }
186}
187
188template<typename Scalar, typename Index, int StorageOrder>
189EIGEN_STRONG_INLINE void symm_pack_complex_lhs_helper(std::complex<Scalar>* blockA, const std::complex<Scalar>* _lhs, Index lhsStride, Index cols, Index rows)
190{
191 const Index depth = cols;
195 Scalar* blockAf = (Scalar *)(blockA);
196
197 Index rir = 0, rii, j = 0;
198 for(; j + vectorSize <= rows; j+=vectorSize)
199 {
200 rii = rir + vectorDelta;
201
202 for(Index i = 0; i < depth; i++)
203 {
204 for(Index k = 0; k < vectorSize; k++)
205 {
206 std::complex<Scalar> v = getAdjointVal<Scalar, Index, StorageOrder>(j+k, i, lhs);
207
208 blockAf[rir + k] = v.real();
209 blockAf[rii + k] = v.imag();
210 }
211 rir += vectorSize;
212 rii += vectorSize;
213 }
214
215 rir += vectorDelta;
216 }
217
218 if (j < rows)
219 {
220 rii = rir + ((rows - j) * depth);
221
222 for(Index i = 0; i < depth; i++)
223 {
224 Index k = j;
225 for(; k < rows; k++)
226 {
227 std::complex<Scalar> v = getAdjointVal<Scalar, Index, StorageOrder>(k, i, lhs);
228
229 blockAf[rir] = v.real();
230 blockAf[rii] = v.imag();
231
232 rir += 1;
233 rii += 1;
234 }
235 }
236 }
237}
238
239template<typename Scalar, typename Index, int StorageOrder, int N>
241{
242 const Index depth = k2 + rows;
245
246 Index ri = 0, j = 0;
247 for(; j + N*vectorSize <= cols; j+=N*vectorSize)
248 {
249 Index i = k2;
250 for(; i < depth; i++)
251 {
252 for(Index k = 0; k < N*vectorSize; k++)
253 {
254 if(i <= j+k)
255 blockB[ri + k] = rhs(j+k, i);
256 else
257 blockB[ri + k] = rhs(i, j+k);
258 }
259 ri += N*vectorSize;
260 }
261 }
262
263 if (j < cols)
264 {
265 for(Index i = k2; i < depth; i++)
266 {
267 Index k = j;
268 for(; k < cols; k++)
269 {
270 if(k <= i)
271 blockB[ri] = rhs(i, k);
272 else
273 blockB[ri] = rhs(k, i);
274 ri += 1;
275 }
276 }
277 }
278}
279
280template<typename Scalar, typename Index, int StorageOrder>
282{
283 const Index depth = cols;
286
287 Index ri = 0, j = 0;
288 for(; j + vectorSize <= rows; j+=vectorSize)
289 {
290 Index i = 0;
291
292 for(; i < depth; i++)
293 {
294 for(Index k = 0; k < vectorSize; k++)
295 {
296 if(i <= j+k)
297 blockA[ri + k] = lhs(j+k, i);
298 else
299 blockA[ri + k] = lhs(i, j+k);
300 }
301 ri += vectorSize;
302 }
303 }
304
305 if (j < rows)
306 {
307 for(Index i = 0; i < depth; i++)
308 {
309 Index k = j;
310 for(; k < rows; k++)
311 {
312 if(i <= k)
313 blockA[ri] = lhs(k, i);
314 else
315 blockA[ri] = lhs(i, k);
316 ri += 1;
317 }
318 }
319 }
320}
321
322template<typename Index, int nr, int StorageOrder>
323struct symm_pack_rhs<std::complex<float>, Index, nr, StorageOrder>
324{
325 void operator()(std::complex<float>* blockB, const std::complex<float>* _rhs, Index rhsStride, Index rows, Index cols, Index k2)
326 {
328 }
329};
330
331template<typename Index, int Pack1, int Pack2_dummy, int StorageOrder>
332struct symm_pack_lhs<std::complex<float>, Index, Pack1, Pack2_dummy, StorageOrder>
333{
334 void operator()(std::complex<float>* blockA, const std::complex<float>* _lhs, Index lhsStride, Index cols, Index rows)
335 {
337 }
338};
339
340// *********** symm_pack std::complex<float64> ***********
341
342template<typename Index, int nr, int StorageOrder>
343struct symm_pack_rhs<std::complex<double>, Index, nr, StorageOrder>
344{
345 void operator()(std::complex<double>* blockB, const std::complex<double>* _rhs, Index rhsStride, Index rows, Index cols, Index k2)
346 {
348 }
349};
350
351template<typename Index, int Pack1, int Pack2_dummy, int StorageOrder>
352struct symm_pack_lhs<std::complex<double>, Index, Pack1, Pack2_dummy, StorageOrder>
353{
354 void operator()(std::complex<double>* blockA, const std::complex<double>* _lhs, Index lhsStride, Index cols, Index rows)
355 {
357 }
358};
359
360// *********** symm_pack float32 ***********
361template<typename Index, int nr, int StorageOrder>
362struct symm_pack_rhs<float, Index, nr, StorageOrder>
363{
368};
369
370template<typename Index, int Pack1, int Pack2_dummy, int StorageOrder>
371struct symm_pack_lhs<float, Index, Pack1, Pack2_dummy, StorageOrder>
372{
373 void operator()(float* blockA, const float* _lhs, Index lhsStride, Index cols, Index rows)
374 {
376 }
377};
378
379// *********** symm_pack float64 ***********
380template<typename Index, int nr, int StorageOrder>
381struct symm_pack_rhs<double, Index, nr, StorageOrder>
382{
387};
388
389template<typename Index, int Pack1, int Pack2_dummy, int StorageOrder>
391{
392 void operator()(double* blockA, const double* _lhs, Index lhsStride, Index cols, Index rows)
393 {
395 }
396};
397
409template<typename Scalar, typename Packet, typename Index>
411{
412 const Index size = 16 / sizeof(Scalar);
413 pstore<Scalar>(to + (0 * size), block.packet[0]);
414 pstore<Scalar>(to + (1 * size), block.packet[1]);
415 pstore<Scalar>(to + (2 * size), block.packet[2]);
416 pstore<Scalar>(to + (3 * size), block.packet[3]);
417}
418
419template<typename Scalar, typename Packet, typename Index>
421{
422 const Index size = 16 / sizeof(Scalar);
423 pstore<Scalar>(to + (0 * size), block.packet[0]);
424 pstore<Scalar>(to + (1 * size), block.packet[1]);
425}
426
427// General template for lhs & rhs complex packing.
428template<typename Scalar, typename Index, typename DataMapper, typename Packet, typename PacketC, int StorageOrder, bool Conjugate, bool PanelMode, bool UseLhs>
429struct dhs_cpack {
430 EIGEN_STRONG_INLINE void operator()(std::complex<Scalar>* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
431 {
433 const Index vectorDelta = vectorSize * ((PanelMode) ? stride : depth);
434 Index rir = ((PanelMode) ? (vectorSize*offset) : 0), rii;
435 Scalar* blockAt = reinterpret_cast<Scalar *>(blockA);
436 Index j = 0;
437
438 for(; j + vectorSize <= rows; j+=vectorSize)
439 {
440 Index i = 0;
441
442 rii = rir + vectorDelta;
443
444 for(; i + vectorSize <= depth; i+=vectorSize)
445 {
448
449 if (UseLhs) {
451 } else {
453 }
454
455 blockr.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[4].v, p16uc_GETREAL32);
456 blockr.packet[1] = vec_perm(cblock.packet[1].v, cblock.packet[5].v, p16uc_GETREAL32);
457 blockr.packet[2] = vec_perm(cblock.packet[2].v, cblock.packet[6].v, p16uc_GETREAL32);
458 blockr.packet[3] = vec_perm(cblock.packet[3].v, cblock.packet[7].v, p16uc_GETREAL32);
459
460 blocki.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[4].v, p16uc_GETIMAG32);
461 blocki.packet[1] = vec_perm(cblock.packet[1].v, cblock.packet[5].v, p16uc_GETIMAG32);
462 blocki.packet[2] = vec_perm(cblock.packet[2].v, cblock.packet[6].v, p16uc_GETIMAG32);
463 blocki.packet[3] = vec_perm(cblock.packet[3].v, cblock.packet[7].v, p16uc_GETIMAG32);
464
465 if(Conjugate)
466 {
467 blocki.packet[0] = -blocki.packet[0];
468 blocki.packet[1] = -blocki.packet[1];
469 blocki.packet[2] = -blocki.packet[2];
470 blocki.packet[3] = -blocki.packet[3];
471 }
472
473 if(((StorageOrder == RowMajor) && UseLhs) || (((StorageOrder == ColMajor) && !UseLhs)))
474 {
477 }
478
481
482 rir += 4*vectorSize;
483 rii += 4*vectorSize;
484 }
485 for(; i < depth; i++)
486 {
489
490 if(((StorageOrder == ColMajor) && UseLhs) || (((StorageOrder == RowMajor) && !UseLhs)))
491 {
492 if (UseLhs) {
493 cblock.packet[0] = lhs.template loadPacket<PacketC>(j + 0, i);
494 cblock.packet[1] = lhs.template loadPacket<PacketC>(j + 2, i);
495 } else {
496 cblock.packet[0] = lhs.template loadPacket<PacketC>(i, j + 0);
497 cblock.packet[1] = lhs.template loadPacket<PacketC>(i, j + 2);
498 }
499 } else {
500 std::complex<Scalar> lhs0, lhs1;
501 if (UseLhs) {
502 lhs0 = lhs(j + 0, i);
503 lhs1 = lhs(j + 1, i);
504 cblock.packet[0] = pload2(&lhs0, &lhs1);
505 lhs0 = lhs(j + 2, i);
506 lhs1 = lhs(j + 3, i);
507 cblock.packet[1] = pload2(&lhs0, &lhs1);
508 } else {
509 lhs0 = lhs(i, j + 0);
510 lhs1 = lhs(i, j + 1);
511 cblock.packet[0] = pload2(&lhs0, &lhs1);
512 lhs0 = lhs(i, j + 2);
513 lhs1 = lhs(i, j + 3);
514 cblock.packet[1] = pload2(&lhs0, &lhs1);
515 }
516 }
517
518 blockr.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[1].v, p16uc_GETREAL32);
519 blocki.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[1].v, p16uc_GETIMAG32);
520
521 if(Conjugate)
522 {
523 blocki.packet[0] = -blocki.packet[0];
524 }
525
526 pstore<Scalar>(blockAt + rir, blockr.packet[0]);
527 pstore<Scalar>(blockAt + rii, blocki.packet[0]);
528
529 rir += vectorSize;
530 rii += vectorSize;
531 }
532
533 rir += ((PanelMode) ? (vectorSize*(2*stride - depth)) : vectorDelta);
534 }
535
536 if (j < rows)
537 {
538 if(PanelMode) rir += (offset*(rows - j - vectorSize));
539 rii = rir + (((PanelMode) ? stride : depth) * (rows - j));
540
541 for(Index i = 0; i < depth; i++)
542 {
543 Index k = j;
544 for(; k < rows; k++)
545 {
546 if (UseLhs) {
547 blockAt[rir] = lhs(k, i).real();
548
549 if(Conjugate)
550 blockAt[rii] = -lhs(k, i).imag();
551 else
552 blockAt[rii] = lhs(k, i).imag();
553 } else {
554 blockAt[rir] = lhs(i, k).real();
555
556 if(Conjugate)
557 blockAt[rii] = -lhs(i, k).imag();
558 else
559 blockAt[rii] = lhs(i, k).imag();
560 }
561
562 rir += 1;
563 rii += 1;
564 }
565 }
566 }
567 }
568};
569
570// General template for lhs & rhs packing.
571template<typename Scalar, typename Index, typename DataMapper, typename Packet, int StorageOrder, bool PanelMode, bool UseLhs>
572struct dhs_pack{
574 {
576 Index ri = 0, j = 0;
577
578 for(; j + vectorSize <= rows; j+=vectorSize)
579 {
580 Index i = 0;
581
583
584 for(; i + vectorSize <= depth; i+=vectorSize)
585 {
587
588 if (UseLhs) {
590 } else {
592 }
593 if(((StorageOrder == RowMajor) && UseLhs) || ((StorageOrder == ColMajor) && !UseLhs))
594 {
596 }
597
599
600 ri += 4*vectorSize;
601 }
602 for(; i < depth; i++)
603 {
604 if(((StorageOrder == RowMajor) && UseLhs) || ((StorageOrder == ColMajor) && !UseLhs))
605 {
606 if (UseLhs) {
607 blockA[ri+0] = lhs(j+0, i);
608 blockA[ri+1] = lhs(j+1, i);
609 blockA[ri+2] = lhs(j+2, i);
610 blockA[ri+3] = lhs(j+3, i);
611 } else {
612 blockA[ri+0] = lhs(i, j+0);
613 blockA[ri+1] = lhs(i, j+1);
614 blockA[ri+2] = lhs(i, j+2);
615 blockA[ri+3] = lhs(i, j+3);
616 }
617 } else {
618 Packet lhsV;
619 if (UseLhs) {
620 lhsV = lhs.template loadPacket<Packet>(j, i);
621 } else {
622 lhsV = lhs.template loadPacket<Packet>(i, j);
623 }
624 pstore<Scalar>(blockA + ri, lhsV);
625 }
626
627 ri += vectorSize;
628 }
629
630 if(PanelMode) ri += vectorSize*(stride - offset - depth);
631 }
632
633 if (j < rows)
634 {
635 if(PanelMode) ri += offset*(rows - j);
636
637 for(Index i = 0; i < depth; i++)
638 {
639 Index k = j;
640 for(; k < rows; k++)
641 {
642 if (UseLhs) {
643 blockA[ri] = lhs(k, i);
644 } else {
645 blockA[ri] = lhs(i, k);
646 }
647 ri += 1;
648 }
649 }
650 }
651 }
652};
653
654// General template for lhs packing, float64 specialization.
655template<typename Index, typename DataMapper, int StorageOrder, bool PanelMode>
657{
658 EIGEN_STRONG_INLINE void operator()(double* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
659 {
661 Index ri = 0, j = 0;
662
663 for(; j + vectorSize <= rows; j+=vectorSize)
664 {
665 Index i = 0;
666
668
669 for(; i + vectorSize <= depth; i+=vectorSize)
670 {
672 if(StorageOrder == RowMajor)
673 {
674 block.packet[0] = lhs.template loadPacket<Packet2d>(j + 0, i);
675 block.packet[1] = lhs.template loadPacket<Packet2d>(j + 1, i);
676
678 } else {
679 block.packet[0] = lhs.template loadPacket<Packet2d>(j, i + 0);
680 block.packet[1] = lhs.template loadPacket<Packet2d>(j, i + 1);
681 }
682
684
685 ri += 2*vectorSize;
686 }
687 for(; i < depth; i++)
688 {
689 if(StorageOrder == RowMajor)
690 {
691 blockA[ri+0] = lhs(j+0, i);
692 blockA[ri+1] = lhs(j+1, i);
693 } else {
694 Packet2d lhsV = lhs.template loadPacket<Packet2d>(j, i);
695 pstore<double>(blockA + ri, lhsV);
696 }
697
698 ri += vectorSize;
699 }
700
701 if(PanelMode) ri += vectorSize*(stride - offset - depth);
702 }
703
704 if (j < rows)
705 {
706 if(PanelMode) ri += offset*(rows - j);
707
708 for(Index i = 0; i < depth; i++)
709 {
710 Index k = j;
711 for(; k < rows; k++)
712 {
713 blockA[ri] = lhs(k, i);
714 ri += 1;
715 }
716 }
717 }
718 }
719};
720
721// General template for rhs packing, float64 specialization.
722template<typename Index, typename DataMapper, int StorageOrder, bool PanelMode>
724{
725 EIGEN_STRONG_INLINE void operator()(double* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
726 {
728 Index ri = 0, j = 0;
729
730 for(; j + 2*vectorSize <= cols; j+=2*vectorSize)
731 {
732 Index i = 0;
733
734 if(PanelMode) ri += offset*(2*vectorSize);
735
736 for(; i + vectorSize <= depth; i+=vectorSize)
737 {
739 if(StorageOrder == ColMajor)
740 {
742 block1.packet[0] = rhs.template loadPacket<Packet2d>(i, j + 0);
743 block1.packet[1] = rhs.template loadPacket<Packet2d>(i, j + 1);
744 block2.packet[0] = rhs.template loadPacket<Packet2d>(i, j + 2);
745 block2.packet[1] = rhs.template loadPacket<Packet2d>(i, j + 3);
746
749
750 pstore<double>(blockB + ri , block1.packet[0]);
751 pstore<double>(blockB + ri + 2, block2.packet[0]);
752 pstore<double>(blockB + ri + 4, block1.packet[1]);
753 pstore<double>(blockB + ri + 6, block2.packet[1]);
754 } else {
755 block.packet[0] = rhs.template loadPacket<Packet2d>(i + 0, j + 0); //[a1 a2]
756 block.packet[1] = rhs.template loadPacket<Packet2d>(i + 0, j + 2); //[a3 a4]
757 block.packet[2] = rhs.template loadPacket<Packet2d>(i + 1, j + 0); //[b1 b2]
758 block.packet[3] = rhs.template loadPacket<Packet2d>(i + 1, j + 2); //[b3 b4]
759
761 }
762
763 ri += 4*vectorSize;
764 }
765 for(; i < depth; i++)
766 {
767 if(StorageOrder == ColMajor)
768 {
769 blockB[ri+0] = rhs(i, j+0);
770 blockB[ri+1] = rhs(i, j+1);
771
772 ri += vectorSize;
773
774 blockB[ri+0] = rhs(i, j+2);
775 blockB[ri+1] = rhs(i, j+3);
776 } else {
777 Packet2d rhsV = rhs.template loadPacket<Packet2d>(i, j);
778 pstore<double>(blockB + ri, rhsV);
779
780 ri += vectorSize;
781
782 rhsV = rhs.template loadPacket<Packet2d>(i, j + 2);
783 pstore<double>(blockB + ri, rhsV);
784 }
785 ri += vectorSize;
786 }
787
788 if(PanelMode) ri += (2*vectorSize)*(stride - offset - depth);
789 }
790
791 if (j < cols)
792 {
793 if(PanelMode) ri += offset*(cols - j);
794
795 for(Index i = 0; i < depth; i++)
796 {
797 Index k = j;
798 for(; k < cols; k++)
799 {
800 blockB[ri] = rhs(i, k);
801 ri += 1;
802 }
803 }
804 }
805 }
806};
807
808// General template for lhs complex packing, float64 specialization.
809template<typename Index, typename DataMapper, typename Packet, typename PacketC, int StorageOrder, bool Conjugate, bool PanelMode>
811{
812 EIGEN_STRONG_INLINE void operator()(std::complex<double>* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
813 {
815 const Index vectorDelta = vectorSize * ((PanelMode) ? stride : depth);
816 Index rir = ((PanelMode) ? (vectorSize*offset) : 0), rii;
817 double* blockAt = reinterpret_cast<double *>(blockA);
818 Index j = 0;
819
820 for(; j + vectorSize <= rows; j+=vectorSize)
821 {
822 Index i = 0;
823
824 rii = rir + vectorDelta;
825
826 for(; i + vectorSize <= depth; i+=vectorSize)
827 {
830
831 if(StorageOrder == ColMajor)
832 {
833 cblock.packet[0] = lhs.template loadPacket<PacketC>(j, i + 0); //[a1 a1i]
834 cblock.packet[1] = lhs.template loadPacket<PacketC>(j, i + 1); //[b1 b1i]
835
836 cblock.packet[2] = lhs.template loadPacket<PacketC>(j + 1, i + 0); //[a2 a2i]
837 cblock.packet[3] = lhs.template loadPacket<PacketC>(j + 1, i + 1); //[b2 b2i]
838
839 blockr.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[2].v, p16uc_GETREAL64); //[a1 a2]
840 blockr.packet[1] = vec_perm(cblock.packet[1].v, cblock.packet[3].v, p16uc_GETREAL64); //[b1 b2]
841
842 blocki.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[2].v, p16uc_GETIMAG64);
843 blocki.packet[1] = vec_perm(cblock.packet[1].v, cblock.packet[3].v, p16uc_GETIMAG64);
844 } else {
845 cblock.packet[0] = lhs.template loadPacket<PacketC>(j + 0, i); //[a1 a1i]
846 cblock.packet[1] = lhs.template loadPacket<PacketC>(j + 1, i); //[a2 a2i]
847
848 cblock.packet[2] = lhs.template loadPacket<PacketC>(j + 0, i + 1); //[b1 b1i]
849 cblock.packet[3] = lhs.template loadPacket<PacketC>(j + 1, i + 1); //[b2 b2i
850
851 blockr.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[1].v, p16uc_GETREAL64); //[a1 a2]
852 blockr.packet[1] = vec_perm(cblock.packet[2].v, cblock.packet[3].v, p16uc_GETREAL64); //[b1 b2]
853
854 blocki.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[1].v, p16uc_GETIMAG64);
855 blocki.packet[1] = vec_perm(cblock.packet[2].v, cblock.packet[3].v, p16uc_GETIMAG64);
856 }
857
858 if(Conjugate)
859 {
860 blocki.packet[0] = -blocki.packet[0];
861 blocki.packet[1] = -blocki.packet[1];
862 }
863
866
867 rir += 2*vectorSize;
868 rii += 2*vectorSize;
869 }
870 for(; i < depth; i++)
871 {
874
875 cblock.packet[0] = lhs.template loadPacket<PacketC>(j + 0, i);
876 cblock.packet[1] = lhs.template loadPacket<PacketC>(j + 1, i);
877
878 blockr.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[1].v, p16uc_GETREAL64);
879 blocki.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[1].v, p16uc_GETIMAG64);
880
881 if(Conjugate)
882 {
883 blocki.packet[0] = -blocki.packet[0];
884 }
885
886 pstore<double>(blockAt + rir, blockr.packet[0]);
887 pstore<double>(blockAt + rii, blocki.packet[0]);
888
889 rir += vectorSize;
890 rii += vectorSize;
891 }
892
893 rir += ((PanelMode) ? (vectorSize*(2*stride - depth)) : vectorDelta);
894 }
895
896 if (j < rows)
897 {
898 if(PanelMode) rir += (offset*(rows - j - vectorSize));
899 rii = rir + (((PanelMode) ? stride : depth) * (rows - j));
900
901 for(Index i = 0; i < depth; i++)
902 {
903 Index k = j;
904 for(; k < rows; k++)
905 {
906 blockAt[rir] = lhs(k, i).real();
907
908 if(Conjugate)
909 blockAt[rii] = -lhs(k, i).imag();
910 else
911 blockAt[rii] = lhs(k, i).imag();
912
913 rir += 1;
914 rii += 1;
915 }
916 }
917 }
918 }
919};
920
921// General template for rhs complex packing, float64 specialization.
922template<typename Index, typename DataMapper, typename Packet, typename PacketC, int StorageOrder, bool Conjugate, bool PanelMode>
924{
925 EIGEN_STRONG_INLINE void operator()(std::complex<double>* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
926 {
928 const Index vectorDelta = 2*vectorSize * ((PanelMode) ? stride : depth);
929 Index rir = ((PanelMode) ? (2*vectorSize*offset) : 0), rii;
930 double* blockBt = reinterpret_cast<double *>(blockB);
931 Index j = 0;
932
933 for(; j + 2*vectorSize <= cols; j+=2*vectorSize)
934 {
935 Index i = 0;
936
937 rii = rir + vectorDelta;
938
939 for(; i < depth; i++)
940 {
943
945
946 blockr.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[1].v, p16uc_GETREAL64);
947 blockr.packet[1] = vec_perm(cblock.packet[2].v, cblock.packet[3].v, p16uc_GETREAL64);
948
949 blocki.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[1].v, p16uc_GETIMAG64);
950 blocki.packet[1] = vec_perm(cblock.packet[2].v, cblock.packet[3].v, p16uc_GETIMAG64);
951
952 if(Conjugate)
953 {
954 blocki.packet[0] = -blocki.packet[0];
955 blocki.packet[1] = -blocki.packet[1];
956 }
957
960
961 rir += 2*vectorSize;
962 rii += 2*vectorSize;
963 }
964
965 rir += ((PanelMode) ? (2*vectorSize*(2*stride - depth)) : vectorDelta);
966 }
967
968 if (j < cols)
969 {
970 if(PanelMode) rir += (offset*(cols - j - 2*vectorSize));
971 rii = rir + (((PanelMode) ? stride : depth) * (cols - j));
972
973 for(Index i = 0; i < depth; i++)
974 {
975 Index k = j;
976 for(; k < cols; k++)
977 {
978 blockBt[rir] = rhs(i, k).real();
979
980 if(Conjugate)
981 blockBt[rii] = -rhs(i, k).imag();
982 else
983 blockBt[rii] = rhs(i, k).imag();
984
985 rir += 1;
986 rii += 1;
987 }
988 }
989 }
990 }
991};
992
993/**************
994 * GEMM utils *
995 **************/
996
// 512-bit rank-1 update of acc. It can accumulate either positively or negatively (useful for complex GEMM).
998template<typename Packet, bool NegativeAccumulate>
1000{
1002 {
1003 acc->packet[0] = vec_nmsub(lhsV, rhsV[0], acc->packet[0]);
1004 acc->packet[1] = vec_nmsub(lhsV, rhsV[1], acc->packet[1]);
1005 acc->packet[2] = vec_nmsub(lhsV, rhsV[2], acc->packet[2]);
1006 acc->packet[3] = vec_nmsub(lhsV, rhsV[3], acc->packet[3]);
1007 } else {
1008 acc->packet[0] = vec_madd(lhsV, rhsV[0], acc->packet[0]);
1009 acc->packet[1] = vec_madd(lhsV, rhsV[1], acc->packet[1]);
1010 acc->packet[2] = vec_madd(lhsV, rhsV[2], acc->packet[2]);
1011 acc->packet[3] = vec_madd(lhsV, rhsV[3], acc->packet[3]);
1012 }
1013}
1014
1015template<typename Packet, bool NegativeAccumulate>
1017{
1019 {
1020 acc->packet[0] = vec_nmsub(lhsV, rhsV[0], acc->packet[0]);
1021 } else {
1022 acc->packet[0] = vec_madd(lhsV, rhsV[0], acc->packet[0]);
1023 }
1024}
1025
1026template<int N, typename Scalar, typename Packet, bool NegativeAccumulate>
1033
1034template<typename Scalar, typename Packet, typename Index>
1036{
1037#ifdef _ARCH_PWR9
1038 lhsV = vec_xl_len((Scalar *)lhs, remaining_rows * sizeof(Scalar));
1039#else
1040 Index i = 0;
1041 do {
1042 lhsV[i] = lhs[i];
1043 } while (++i < remaining_rows);
1044#endif
1045}
1046
1047template<int N, typename Scalar, typename Packet, typename Index, bool NegativeAccumulate>
1055
// 512-bit rank-1 update of a complex acc. It takes decoupled accumulators as entries. It also takes care of the mixed-type cases real * complex and complex * real.
1057template<int N, typename Packet, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
1075
1076template<int N, typename Scalar, typename Packet, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
1086
1087template<typename Scalar, typename Packet, typename Index, bool LhsIsReal>
1089{
1090#ifdef _ARCH_PWR9
1094#else
1095 Index i = 0;
1096 do {
1097 lhsV[i] = lhs_ptr[i];
1098 if(!LhsIsReal) lhsVi[i] = lhs_ptr_imag[i];
1099 } while (++i < remaining_rows);
1101#endif
1102}
1103
1104template<int N, typename Scalar, typename Packet, typename Index, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
1112
1113template<typename Scalar, typename Packet>
1115{
1116 return ploadu<Packet>(lhs);
1117}
1118
1119// Zero the accumulator on PacketBlock.
1120template<typename Scalar, typename Packet>
1122{
1123 acc.packet[0] = pset1<Packet>((Scalar)0);
1124 acc.packet[1] = pset1<Packet>((Scalar)0);
1125 acc.packet[2] = pset1<Packet>((Scalar)0);
1126 acc.packet[3] = pset1<Packet>((Scalar)0);
1127}
1128
1129template<typename Scalar, typename Packet>
1131{
1132 acc.packet[0] = pset1<Packet>((Scalar)0);
1133}
1134
1135// Scale the PacketBlock vectors by alpha.
1136template<typename Packet>
1138{
1139 acc.packet[0] = pmadd(pAlpha, accZ.packet[0], acc.packet[0]);
1140 acc.packet[1] = pmadd(pAlpha, accZ.packet[1], acc.packet[1]);
1141 acc.packet[2] = pmadd(pAlpha, accZ.packet[2], acc.packet[2]);
1142 acc.packet[3] = pmadd(pAlpha, accZ.packet[3], acc.packet[3]);
1143}
1144
1145template<typename Packet>
1147{
1148 acc.packet[0] = pmadd(pAlpha, accZ.packet[0], acc.packet[0]);
1149}
1150
1151template<typename Packet>
1153{
1154 acc.packet[0] = pmul<Packet>(accZ.packet[0], pAlpha);
1155 acc.packet[1] = pmul<Packet>(accZ.packet[1], pAlpha);
1156 acc.packet[2] = pmul<Packet>(accZ.packet[2], pAlpha);
1157 acc.packet[3] = pmul<Packet>(accZ.packet[3], pAlpha);
1158}
1159
1160template<typename Packet>
1165
1166// Complex version of PacketBlock scaling.
1167template<typename Packet, int N>
1178
1179template<typename Packet>
1181{
1182 acc.packet[0] = pand(acc.packet[0], pMask);
1183 acc.packet[1] = pand(acc.packet[1], pMask);
1184 acc.packet[2] = pand(acc.packet[2], pMask);
1185 acc.packet[3] = pand(acc.packet[3], pMask);
1186}
1187
1188template<typename Packet>
1196
// Load a PacketBlock; the N parameters make tuning gemm easier so we can add more accumulators as needed.
1198template<typename DataMapper, typename Packet, typename Index, const Index accCols, int N, int StorageOrder>
1200{
1201 if (StorageOrder == RowMajor) {
1202 acc.packet[0] = res.template loadPacket<Packet>(row + 0, col + N*accCols);
1203 acc.packet[1] = res.template loadPacket<Packet>(row + 1, col + N*accCols);
1204 acc.packet[2] = res.template loadPacket<Packet>(row + 2, col + N*accCols);
1205 acc.packet[3] = res.template loadPacket<Packet>(row + 3, col + N*accCols);
1206 } else {
1207 acc.packet[0] = res.template loadPacket<Packet>(row + N*accCols, col + 0);
1208 acc.packet[1] = res.template loadPacket<Packet>(row + N*accCols, col + 1);
1209 acc.packet[2] = res.template loadPacket<Packet>(row + N*accCols, col + 2);
1210 acc.packet[3] = res.template loadPacket<Packet>(row + N*accCols, col + 3);
1211 }
1212}
1213
// An overload of bload for when you have a PacketBlock with 8 vectors.
1215template<typename DataMapper, typename Packet, typename Index, const Index accCols, int N, int StorageOrder>
1217{
1218 if (StorageOrder == RowMajor) {
1219 acc.packet[0] = res.template loadPacket<Packet>(row + 0, col + N*accCols);
1220 acc.packet[1] = res.template loadPacket<Packet>(row + 1, col + N*accCols);
1221 acc.packet[2] = res.template loadPacket<Packet>(row + 2, col + N*accCols);
1222 acc.packet[3] = res.template loadPacket<Packet>(row + 3, col + N*accCols);
1223 acc.packet[4] = res.template loadPacket<Packet>(row + 0, col + (N+1)*accCols);
1224 acc.packet[5] = res.template loadPacket<Packet>(row + 1, col + (N+1)*accCols);
1225 acc.packet[6] = res.template loadPacket<Packet>(row + 2, col + (N+1)*accCols);
1226 acc.packet[7] = res.template loadPacket<Packet>(row + 3, col + (N+1)*accCols);
1227 } else {
1228 acc.packet[0] = res.template loadPacket<Packet>(row + N*accCols, col + 0);
1229 acc.packet[1] = res.template loadPacket<Packet>(row + N*accCols, col + 1);
1230 acc.packet[2] = res.template loadPacket<Packet>(row + N*accCols, col + 2);
1231 acc.packet[3] = res.template loadPacket<Packet>(row + N*accCols, col + 3);
1232 acc.packet[4] = res.template loadPacket<Packet>(row + (N+1)*accCols, col + 0);
1233 acc.packet[5] = res.template loadPacket<Packet>(row + (N+1)*accCols, col + 1);
1234 acc.packet[6] = res.template loadPacket<Packet>(row + (N+1)*accCols, col + 2);
1235 acc.packet[7] = res.template loadPacket<Packet>(row + (N+1)*accCols, col + 3);
1236 }
1237}
1238
1239template<typename DataMapper, typename Packet, typename Index, const Index accCols, int N, int StorageOrder>
1241{
1242 acc.packet[0] = res.template loadPacket<Packet>(row + N*accCols, col + 0);
1243 acc.packet[1] = res.template loadPacket<Packet>(row + (N+1)*accCols, col + 0);
1244}
1245
1246const static Packet4i mask41 = { -1, 0, 0, 0 };
1247const static Packet4i mask42 = { -1, -1, 0, 0 };
1248const static Packet4i mask43 = { -1, -1, -1, 0 };
1249
1250const static Packet2l mask21 = { -1, 0 };
1251
1252template<typename Packet>
1254{
1255 if (remaining_rows == 0) {
1256 return pset1<Packet>(float(0.0)); // Not used
1257 } else {
1258 switch (remaining_rows) {
1259 case 1: return Packet(mask41);
1260 case 2: return Packet(mask42);
1261 default: return Packet(mask43);
1262 }
1263 }
1264}
1265
1266template<>
1268{
1269 if (remaining_rows == 0) {
1270 return pset1<Packet2d>(double(0.0)); // Not used
1271 } else {
1272 return Packet2d(mask21);
1273 }
1274}
1275
1276template<typename Packet>
1283
1284template<typename Packet>
1289
1290template<>
1292{
1293 a1 = pload<Packet2d>(a);
1294 a3 = pload<Packet2d>(a + 2);
1295 a0 = vec_splat(a1, 0);
1296 a1 = vec_splat(a1, 1);
1297 a2 = vec_splat(a3, 0);
1298 a3 = vec_splat(a3, 1);
1299}
1300
1301// PEEL loop factor.
1302#define PEEL 7
1303
1304template<typename Scalar, typename Packet, typename Index>
1318
1319template<typename Scalar, typename Packet, typename DataMapper, typename Index, const Index accRows>
1321 const DataMapper& res,
1322 const Scalar* lhs_base,
1323 const Scalar* rhs_base,
1324 Index depth,
1325 Index strideA,
1326 Index offsetA,
1327 Index row,
1328 Index col,
1331 const Packet& pAlpha)
1332{
1333 const Scalar* rhs_ptr = rhs_base;
1336
1338
1340 Index k = 0;
1341 for(; k + PEEL <= remaining_depth; k+= PEEL)
1342 {
1345 for (int l = 0; l < PEEL; l++) {
1347 }
1348 }
1349 for(; k < remaining_depth; k++)
1350 {
1352 }
1353 for(; k < depth; k++)
1354 {
1355 Packet rhsV[1];
1356 rhsV[0] = pset1<Packet>(rhs_ptr[0]);
1360 }
1361
1362 accZero.packet[0] = vec_mul(pAlpha, accZero.packet[0]);
1363 for(Index i = 0; i < remaining_rows; i++) {
1364 res(row + i, col) += accZero.packet[0][i];
1365 }
1366}
1367
1368template<typename Scalar, typename Packet, typename Index, const Index accRows>
1381
1382template<typename Scalar, typename Packet, typename DataMapper, typename Index, const Index accRows, const Index accCols>
1384 const DataMapper& res,
1385 const Scalar* lhs_base,
1386 const Scalar* rhs_base,
1387 Index depth,
1388 Index strideA,
1389 Index offsetA,
1390 Index row,
1391 Index col,
1392 Index rows,
1393 Index cols,
1395 const Packet& pAlpha,
1396 const Packet& pMask)
1397{
1398 const Scalar* rhs_ptr = rhs_base;
1401
1403
1405 Index k = 0;
1406 for(; k + PEEL <= remaining_depth; k+= PEEL)
1407 {
1410 for (int l = 0; l < PEEL; l++) {
1412 }
1413 }
1414 for(; k < remaining_depth; k++)
1415 {
1417 }
1418
1419 if ((remaining_depth == depth) && (rows >= accCols))
1420 {
1421 for(Index j = 0; j < 4; j++) {
1422 acc.packet[j] = res.template loadPacket<Packet>(row, col + j);
1423 }
1425 res.template storePacketBlock<Packet,4>(row, col, acc);
1426 } else {
1427 for(; k < depth; k++)
1428 {
1429 Packet rhsV[4];
1430 pbroadcast4<Packet>(rhs_ptr, rhsV[0], rhsV[1], rhsV[2], rhsV[3]);
1433 rhs_ptr += accRows;
1434 }
1435
1436 for(Index j = 0; j < 4; j++) {
1437 accZero.packet[j] = vec_mul(pAlpha, accZero.packet[j]);
1438 }
1439 for(Index j = 0; j < 4; j++) {
1440 for(Index i = 0; i < remaining_rows; i++) {
1441 res(row + i, col + j) += accZero.packet[j][i];
1442 }
1443 }
1444 }
1445}
1446
// ---------------------------------------------------------------------------
// Real-valued GEMM micro-kernel building blocks.  `unroll_factor` row-panels
// of `accCols` elements are processed at once; each MICRO_*_ONE macro handles
// a single unroll iteration `iter` and is stamped out for iterations 0..7 by
// MICRO_UNROLL.  Iterations at or beyond unroll_factor compile to nothing but
// an unused-variable silencer.  These macros expand inside the gemm_unrolled_*
// functions below and rely on their locals (rhs_ptr, lhs_ptr#, accZero#, acc,
// row, col, pAlpha, ...).
// ---------------------------------------------------------------------------

// Stamp `func` for every possible unroll iteration (0..7).
#define MICRO_UNROLL(func) \
  func(0) func(1) func(2) func(3) func(4) func(5) func(6) func(7)

// One peel step: first load all LHS packets (func2), then run the
// multiply-accumulate for each iteration (func).
#define MICRO_UNROLL_WORK(func, func2, peel) \
  MICRO_UNROLL(func2); \
  func(0,peel) func(1,peel) func(2,peel) func(3,peel) \
  func(4,peel) func(5,peel) func(6,peel) func(7,peel)

// Load the LHS packet for iteration `iter` and advance its panel pointer.
#define MICRO_LOAD_ONE(iter) \
  if (unroll_factor > iter) { \
    lhsV##iter = ploadLhs<Scalar, Packet>(lhs_ptr##iter); \
    lhs_ptr##iter += accCols; \
  } else { \
    EIGEN_UNUSED_VARIABLE(lhsV##iter); \
  }

// accZero#iter += lhsV#iter * rhsV#peel (outer-product accumulate).
#define MICRO_WORK_ONE(iter, peel) \
  if (unroll_factor > iter) { \
    pger_common<Packet, false>(&accZero##iter, lhsV##iter, rhsV##peel); \
  }

// Peel step for the accRows-wide (4-column) kernel: broadcast the 4 RHS
// values for this depth step, then do load+accumulate for all iterations.
#define MICRO_TYPE_PEEL4(func, func2, peel) \
  if (PEEL > peel) { \
    Packet lhsV0, lhsV1, lhsV2, lhsV3, lhsV4, lhsV5, lhsV6, lhsV7; \
    pbroadcast4<Packet>(rhs_ptr + (accRows * peel), rhsV##peel[0], rhsV##peel[1], rhsV##peel[2], rhsV##peel[3]); \
    MICRO_UNROLL_WORK(func, func2, peel) \
  } else { \
    EIGEN_UNUSED_VARIABLE(rhsV##peel); \
  }

// Peel step for the single-column (remaining_cols) kernel: one RHS splat.
#define MICRO_TYPE_PEEL1(func, func2, peel) \
  if (PEEL > peel) { \
    Packet lhsV0, lhsV1, lhsV2, lhsV3, lhsV4, lhsV5, lhsV6, lhsV7; \
    rhsV##peel[0] = pset1<Packet>(rhs_ptr[remaining_cols * peel]); \
    MICRO_UNROLL_WORK(func, func2, peel) \
  } else { \
    EIGEN_UNUSED_VARIABLE(rhsV##peel); \
  }

// Declare RHS packet storage and stamp up to 10 peel steps (only the first
// PEEL of them produce code; the rest hit the `else` branch above).
#define MICRO_UNROLL_TYPE_PEEL(M, func, func1, func2) \
  Packet rhsV0[M], rhsV1[M], rhsV2[M], rhsV3[M], rhsV4[M], rhsV5[M], rhsV6[M], rhsV7[M], rhsV8[M], rhsV9[M]; \
  func(func1,func2,0); func(func1,func2,1); \
  func(func1,func2,2); func(func1,func2,3); \
  func(func1,func2,4); func(func1,func2,5); \
  func(func1,func2,6); func(func1,func2,7); \
  func(func1,func2,8); func(func1,func2,9);

// Single (non-peeled) depth step.
#define MICRO_UNROLL_TYPE_ONE(M, func, func1, func2) \
  Packet rhsV0[M]; \
  func(func1,func2,0);

// PEEL depth steps of the 4-column kernel, then advance the RHS pointer.
#define MICRO_ONE_PEEL4 \
  MICRO_UNROLL_TYPE_PEEL(4, MICRO_TYPE_PEEL4, MICRO_WORK_ONE, MICRO_LOAD_ONE); \
  rhs_ptr += (accRows * PEEL);

// One depth step of the 4-column kernel.
#define MICRO_ONE4 \
  MICRO_UNROLL_TYPE_ONE(4, MICRO_TYPE_PEEL4, MICRO_WORK_ONE, MICRO_LOAD_ONE); \
  rhs_ptr += accRows;

// PEEL depth steps of the single-column kernel.
#define MICRO_ONE_PEEL1 \
  MICRO_UNROLL_TYPE_PEEL(1, MICRO_TYPE_PEEL1, MICRO_WORK_ONE, MICRO_LOAD_ONE); \
  rhs_ptr += (remaining_cols * PEEL);

// One depth step of the single-column kernel.
#define MICRO_ONE1 \
  MICRO_UNROLL_TYPE_ONE(1, MICRO_TYPE_PEEL1, MICRO_WORK_ONE, MICRO_LOAD_ONE); \
  rhs_ptr += remaining_cols;

// Zero the accumulator for iteration `iter`.
#define MICRO_DST_PTR_ONE(iter) \
  if (unroll_factor > iter) { \
    bsetzero<Scalar, Packet>(accZero##iter); \
  } else { \
    EIGEN_UNUSED_VARIABLE(accZero##iter); \
  }

#define MICRO_DST_PTR MICRO_UNROLL(MICRO_DST_PTR_ONE)

// Point lhs_ptr#iter at its packed LHS panel: panels are strideA*accCols
// apart, offset by accCols*offsetA within a panel.
#define MICRO_SRC_PTR_ONE(iter) \
  if (unroll_factor > iter) { \
    lhs_ptr##iter = lhs_base + ( (row/accCols) + iter )*strideA*accCols + accCols*offsetA; \
  } else { \
    EIGEN_UNUSED_VARIABLE(lhs_ptr##iter); \
  }

#define MICRO_SRC_PTR MICRO_UNROLL(MICRO_SRC_PTR_ONE)

// Prefetch the LHS panel for iteration `iter`.
#define MICRO_PREFETCH_ONE(iter) \
  if (unroll_factor > iter) { \
    EIGEN_POWER_PREFETCH(lhs_ptr##iter); \
  }

#define MICRO_PREFETCH MICRO_UNROLL(MICRO_PREFETCH_ONE)

// Write back a 4-column block: load the current result packets, fold in
// alpha * accZero#iter via bscale, and store the block.
#define MICRO_STORE_ONE(iter) \
  if (unroll_factor > iter) { \
    acc.packet[0] = res.template loadPacket<Packet>(row + iter*accCols, col + 0); \
    acc.packet[1] = res.template loadPacket<Packet>(row + iter*accCols, col + 1); \
    acc.packet[2] = res.template loadPacket<Packet>(row + iter*accCols, col + 2); \
    acc.packet[3] = res.template loadPacket<Packet>(row + iter*accCols, col + 3); \
    bscale<Packet>(acc, accZero##iter, pAlpha); \
    res.template storePacketBlock<Packet,4>(row + iter*accCols, col, acc); \
  }

#define MICRO_STORE MICRO_UNROLL(MICRO_STORE_ONE)

// Single-column variant of MICRO_STORE_ONE.
#define MICRO_COL_STORE_ONE(iter) \
  if (unroll_factor > iter) { \
    acc.packet[0] = res.template loadPacket<Packet>(row + iter*accCols, col + 0); \
    bscale<Packet>(acc, accZero##iter, pAlpha); \
    res.template storePacketBlock<Packet,1>(row + iter*accCols, col, acc); \
  }

#define MICRO_COL_STORE MICRO_UNROLL(MICRO_COL_STORE_ONE)
1559
1560template<int unroll_factor, typename Scalar, typename Packet, typename DataMapper, typename Index, const Index accRows, const Index accCols>
1562 const DataMapper& res,
1563 const Scalar* lhs_base,
1564 const Scalar* rhs_base,
1565 Index depth,
1566 Index strideA,
1567 Index offsetA,
1568 Index& row,
1569 Index col,
1570 const Packet& pAlpha)
1571{
1572 const Scalar* rhs_ptr = rhs_base;
1576
1579
1580 Index k = 0;
1581 for(; k + PEEL <= depth; k+= PEEL)
1582 {
1586 }
1587 for(; k < depth; k++)
1588 {
1590 }
1592
1594}
1595
1596template<int unroll_factor, typename Scalar, typename Packet, typename DataMapper, typename Index, const Index accCols>
1598 const DataMapper& res,
1599 const Scalar* lhs_base,
1600 const Scalar* rhs_base,
1601 Index depth,
1602 Index strideA,
1603 Index offsetA,
1604 Index& row,
1605 Index col,
1607 const Packet& pAlpha)
1608{
1609 const Scalar* rhs_ptr = rhs_base;
1613
1616
1617 Index k = 0;
1618 for(; k + PEEL <= depth; k+= PEEL)
1619 {
1623 }
1624 for(; k < depth; k++)
1625 {
1627 }
1629
1631}
1632
1633template<typename Scalar, typename Packet, typename DataMapper, typename Index, const Index accCols>
1635 const DataMapper& res,
1636 const Scalar* lhs_base,
1637 const Scalar* rhs_base,
1638 Index depth,
1639 Index strideA,
1640 Index offsetA,
1641 Index& row,
1642 Index rows,
1643 Index col,
1645 const Packet& pAlpha)
1646{
1647#define MAX_UNROLL 6
1648 while(row + MAX_UNROLL*accCols <= rows) {
1650 }
1651 switch( (rows-row)/accCols ) {
1652#if MAX_UNROLL > 7
1653 case 7:
1655 break;
1656#endif
1657#if MAX_UNROLL > 6
1658 case 6:
1660 break;
1661#endif
1662#if MAX_UNROLL > 5
1663 case 5:
1665 break;
1666#endif
1667#if MAX_UNROLL > 4
1668 case 4:
1670 break;
1671#endif
1672#if MAX_UNROLL > 3
1673 case 3:
1675 break;
1676#endif
1677#if MAX_UNROLL > 2
1678 case 2:
1680 break;
1681#endif
1682#if MAX_UNROLL > 1
1683 case 1:
1685 break;
1686#endif
1687 default:
1688 break;
1689 }
1690#undef MAX_UNROLL
1691}
1692
1693/****************
1694 * GEMM kernels *
1695 * **************/
1696template<typename Scalar, typename Index, typename Packet, typename RhsPacket, typename DataMapper, const Index accRows, const Index accCols>
1698{
1699 const Index remaining_rows = rows % accCols;
1700 const Index remaining_cols = cols % accRows;
1701
1702 if( strideA == -1 ) strideA = depth;
1703 if( strideB == -1 ) strideB = depth;
1704
1706 const Packet pMask = bmask<Packet>((const int)(remaining_rows));
1707
1708 Index col = 0;
1709 for(; col + accRows <= cols; col += accRows)
1710 {
1711 const Scalar* rhs_base = blockB + col*strideB + accRows*offsetB;
1712 const Scalar* lhs_base = blockA;
1713 Index row = 0;
1714
1715#define MAX_UNROLL 6
1716 while(row + MAX_UNROLL*accCols <= rows) {
1718 }
1719 switch( (rows-row)/accCols ) {
1720#if MAX_UNROLL > 7
1721 case 7:
1723 break;
1724#endif
1725#if MAX_UNROLL > 6
1726 case 6:
1728 break;
1729#endif
1730#if MAX_UNROLL > 5
1731 case 5:
1733 break;
1734#endif
1735#if MAX_UNROLL > 4
1736 case 4:
1738 break;
1739#endif
1740#if MAX_UNROLL > 3
1741 case 3:
1743 break;
1744#endif
1745#if MAX_UNROLL > 2
1746 case 2:
1748 break;
1749#endif
1750#if MAX_UNROLL > 1
1751 case 1:
1753 break;
1754#endif
1755 default:
1756 break;
1757 }
1758#undef MAX_UNROLL
1759
1760 if(remaining_rows > 0)
1761 {
1763 }
1764 }
1765
1766 if(remaining_cols > 0)
1767 {
1768 const Scalar* rhs_base = blockB + col*strideB + remaining_cols*offsetB;
1769 const Scalar* lhs_base = blockA;
1770
1771 for(; col < cols; col++)
1772 {
1773 Index row = 0;
1774
1776
1777 if (remaining_rows > 0)
1778 {
1780 }
1781 rhs_base++;
1782 }
1783 }
1784}
1785
// Half of accCols: column step used when addressing the complex result
// through complex packets (Packetc holds half as many complex values as
// Packet holds scalars).
#define accColsC (accCols / 2)
// Number of packed scalar planes per LHS row-panel / RHS column-panel:
// one when the operand is real, two (real + imaginary) otherwise.
#define advanceRows ((LhsIsReal) ? 1 : 2)
#define advanceCols ((RhsIsReal) ? 1 : 2)

// PEEL_COMPLEX loop factor: depth iterations unrolled together in the
// complex GEMM loops below.
#define PEEL_COMPLEX 3
1792
1793template<typename Scalar, typename Packet, typename Index, const Index accRows, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
1812
1813template<typename Scalar, typename Packet, typename Packetc, typename DataMapper, typename Index, const Index accRows, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
1815 const DataMapper& res,
1816 const Scalar* lhs_base,
1817 const Scalar* rhs_base,
1818 Index depth,
1819 Index strideA,
1820 Index offsetA,
1821 Index strideB,
1822 Index row,
1823 Index col,
1826 const Packet& pAlphaReal,
1827 const Packet& pAlphaImag)
1828{
1829 const Scalar* rhs_ptr_real = rhs_base;
1830 const Scalar* rhs_ptr_imag;
1834 const Scalar* lhs_ptr_imag;
1840
1843
1845 Index k = 0;
1846 for(; k + PEEL_COMPLEX <= remaining_depth; k+= PEEL_COMPLEX)
1847 {
1849 if(!RhsIsReal) {
1851 }
1853 if(!LhsIsReal) {
1855 }
1856 for (int l = 0; l < PEEL_COMPLEX; l++) {
1858 }
1859 }
1860 for(; k < remaining_depth; k++)
1861 {
1863 }
1864
1865 for(; k < depth; k++)
1866 {
1867 Packet rhsV[1], rhsVi[1];
1875 }
1876
1879
1880 if ((sizeof(Scalar) == sizeof(float)) && (remaining_rows == 1))
1881 {
1882 res(row + 0, col + 0) += pfirst<Packetc>(acc0.packet[0]);
1883 } else {
1884 acc0.packet[0] += res.template loadPacket<Packetc>(row + 0, col + 0);
1885 res.template storePacketBlock<Packetc,1>(row + 0, col + 0, acc0);
1886 if(remaining_rows > accColsC) {
1887 res(row + accColsC, col + 0) += pfirst<Packetc>(acc1.packet[0]);
1888 }
1889 }
1890}
1891
1892template<typename Scalar, typename Packet, typename Index, const Index accRows, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
1910
1911template<typename Scalar, typename Packet, typename Packetc, typename DataMapper, typename Index, const Index accRows, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
1913 const DataMapper& res,
1914 const Scalar* lhs_base,
1915 const Scalar* rhs_base,
1916 Index depth,
1917 Index strideA,
1918 Index offsetA,
1919 Index strideB,
1920 Index row,
1921 Index col,
1922 Index rows,
1923 Index cols,
1925 const Packet& pAlphaReal,
1926 const Packet& pAlphaImag,
1927 const Packet& pMask)
1928{
1929 const Scalar* rhs_ptr_real = rhs_base;
1930 const Scalar* rhs_ptr_imag;
1934 const Scalar* lhs_ptr_imag;
1941
1944
1946 Index k = 0;
1947 for(; k + PEEL_COMPLEX <= remaining_depth; k+= PEEL_COMPLEX)
1948 {
1950 if(!RhsIsReal) {
1952 }
1954 if(!LhsIsReal) {
1956 }
1957 for (int l = 0; l < PEEL_COMPLEX; l++) {
1959 }
1960 }
1961 for(; k < remaining_depth; k++)
1962 {
1964 }
1965
1966 if ((remaining_depth == depth) && (rows >= accCols))
1967 {
1971 res.template storePacketBlock<Packetc,4>(row + 0, col, acc0);
1973 } else {
1974 for(; k < depth; k++)
1975 {
1976 Packet rhsV[4], rhsVi[4];
1984 }
1985
1988
1989 if ((sizeof(Scalar) == sizeof(float)) && (remaining_rows == 1))
1990 {
1991 for(Index j = 0; j < 4; j++) {
1992 res(row + 0, col + j) += pfirst<Packetc>(acc0.packet[j]);
1993 }
1994 } else {
1995 for(Index j = 0; j < 4; j++) {
1997 acc2.packet[0] = res.template loadPacket<Packetc>(row + 0, col + j) + acc0.packet[j];
1998 res.template storePacketBlock<Packetc,1>(row + 0, col + j, acc2);
1999 if(remaining_rows > accColsC) {
2000 res(row + accColsC, col + j) += pfirst<Packetc>(acc1.packet[j]);
2001 }
2002 }
2003 }
2004 }
2005}
2006
// ---------------------------------------------------------------------------
// Complex GEMM micro-kernel building blocks.  Mirror of the MICRO_* family
// above, but with separate real/imaginary planes: pointers *_real / *_imag,
// accumulators accReal# / accImag#, and RHS packets rhsV# / rhsVi#.  The
// unroll width is 5 (iterations 0..4).  Expanded inside the
// gemm_complex_unrolled_* functions below and rely on their locals.
// ---------------------------------------------------------------------------

// Stamp `func` for every possible complex unroll iteration (0..4).
#define MICRO_COMPLEX_UNROLL(func) \
  func(0) func(1) func(2) func(3) func(4)

// One peel step: load LHS packets (func2), then multiply-accumulate (func).
#define MICRO_COMPLEX_UNROLL_WORK(func, func2, peel) \
  MICRO_COMPLEX_UNROLL(func2); \
  func(0,peel) func(1,peel) func(2,peel) func(3,peel) func(4,peel)

// Load the real (and, unless LhsIsReal, imaginary) LHS packets for `iter`
// and advance the corresponding panel pointers.
#define MICRO_COMPLEX_LOAD_ONE(iter) \
  if (unroll_factor > iter) { \
    lhsV##iter = ploadLhs<Scalar, Packet>(lhs_ptr_real##iter); \
    lhs_ptr_real##iter += accCols; \
    if(!LhsIsReal) { \
      lhsVi##iter = ploadLhs<Scalar, Packet>(lhs_ptr_imag##iter); \
      lhs_ptr_imag##iter += accCols; \
    } else { \
      EIGEN_UNUSED_VARIABLE(lhsVi##iter); \
    } \
  } else { \
    EIGEN_UNUSED_VARIABLE(lhsV##iter); \
    EIGEN_UNUSED_VARIABLE(lhsVi##iter); \
  }

// Complex outer-product accumulate over 4 RHS columns.
#define MICRO_COMPLEX_WORK_ONE4(iter, peel) \
  if (unroll_factor > iter) { \
    pgerc_common<4, Packet, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(&accReal##iter, &accImag##iter, lhsV##iter, lhsVi##iter, rhsV##peel, rhsVi##peel); \
  }

// Complex outer-product accumulate over a single RHS column.
#define MICRO_COMPLEX_WORK_ONE1(iter, peel) \
  if (unroll_factor > iter) { \
    pgerc_common<1, Packet, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(&accReal##iter, &accImag##iter, lhsV##iter, lhsVi##iter, rhsV##peel, rhsVi##peel); \
  }

// Peel step for the accRows-wide complex kernel: broadcast the 4 real (and,
// unless RhsIsReal, imaginary) RHS values, then load+accumulate everything.
#define MICRO_COMPLEX_TYPE_PEEL4(func, func2, peel) \
  if (PEEL_COMPLEX > peel) { \
    Packet lhsV0, lhsV1, lhsV2, lhsV3, lhsV4; \
    Packet lhsVi0, lhsVi1, lhsVi2, lhsVi3, lhsVi4; \
    pbroadcast4_old<Packet>(rhs_ptr_real + (accRows * peel), rhsV##peel[0], rhsV##peel[1], rhsV##peel[2], rhsV##peel[3]); \
    if(!RhsIsReal) { \
      pbroadcast4_old<Packet>(rhs_ptr_imag + (accRows * peel), rhsVi##peel[0], rhsVi##peel[1], rhsVi##peel[2], rhsVi##peel[3]); \
    } else { \
      EIGEN_UNUSED_VARIABLE(rhsVi##peel); \
    } \
    MICRO_COMPLEX_UNROLL_WORK(func, func2, peel) \
  } else { \
    EIGEN_UNUSED_VARIABLE(rhsV##peel); \
    EIGEN_UNUSED_VARIABLE(rhsVi##peel); \
  }

// Peel step for the single-column complex kernel: one RHS splat per plane.
#define MICRO_COMPLEX_TYPE_PEEL1(func, func2, peel) \
  if (PEEL_COMPLEX > peel) { \
    Packet lhsV0, lhsV1, lhsV2, lhsV3, lhsV4; \
    Packet lhsVi0, lhsVi1, lhsVi2, lhsVi3, lhsVi4; \
    rhsV##peel[0] = pset1<Packet>(rhs_ptr_real[remaining_cols * peel]); \
    if(!RhsIsReal) { \
      rhsVi##peel[0] = pset1<Packet>(rhs_ptr_imag[remaining_cols * peel]); \
    } else { \
      EIGEN_UNUSED_VARIABLE(rhsVi##peel); \
    } \
    MICRO_COMPLEX_UNROLL_WORK(func, func2, peel) \
  } else { \
    EIGEN_UNUSED_VARIABLE(rhsV##peel); \
    EIGEN_UNUSED_VARIABLE(rhsVi##peel); \
  }

// Declare RHS packet storage (real + imaginary) and stamp up to 10 peel
// steps; only the first PEEL_COMPLEX produce code.
#define MICRO_COMPLEX_UNROLL_TYPE_PEEL(M, func, func1, func2) \
  Packet rhsV0[M], rhsV1[M], rhsV2[M], rhsV3[M], rhsV4[M], rhsV5[M], rhsV6[M], rhsV7[M], rhsV8[M], rhsV9[M]; \
  Packet rhsVi0[M], rhsVi1[M], rhsVi2[M], rhsVi3[M], rhsVi4[M], rhsVi5[M], rhsVi6[M], rhsVi7[M], rhsVi8[M], rhsVi9[M]; \
  func(func1,func2,0); func(func1,func2,1); \
  func(func1,func2,2); func(func1,func2,3); \
  func(func1,func2,4); func(func1,func2,5); \
  func(func1,func2,6); func(func1,func2,7); \
  func(func1,func2,8); func(func1,func2,9);

// Single (non-peeled) complex depth step.
#define MICRO_COMPLEX_UNROLL_TYPE_ONE(M, func, func1, func2) \
  Packet rhsV0[M], rhsVi0[M];\
  func(func1,func2,0);

// PEEL_COMPLEX depth steps of the 4-column kernel, then advance RHS pointers.
#define MICRO_COMPLEX_ONE_PEEL4 \
  MICRO_COMPLEX_UNROLL_TYPE_PEEL(4, MICRO_COMPLEX_TYPE_PEEL4, MICRO_COMPLEX_WORK_ONE4, MICRO_COMPLEX_LOAD_ONE); \
  rhs_ptr_real += (accRows * PEEL_COMPLEX); \
  if(!RhsIsReal) rhs_ptr_imag += (accRows * PEEL_COMPLEX);

// One depth step of the 4-column complex kernel.
#define MICRO_COMPLEX_ONE4 \
  MICRO_COMPLEX_UNROLL_TYPE_ONE(4, MICRO_COMPLEX_TYPE_PEEL4, MICRO_COMPLEX_WORK_ONE4, MICRO_COMPLEX_LOAD_ONE); \
  rhs_ptr_real += accRows; \
  if(!RhsIsReal) rhs_ptr_imag += accRows;

// PEEL_COMPLEX depth steps of the single-column complex kernel.
#define MICRO_COMPLEX_ONE_PEEL1 \
  MICRO_COMPLEX_UNROLL_TYPE_PEEL(1, MICRO_COMPLEX_TYPE_PEEL1, MICRO_COMPLEX_WORK_ONE1, MICRO_COMPLEX_LOAD_ONE); \
  rhs_ptr_real += (remaining_cols * PEEL_COMPLEX); \
  if(!RhsIsReal) rhs_ptr_imag += (remaining_cols * PEEL_COMPLEX);

// One depth step of the single-column complex kernel.
#define MICRO_COMPLEX_ONE1 \
  MICRO_COMPLEX_UNROLL_TYPE_ONE(1, MICRO_COMPLEX_TYPE_PEEL1, MICRO_COMPLEX_WORK_ONE1, MICRO_COMPLEX_LOAD_ONE); \
  rhs_ptr_real += remaining_cols; \
  if(!RhsIsReal) rhs_ptr_imag += remaining_cols;

// Zero both the real and imaginary accumulators for iteration `iter`.
#define MICRO_COMPLEX_DST_PTR_ONE(iter) \
  if (unroll_factor > iter) { \
    bsetzero<Scalar, Packet>(accReal##iter); \
    bsetzero<Scalar, Packet>(accImag##iter); \
  } else { \
    EIGEN_UNUSED_VARIABLE(accReal##iter); \
    EIGEN_UNUSED_VARIABLE(accImag##iter); \
  }

#define MICRO_COMPLEX_DST_PTR MICRO_COMPLEX_UNROLL(MICRO_COMPLEX_DST_PTR_ONE)

// Point the real/imag LHS panel pointers for iteration `iter`; the imaginary
// plane sits accCols*strideA after the real plane of the same panel.
#define MICRO_COMPLEX_SRC_PTR_ONE(iter) \
  if (unroll_factor > iter) { \
    lhs_ptr_real##iter = lhs_base + ( ((advanceRows*row)/accCols) + iter*advanceRows )*strideA*accCols + accCols*offsetA; \
    if(!LhsIsReal) { \
      lhs_ptr_imag##iter = lhs_ptr_real##iter + accCols*strideA; \
    } else { \
      EIGEN_UNUSED_VARIABLE(lhs_ptr_imag##iter); \
    } \
  } else { \
    EIGEN_UNUSED_VARIABLE(lhs_ptr_real##iter); \
    EIGEN_UNUSED_VARIABLE(lhs_ptr_imag##iter); \
  }

#define MICRO_COMPLEX_SRC_PTR MICRO_COMPLEX_UNROLL(MICRO_COMPLEX_SRC_PTR_ONE)

// Prefetch both LHS planes for iteration `iter`.
#define MICRO_COMPLEX_PREFETCH_ONE(iter) \
  if (unroll_factor > iter) { \
    EIGEN_POWER_PREFETCH(lhs_ptr_real##iter); \
    if(!LhsIsReal) { \
      EIGEN_POWER_PREFETCH(lhs_ptr_imag##iter); \
    } \
  }

#define MICRO_COMPLEX_PREFETCH MICRO_COMPLEX_UNROLL(MICRO_COMPLEX_PREFETCH_ONE)

// Write back a 4-column complex block: load current results, scale the
// real/imag accumulators by alpha (bscalec), recombine the planes into
// complex packets (bcouple) and store the two half-blocks.
#define MICRO_COMPLEX_STORE_ONE(iter) \
  if (unroll_factor > iter) { \
    bload<DataMapper, Packetc, Index, accColsC, 0, ColMajor>(tRes, res, row + iter*accCols, col); \
    bscalec<Packet,4>(accReal##iter, accImag##iter, pAlphaReal, pAlphaImag, taccReal, taccImag); \
    bcouple<Packet, Packetc>(taccReal, taccImag, tRes, acc0, acc1); \
    res.template storePacketBlock<Packetc,4>(row + iter*accCols + 0, col, acc0); \
    res.template storePacketBlock<Packetc,4>(row + iter*accCols + accColsC, col, acc1); \
  }

#define MICRO_COMPLEX_STORE MICRO_COMPLEX_UNROLL(MICRO_COMPLEX_STORE_ONE)

// Single-column variant of MICRO_COMPLEX_STORE_ONE.
#define MICRO_COMPLEX_COL_STORE_ONE(iter) \
  if (unroll_factor > iter) { \
    bload<DataMapper, Packetc, Index, accColsC, 0, ColMajor>(tRes, res, row + iter*accCols, col); \
    bscalec<Packet,1>(accReal##iter, accImag##iter, pAlphaReal, pAlphaImag, taccReal, taccImag); \
    bcouple<Packet, Packetc>(taccReal, taccImag, tRes, acc0, acc1); \
    res.template storePacketBlock<Packetc,1>(row + iter*accCols + 0, col, acc0); \
    res.template storePacketBlock<Packetc,1>(row + iter*accCols + accColsC, col, acc1); \
  }

#define MICRO_COMPLEX_COL_STORE MICRO_COMPLEX_UNROLL(MICRO_COMPLEX_COL_STORE_ONE)
2161
2162template<int unroll_factor, typename Scalar, typename Packet, typename Packetc, typename DataMapper, typename Index, const Index accRows, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
2164 const DataMapper& res,
2165 const Scalar* lhs_base,
2166 const Scalar* rhs_base,
2167 Index depth,
2168 Index strideA,
2169 Index offsetA,
2170 Index strideB,
2171 Index& row,
2172 Index col,
2173 const Packet& pAlphaReal,
2174 const Packet& pAlphaImag)
2175{
2176 const Scalar* rhs_ptr_real = rhs_base;
2177 const Scalar* rhs_ptr_imag;
2178 if(!RhsIsReal) {
2180 } else {
2182 }
2192
2195
2196 Index k = 0;
2197 for(; k + PEEL_COMPLEX <= depth; k+= PEEL_COMPLEX)
2198 {
2200 if(!RhsIsReal) {
2202 }
2205 }
2206 for(; k < depth; k++)
2207 {
2209 }
2211
2213}
2214
2215template<int unroll_factor, typename Scalar, typename Packet, typename Packetc, typename DataMapper, typename Index, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
2217 const DataMapper& res,
2218 const Scalar* lhs_base,
2219 const Scalar* rhs_base,
2220 Index depth,
2221 Index strideA,
2222 Index offsetA,
2223 Index strideB,
2224 Index& row,
2225 Index col,
2227 const Packet& pAlphaReal,
2228 const Packet& pAlphaImag)
2229{
2230 const Scalar* rhs_ptr_real = rhs_base;
2231 const Scalar* rhs_ptr_imag;
2232 if(!RhsIsReal) {
2234 } else {
2236 }
2246
2249
2250 Index k = 0;
2251 for(; k + PEEL_COMPLEX <= depth; k+= PEEL_COMPLEX)
2252 {
2254 if(!RhsIsReal) {
2256 }
2259 }
2260 for(; k < depth; k++)
2261 {
2263 }
2265
2267}
2268
2269template<typename Scalar, typename Packet, typename Packetc, typename DataMapper, typename Index, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
2271 const DataMapper& res,
2272 const Scalar* lhs_base,
2273 const Scalar* rhs_base,
2274 Index depth,
2275 Index strideA,
2276 Index offsetA,
2277 Index strideB,
2278 Index& row,
2279 Index rows,
2280 Index col,
2282 const Packet& pAlphaReal,
2283 const Packet& pAlphaImag)
2284{
2285#define MAX_COMPLEX_UNROLL 3
2286 while(row + MAX_COMPLEX_UNROLL*accCols <= rows) {
2288 }
2289 switch( (rows-row)/accCols ) {
2290#if MAX_COMPLEX_UNROLL > 4
2291 case 4:
2293 break;
2294#endif
2295#if MAX_COMPLEX_UNROLL > 3
2296 case 3:
2298 break;
2299#endif
2300#if MAX_COMPLEX_UNROLL > 2
2301 case 2:
2303 break;
2304#endif
2305#if MAX_COMPLEX_UNROLL > 1
2306 case 1:
2308 break;
2309#endif
2310 default:
2311 break;
2312 }
2313#undef MAX_COMPLEX_UNROLL
2314}
2315
2316template<typename LhsScalar, typename RhsScalar, typename Scalarc, typename Scalar, typename Index, typename Packet, typename Packetc, typename RhsPacket, typename DataMapper, const Index accRows, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
2318{
2319 const Index remaining_rows = rows % accCols;
2320 const Index remaining_cols = cols % accRows;
2321
2322 if( strideA == -1 ) strideA = depth;
2323 if( strideB == -1 ) strideB = depth;
2324
2325 const Packet pAlphaReal = pset1<Packet>(alpha.real());
2326 const Packet pAlphaImag = pset1<Packet>(alpha.imag());
2327 const Packet pMask = bmask<Packet>((const int)(remaining_rows));
2328
2329 const Scalar* blockA = (Scalar *) blockAc;
2330 const Scalar* blockB = (Scalar *) blockBc;
2331
2332 Index col = 0;
2333 for(; col + accRows <= cols; col += accRows)
2334 {
2335 const Scalar* rhs_base = blockB + advanceCols*col*strideB + accRows*offsetB;
2336 const Scalar* lhs_base = blockA;
2337 Index row = 0;
2338
2339#define MAX_COMPLEX_UNROLL 3
2340 while(row + MAX_COMPLEX_UNROLL*accCols <= rows) {
2342 }
2343 switch( (rows-row)/accCols ) {
2344#if MAX_COMPLEX_UNROLL > 4
2345 case 4:
2347 break;
2348#endif
2349#if MAX_COMPLEX_UNROLL > 3
2350 case 3:
2352 break;
2353#endif
2354#if MAX_COMPLEX_UNROLL > 2
2355 case 2:
2357 break;
2358#endif
2359#if MAX_COMPLEX_UNROLL > 1
2360 case 1:
2362 break;
2363#endif
2364 default:
2365 break;
2366 }
2367#undef MAX_COMPLEX_UNROLL
2368
2369 if(remaining_rows > 0)
2370 {
2372 }
2373 }
2374
2375 if(remaining_cols > 0)
2376 {
2378 const Scalar* lhs_base = blockA;
2379
2380 for(; col < cols; col++)
2381 {
2382 Index row = 0;
2383
2385
2386 if (remaining_rows > 0)
2387 {
2389 }
2390 rhs_base++;
2391 }
2392 }
2393}
2394
2395#undef accColsC
2396#undef advanceCols
2397#undef advanceRows
2398
2399/************************************
2400 * ppc64le template specializations *
2401 * **********************************/
2402template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
2404{
2405 void operator()(double* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
2406};
2407
2408template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
2410 ::operator()(double* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
2411{
2413 pack(blockA, lhs, depth, rows, stride, offset);
2414}
2415
2416template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
2418{
2419 void operator()(double* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
2420};
2421
2422template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
2424 ::operator()(double* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
2425{
2427 pack(blockA, lhs, depth, rows, stride, offset);
2428}
2429
2430#if EIGEN_ALTIVEC_USE_CUSTOM_PACK
2431template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
2433{
2434 void operator()(double* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
2435};
2436
2437template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
2439 ::operator()(double* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
2440{
2442 pack(blockB, rhs, depth, cols, stride, offset);
2443}
2444
2445template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
2447{
2448 void operator()(double* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
2449};
2450
2451template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
2453 ::operator()(double* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
2454{
2456 pack(blockB, rhs, depth, cols, stride, offset);
2457}
2458#endif
2459
2460template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
2462{
2463 void operator()(float* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
2464};
2465
2466template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
2468 ::operator()(float* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
2469{
2471 pack(blockA, lhs, depth, rows, stride, offset);
2472}
2473
2474template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
2476{
2477 void operator()(float* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
2478};
2479
2480template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
2482 ::operator()(float* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
2483{
2485 pack(blockA, lhs, depth, rows, stride, offset);
2486}
2487
2488template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
2490{
2491 void operator()(std::complex<float>* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
2492};
2493
2494template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
2496 ::operator()(std::complex<float>* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
2497{
2499 pack(blockA, lhs, depth, rows, stride, offset);
2500}
2501
2502template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
2504{
2505 void operator()(std::complex<float>* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
2506};
2507
2508template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
2510 ::operator()(std::complex<float>* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
2511{
2513 pack(blockA, lhs, depth, rows, stride, offset);
2514}
2515
2516#if EIGEN_ALTIVEC_USE_CUSTOM_PACK
2517template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
2519{
2520 void operator()(float* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
2521};
2522
2523template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
2525 ::operator()(float* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
2526{
2528 pack(blockB, rhs, depth, cols, stride, offset);
2529}
2530
2531template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
2533{
2534 void operator()(float* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
2535};
2536
2537template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
2539 ::operator()(float* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
2540{
2542 pack(blockB, rhs, depth, cols, stride, offset);
2543}
2544#endif
2545
2546template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
2548{
2549 void operator()(std::complex<float>* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
2550};
2551
2552template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
2554 ::operator()(std::complex<float>* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
2555{
2557 pack(blockB, rhs, depth, cols, stride, offset);
2558}
2559
2560template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
2562{
2563 void operator()(std::complex<float>* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
2564};
2565
2566template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
2568 ::operator()(std::complex<float>* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
2569{
2571 pack(blockB, rhs, depth, cols, stride, offset);
2572}
2573
2574template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
2576{
2577 void operator()(std::complex<double>* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
2578};
2579
2580template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
2582 ::operator()(std::complex<double>* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
2583{
2585 pack(blockA, lhs, depth, rows, stride, offset);
2586}
2587
2588template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
2590{
2591 void operator()(std::complex<double>* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
2592};
2593
2594template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
2596 ::operator()(std::complex<double>* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
2597{
2599 pack(blockA, lhs, depth, rows, stride, offset);
2600}
2601
2602template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
2604{
2605 void operator()(std::complex<double>* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
2606};
2607
2608template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
2610 ::operator()(std::complex<double>* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
2611{
2613 pack(blockB, rhs, depth, cols, stride, offset);
2614}
2615
2616template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
2618{
2619 void operator()(std::complex<double>* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
2620};
2621
2622template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
2624 ::operator()(std::complex<double>* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
2625{
2627 pack(blockB, rhs, depth, cols, stride, offset);
2628}
2629
2630// ********* gebp specializations *********
2631template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
2633{
2636
2637 void operator()(const DataMapper& res, const float* blockA, const float* blockB,
2638 Index rows, Index depth, Index cols, float alpha,
2640};
2641
2642template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
2644 ::operator()(const DataMapper& res, const float* blockA, const float* blockB,
2645 Index rows, Index depth, Index cols, float alpha,
2647 {
2650 void (*gemm_function)(const DataMapper&, const float*, const float*, Index, Index, Index, float, Index, Index, Index, Index);
2651
2652 #ifdef EIGEN_ALTIVEC_MMA_ONLY
2653 //generate with MMA only
2655 #elif defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA)
2656 if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){
2658 }
2659 else{
2661 }
2662 #else
2664 #endif
2666 }
2667
2668template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
2669struct gebp_kernel<std::complex<float>, std::complex<float>, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
2670{
2674
2675 void operator()(const DataMapper& res, const std::complex<float>* blockA, const std::complex<float>* blockB,
2676 Index rows, Index depth, Index cols, std::complex<float> alpha,
2678};
2679
2680template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
2681void gebp_kernel<std::complex<float>, std::complex<float>, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
2682 ::operator()(const DataMapper& res, const std::complex<float>* blockA, const std::complex<float>* blockB,
2683 Index rows, Index depth, Index cols, std::complex<float> alpha,
2685 {
2688 void (*gemm_function)(const DataMapper&, const std::complex<float>*, const std::complex<float>*,
2689 Index, Index, Index, std::complex<float>, Index, Index, Index, Index);
2690
2691 #ifdef EIGEN_ALTIVEC_MMA_ONLY
2692 //generate with MMA only
2694 #elif defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA)
2695 if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){
2697 }
2698 else{
2700 }
2701 #else
2703 #endif
2705 }
2706
2707template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
2709{
2713
2714 void operator()(const DataMapper& res, const float* blockA, const std::complex<float>* blockB,
2715 Index rows, Index depth, Index cols, std::complex<float> alpha,
2717};
2718
2719template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
2721 ::operator()(const DataMapper& res, const float* blockA, const std::complex<float>* blockB,
2722 Index rows, Index depth, Index cols, std::complex<float> alpha,
2724 {
2727 void (*gemm_function)(const DataMapper&, const float*, const std::complex<float>*,
2728 Index, Index, Index, std::complex<float>, Index, Index, Index, Index);
2729 #ifdef EIGEN_ALTIVEC_MMA_ONLY
2730 //generate with MMA only
2732 #elif defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA)
2733 if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){
2735 }
2736 else{
2738 }
2739 #else
2741 #endif
2743 }
2744
2745template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
2747{
2751
2752 void operator()(const DataMapper& res, const std::complex<float>* blockA, const float* blockB,
2753 Index rows, Index depth, Index cols, std::complex<float> alpha,
2755};
2756
2757template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
2759 ::operator()(const DataMapper& res, const std::complex<float>* blockA, const float* blockB,
2760 Index rows, Index depth, Index cols, std::complex<float> alpha,
2762 {
2765 void (*gemm_function)(const DataMapper&, const std::complex<float>*, const float*,
2766 Index, Index, Index, std::complex<float>, Index, Index, Index, Index);
2767 #ifdef EIGEN_ALTIVEC_MMA_ONLY
2768 //generate with MMA only
2770 #elif defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA)
2771 if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){
2773 }
2774 else{
2776 }
2777 #else
2779 #endif
2781 }
2782
2783template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
2785{
2788
2789 void operator()(const DataMapper& res, const double* blockA, const double* blockB,
2790 Index rows, Index depth, Index cols, double alpha,
2792};
2793
2794template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
2796 ::operator()(const DataMapper& res, const double* blockA, const double* blockB,
2797 Index rows, Index depth, Index cols, double alpha,
2799 {
2802 void (*gemm_function)(const DataMapper&, const double*, const double*, Index, Index, Index, double, Index, Index, Index, Index);
2803
2804 #ifdef EIGEN_ALTIVEC_MMA_ONLY
2805 //generate with MMA only
2807 #elif defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA)
2808 if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){
2810 }
2811 else{
2813 }
2814 #else
2816 #endif
2818 }
2819
2820template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
2821struct gebp_kernel<std::complex<double>, std::complex<double>, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
2822{
2826
2827 void operator()(const DataMapper& res, const std::complex<double>* blockA, const std::complex<double>* blockB,
2828 Index rows, Index depth, Index cols, std::complex<double> alpha,
2830};
2831
2832template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
2833void gebp_kernel<std::complex<double>, std::complex<double>, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
2834 ::operator()(const DataMapper& res, const std::complex<double>* blockA, const std::complex<double>* blockB,
2835 Index rows, Index depth, Index cols, std::complex<double> alpha,
2837 {
2840 void (*gemm_function)(const DataMapper&, const std::complex<double>*, const std::complex<double>*,
2841 Index, Index, Index, std::complex<double>, Index, Index, Index, Index);
2842 #ifdef EIGEN_ALTIVEC_MMA_ONLY
2843 //generate with MMA only
2845 #elif defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA)
2846 if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){
2848 }
2849 else{
2851 }
2852 #else
2854 #endif
2856 }
2857
2858template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
2860{
2864
2865 void operator()(const DataMapper& res, const std::complex<double>* blockA, const double* blockB,
2866 Index rows, Index depth, Index cols, std::complex<double> alpha,
2868};
2869
2870template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
2872 ::operator()(const DataMapper& res, const std::complex<double>* blockA, const double* blockB,
2873 Index rows, Index depth, Index cols, std::complex<double> alpha,
2875 {
2878 void (*gemm_function)(const DataMapper&, const std::complex<double>*, const double*,
2879 Index, Index, Index, std::complex<double>, Index, Index, Index, Index);
2880 #ifdef EIGEN_ALTIVEC_MMA_ONLY
2881 //generate with MMA only
2883 #elif defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA)
2884 if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){
2886 }
2887 else{
2889 }
2890 #else
2892 #endif
2894 }
2895
2896template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
2898{
2902
2903 void operator()(const DataMapper& res, const double* blockA, const std::complex<double>* blockB,
2904 Index rows, Index depth, Index cols, std::complex<double> alpha,
2906};
2907
2908template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
2910 ::operator()(const DataMapper& res, const double* blockA, const std::complex<double>* blockB,
2911 Index rows, Index depth, Index cols, std::complex<double> alpha,
2913 {
2916 void (*gemm_function)(const DataMapper&, const double*, const std::complex<double>*,
2917 Index, Index, Index, std::complex<double>, Index, Index, Index, Index);
2918 #ifdef EIGEN_ALTIVEC_MMA_ONLY
2919 //generate with MMA only
2921 #elif defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA)
2922 if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){
2924 }
2925 else{
2927 }
2928 #else
2930 #endif
2932 }
2933} // end namespace internal
2934
2935} // end namespace Eigen
2936
2937#endif // EIGEN_MATRIX_PRODUCT_ALTIVEC_H
#define __UNPACK_TYPE__(PACKETNAME)
Definition PacketMath.h:74
ArrayXXi a
Definition Array_initializer_list_23_cxx11.cpp:1
Array< int, Dynamic, 1 > v
Definition Array_initializer_list_vector_cxx11.cpp:1
int i
Definition BiCGSTAB_step_by_step.cpp:9
internal::enable_if< internal::valid_indexed_view_overload< RowIndices, ColIndices >::value &&internal::traits< typenameEIGEN_INDEXED_VIEW_METHOD_TYPE< RowIndices, ColIndices >::type >::ReturnAsIndexedView, typenameEIGEN_INDEXED_VIEW_METHOD_TYPE< RowIndices, ColIndices >::type >::type operator()(const RowIndices &rowIndices, const ColIndices &colIndices) EIGEN_INDEXED_VIEW_METHOD_CONST
Definition IndexedViewMethods.h:73
#define EIGEN_ALWAYS_INLINE
Definition Macros.h:932
#define EIGEN_UNUSED_VARIABLE(var)
Definition Macros.h:1076
#define EIGEN_STRONG_INLINE
Definition Macros.h:917
m col(1)
m row(1)
#define EIGEN_POWER_PREFETCH(p)
Definition MatrixProductCommon.h:5
#define MICRO_COMPLEX_DST_PTR
Definition MatrixProduct.h:2113
#define advanceCols
Definition MatrixProduct.h:1788
#define MICRO_ONE_PEEL4
Definition MatrixProduct.h:1498
#define MICRO_STORE
Definition MatrixProduct.h:1549
#define accColsC
Definition MatrixProduct.h:1786
#define MICRO_DST_PTR
Definition MatrixProduct.h:1521
#define MICRO_COMPLEX_COL_STORE
Definition MatrixProduct.h:2160
#define MAX_UNROLL
#define advanceRows
Definition MatrixProduct.h:1787
#define MICRO_ONE_PEEL1
Definition MatrixProduct.h:1506
#define MICRO_COMPLEX_ONE_PEEL4
Definition MatrixProduct.h:2084
#define MICRO_COMPLEX_PREFETCH
Definition MatrixProduct.h:2138
#define PEEL_COMPLEX
Definition MatrixProduct.h:1791
#define MICRO_ONE1
Definition MatrixProduct.h:1510
#define PEEL
Definition MatrixProduct.h:1302
#define MICRO_COMPLEX_ONE4
Definition MatrixProduct.h:2089
#define MICRO_PREFETCH
Definition MatrixProduct.h:1537
#define MICRO_ONE4
Definition MatrixProduct.h:1502
#define MICRO_COMPLEX_ONE_PEEL1
Definition MatrixProduct.h:2094
#define MICRO_COMPLEX_STORE
Definition MatrixProduct.h:2149
#define MICRO_COL_STORE
Definition MatrixProduct.h:1558
#define MICRO_SRC_PTR
Definition MatrixProduct.h:1530
#define MICRO_COMPLEX_SRC_PTR
Definition MatrixProduct.h:2128
#define MAX_COMPLEX_UNROLL
#define MICRO_COMPLEX_ONE1
Definition MatrixProduct.h:2099
cout<< "Here is the matrix m:"<< endl<< m<< endl;Matrix< ptrdiff_t, 3, 1 > res
Definition PartialRedux_count.cpp:3
m m block(1, 0, 2, 2)<< 4
int rows
Definition Tutorial_commainit_02.cpp:1
int cols
Definition Tutorial_commainit_02.cpp:1
Scalar Scalar int size
Definition benchVecAdd.cpp:17
SCALAR Scalar
Definition bench_gemm.cpp:46
Definition ForwardDeclarations.h:87
@ N
Definition constructor.cpp:23
set noclip points set clip one set noclip two set bar set border lt lw set xdata set ydata set zdata set x2data set y2data set boxwidth set dummy y set format x g set format y g set format x2 g set format y2 g set format z g set angles radians set nogrid set key title set key left top Right noreverse box linetype linewidth samplen spacing width set nolabel set noarrow set nologscale set logscale x set set pointsize set encoding default set nopolar set noparametric set set set set surface set nocontour set clabel set mapping cartesian set nohidden3d set cntrparam order set cntrparam linear set cntrparam levels auto set cntrparam points set size set set xzeroaxis lt lw set x2zeroaxis lt lw set yzeroaxis lt lw set y2zeroaxis lt lw set tics in set ticslevel set tics set mxtics default set mytics default set mx2tics default set my2tics default set xtics border mirror norotate autofreq set ytics border mirror norotate autofreq set ztics border nomirror norotate autofreq set nox2tics set noy2tics set timestamp bottom norotate offset
Definition gnuplot_common_settings.hh:64
@ ColMajor
Definition Constants.h:319
@ RowMajor
Definition Constants.h:321
RealScalar alpha
Definition level1_cplx_impl.h:147
v2f64 Packet2d
Definition PacketMath.h:820
EIGEN_ALWAYS_INLINE void bscalec_common(PacketBlock< Packet, 4 > &acc, PacketBlock< Packet, 4 > &accZ, const Packet &pAlpha)
Definition MatrixProduct.h:1152
EIGEN_STRONG_INLINE void symm_pack_rhs_helper(Scalar *blockB, const Scalar *_rhs, Index rhsStride, Index rows, Index cols, Index k2)
Definition MatrixProduct.h:240
v2i64 Packet2l
Definition PacketMath.h:821
EIGEN_ALWAYS_INLINE void bscalec(PacketBlock< Packet, N > &aReal, PacketBlock< Packet, N > &aImag, const Packet &bReal, const Packet &bImag, PacketBlock< Packet, N > &cReal, PacketBlock< Packet, N > &cImag)
Definition MatrixProduct.h:1168
EIGEN_STRONG_INLINE void gemm(const DataMapper &res, const Scalar *blockA, const Scalar *blockB, Index rows, Index depth, Index cols, Scalar alpha, Index strideA, Index strideB, Index offsetA, Index offsetB)
Definition MatrixProduct.h:1697
__vector int Packet4i
Definition PacketMath.h:31
EIGEN_STRONG_INLINE void symm_pack_complex_lhs_helper(std::complex< Scalar > *blockA, const std::complex< Scalar > *_lhs, Index lhsStride, Index cols, Index rows)
Definition MatrixProduct.h:189
EIGEN_STRONG_INLINE void gemm_complex_unrolled_iteration(const DataMapper &res, const Scalar *lhs_base, const Scalar *rhs_base, Index depth, Index strideA, Index offsetA, Index strideB, Index &row, Index col, const Packet &pAlphaReal, const Packet &pAlphaImag)
Definition MatrixProduct.h:2163
EIGEN_ALWAYS_INLINE std::complex< Scalar > getAdjointVal(Index i, Index j, const_blas_data_mapper< std::complex< Scalar >, Index, StorageOrder > &dt)
Definition MatrixProduct.h:120
__vector unsigned char Packet16uc
Definition PacketMath.h:37
EIGEN_ALWAYS_INLINE void pgerc(PacketBlock< Packet, N > *accReal, PacketBlock< Packet, N > *accImag, const Scalar *lhs_ptr, const Scalar *lhs_ptr_imag, const Packet *rhsV, const Packet *rhsVi)
Definition MatrixProduct.h:1077
EIGEN_STRONG_INLINE void gemm_complex(const DataMapper &res, const LhsScalar *blockAc, const RhsScalar *blockBc, Index rows, Index depth, Index cols, Scalarc alpha, Index strideA, Index strideB, Index offsetA, Index offsetB)
Definition MatrixProduct.h:2317
EIGEN_STRONG_INLINE void ptranspose(PacketBlock< Packet2cf, 2 > &kernel)
Definition Complex.h:224
EIGEN_ALWAYS_INLINE void pger_common(PacketBlock< Packet, 4 > *acc, const Packet &lhsV, const Packet *rhsV)
Definition MatrixProduct.h:999
EIGEN_ALWAYS_INLINE void pgerc_common(PacketBlock< Packet, N > *accReal, PacketBlock< Packet, N > *accImag, const Packet &lhsV, const Packet &lhsVi, const Packet *rhsV, const Packet *rhsVi)
Definition MatrixProduct.h:1058
EIGEN_STRONG_INLINE void symm_pack_lhs_helper(Scalar *blockA, const Scalar *_lhs, Index lhsStride, Index cols, Index rows)
Definition MatrixProduct.h:281
EIGEN_STRONG_INLINE void gemm_unrolled_col_iteration(const DataMapper &res, const Scalar *lhs_base, const Scalar *rhs_base, Index depth, Index strideA, Index offsetA, Index &row, Index col, Index remaining_cols, const Packet &pAlpha)
Definition MatrixProduct.h:1597
EIGEN_STRONG_INLINE void gemm_complex_extra_col(const DataMapper &res, const Scalar *lhs_base, const Scalar *rhs_base, Index depth, Index strideA, Index offsetA, Index strideB, Index row, Index col, Index remaining_rows, Index remaining_cols, const Packet &pAlphaReal, const Packet &pAlphaImag)
Definition MatrixProduct.h:1814
EIGEN_ALWAYS_INLINE void pbroadcast4_old(const __UNPACK_TYPE__(Packet) *a, Packet &a0, Packet &a1, Packet &a2, Packet &a3)
Definition MatrixProduct.h:1285
EIGEN_STRONG_INLINE void gemm_complex_extra_row(const DataMapper &res, const Scalar *lhs_base, const Scalar *rhs_base, Index depth, Index strideA, Index offsetA, Index strideB, Index row, Index col, Index rows, Index cols, Index remaining_rows, const Packet &pAlphaReal, const Packet &pAlphaImag, const Packet &pMask)
Definition MatrixProduct.h:1912
EIGEN_ALWAYS_INLINE void MICRO_COMPLEX_EXTRA_ROW(const Scalar *&lhs_ptr_real, const Scalar *&lhs_ptr_imag, const Scalar *&rhs_ptr_real, const Scalar *&rhs_ptr_imag, PacketBlock< Packet, 4 > &accReal, PacketBlock< Packet, 4 > &accImag, Index remaining_rows)
Definition MatrixProduct.h:1893
EIGEN_ALWAYS_INLINE void loadPacketRemaining(const Scalar *lhs, Packet &lhsV, Index remaining_rows)
Definition MatrixProduct.h:1035
EIGEN_STRONG_INLINE void symm_pack_complex_rhs_helper(std::complex< Scalar > *blockB, const std::complex< Scalar > *_rhs, Index rhsStride, Index rows, Index cols, Index k2)
Definition MatrixProduct.h:139
EIGEN_ALWAYS_INLINE void MICRO_EXTRA_ROW(const Scalar *&lhs_ptr, const Scalar *&rhs_ptr, PacketBlock< Packet, 4 > &accZero, Index remaining_rows)
Definition MatrixProduct.h:1369
EIGEN_ALWAYS_INLINE void MICRO_EXTRA_COL(const Scalar *&lhs_ptr, const Scalar *&rhs_ptr, PacketBlock< Packet, 1 > &accZero, Index remaining_rows, Index remaining_cols)
Definition MatrixProduct.h:1305
EIGEN_STRONG_INLINE Packet2d pset1< Packet2d >(const double &from)
Definition PacketMath.h:872
EIGEN_STRONG_INLINE void gemm_extra_row(const DataMapper &res, const Scalar *lhs_base, const Scalar *rhs_base, Index depth, Index strideA, Index offsetA, Index row, Index col, Index rows, Index cols, Index remaining_rows, const Packet &pAlpha, const Packet &pMask)
Definition MatrixProduct.h:1383
EIGEN_ALWAYS_INLINE void MICRO_COMPLEX_EXTRA_COL(const Scalar *&lhs_ptr_real, const Scalar *&lhs_ptr_imag, const Scalar *&rhs_ptr_real, const Scalar *&rhs_ptr_imag, PacketBlock< Packet, 1 > &accReal, PacketBlock< Packet, 1 > &accImag, Index remaining_rows, Index remaining_cols)
Definition MatrixProduct.h:1794
EIGEN_ALWAYS_INLINE void band(PacketBlock< Packet, 4 > &acc, const Packet &pMask)
Definition MatrixProduct.h:1180
EIGEN_STRONG_INLINE void gemm_unrolled_col(const DataMapper &res, const Scalar *lhs_base, const Scalar *rhs_base, Index depth, Index strideA, Index offsetA, Index &row, Index rows, Index col, Index remaining_cols, const Packet &pAlpha)
Definition MatrixProduct.h:1634
EIGEN_STRONG_INLINE void pstore< double >(double *to, const Packet4d &from)
Definition PacketMath.h:623
EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f &a, const Packet4f &b, const Packet4f &c)
Definition PacketMath.h:827
EIGEN_STRONG_INLINE Packet2d pload< Packet2d >(const double *from)
Definition PacketMath.h:967
EIGEN_ALWAYS_INLINE void storeBlock(Scalar *to, PacketBlock< Packet, 4 > &block)
Definition MatrixProduct.h:410
EIGEN_ALWAYS_INLINE Packet2d bmask< Packet2d >(const int remaining_rows)
Definition MatrixProduct.h:1267
EIGEN_ALWAYS_INLINE Packet bmask(const int remaining_rows)
Definition MatrixProduct.h:1253
EIGEN_ALWAYS_INLINE void pbroadcast4_old< Packet2d >(const double *a, Packet2d &a0, Packet2d &a1, Packet2d &a2, Packet2d &a3)
Definition MatrixProduct.h:1291
EIGEN_STRONG_INLINE Packet2cf pload2(const std::complex< float > *from0, const std::complex< float > *from1)
Definition Complex.h:130
EIGEN_STRONG_INLINE Packet8h pand(const Packet8h &a, const Packet8h &b)
Definition PacketMath.h:1050
EIGEN_STRONG_INLINE void gemm_extra_col(const DataMapper &res, const Scalar *lhs_base, const Scalar *rhs_base, Index depth, Index strideA, Index offsetA, Index row, Index col, Index remaining_rows, Index remaining_cols, const Packet &pAlpha)
Definition MatrixProduct.h:1320
EIGEN_STRONG_INLINE void gemm_complex_unrolled_col(const DataMapper &res, const Scalar *lhs_base, const Scalar *rhs_base, Index depth, Index strideA, Index offsetA, Index strideB, Index &row, Index rows, Index col, Index remaining_cols, const Packet &pAlphaReal, const Packet &pAlphaImag)
Definition MatrixProduct.h:2270
EIGEN_ALWAYS_INLINE void bscale(PacketBlock< Packet, 4 > &acc, PacketBlock< Packet, 4 > &accZ, const Packet &pAlpha)
Definition MatrixProduct.h:1137
EIGEN_ALWAYS_INLINE Packet ploadLhs(const Scalar *lhs)
Definition MatrixProduct.h:1114
EIGEN_ALWAYS_INLINE void pger(PacketBlock< Packet, N > *acc, const Scalar *lhs, const Packet *rhsV)
Definition MatrixProduct.h:1027
EIGEN_STRONG_INLINE void gemm_unrolled_iteration(const DataMapper &res, const Scalar *lhs_base, const Scalar *rhs_base, Index depth, Index strideA, Index offsetA, Index &row, Index col, const Packet &pAlpha)
Definition MatrixProduct.h:1561
EIGEN_ALWAYS_INLINE void bload(PacketBlock< Packet, 4 > &acc, const DataMapper &res, Index row, Index col)
Definition MatrixProduct.h:1199
EIGEN_ALWAYS_INLINE void bsetzero(PacketBlock< Packet, 4 > &acc)
Definition MatrixProduct.h:1121
EIGEN_STRONG_INLINE void gemm_complex_unrolled_col_iteration(const DataMapper &res, const Scalar *lhs_base, const Scalar *rhs_base, Index depth, Index strideA, Index offsetA, Index strideB, Index &row, Index col, Index remaining_cols, const Packet &pAlphaReal, const Packet &pAlphaImag)
Definition MatrixProduct.h:2216
Namespace containing all symbols from the Eigen library.
Definition bench_norm.cpp:85
EIGEN_DEFAULT_DENSE_INDEX_TYPE Index
The Index type as used for the API.
Definition Meta.h:74
Definition BandTriangularSolver.h:13
Definition BFloat16.h:88
Definition Complex.h:341
Definition Complex.h:31
EIGEN_STRONG_INLINE void operator()(std::complex< double > *blockB, const DataMapper &rhs, Index depth, Index cols, Index stride, Index offset)
Definition MatrixProduct.h:925
EIGEN_STRONG_INLINE void operator()(std::complex< double > *blockA, const DataMapper &lhs, Index depth, Index rows, Index stride, Index offset)
Definition MatrixProduct.h:812
Definition MatrixProduct.h:429
EIGEN_STRONG_INLINE void operator()(std::complex< Scalar > *blockA, const DataMapper &lhs, Index depth, Index rows, Index stride, Index offset)
Definition MatrixProduct.h:430
EIGEN_STRONG_INLINE void operator()(double *blockA, const DataMapper &lhs, Index depth, Index rows, Index stride, Index offset)
Definition MatrixProduct.h:658
EIGEN_STRONG_INLINE void operator()(double *blockB, const DataMapper &rhs, Index depth, Index cols, Index stride, Index offset)
Definition MatrixProduct.h:725
Definition MatrixProduct.h:572
EIGEN_STRONG_INLINE void operator()(Scalar *blockA, const DataMapper &lhs, Index depth, Index rows, Index stride, Index offset)
Definition MatrixProduct.h:573
Definition GeneralBlockPanelKernel.h:1058
EIGEN_DONT_INLINE void operator()(const DataMapper &res, const LhsScalar *blockA, const RhsScalar *blockB, Index rows, Index depth, Index cols, ResScalar alpha, Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0)
Definition GeneralBlockPanelKernel.h:1405
Definition BlasUtil.h:28
Definition BlasUtil.h:25
Definition GenericPacketMath.h:107
PacketBlock< Packet2d, 2 > rhstype
Definition MatrixProduct.h:72
PacketBlock< vectortype, 4 > type
Definition MatrixProduct.h:71
Packet2d vectortype
Definition MatrixProduct.h:70
Definition MatrixProduct.h:55
@ size
Definition MatrixProduct.h:62
@ rows
Definition MatrixProduct.h:63
@ vectorsize
Definition MatrixProduct.h:61
packet_traits< Scalar >::type vectortype
Definition MatrixProduct.h:56
vectortype rhstype
Definition MatrixProduct.h:58
PacketBlock< vectortype, 4 > type
Definition MatrixProduct.h:57
void operator()(double *blockA, const double *_lhs, Index lhsStride, Index cols, Index rows)
Definition MatrixProduct.h:392
void operator()(float *blockA, const float *_lhs, Index lhsStride, Index cols, Index rows)
Definition MatrixProduct.h:373
void operator()(std::complex< double > *blockA, const std::complex< double > *_lhs, Index lhsStride, Index cols, Index rows)
Definition MatrixProduct.h:354
void operator()(std::complex< float > *blockA, const std::complex< float > *_lhs, Index lhsStride, Index cols, Index rows)
Definition MatrixProduct.h:334
Definition SelfadjointMatrixMatrix.h:20
void operator()(double *blockB, const double *_rhs, Index rhsStride, Index rows, Index cols, Index k2)
Definition MatrixProduct.h:383
void operator()(float *blockB, const float *_rhs, Index rhsStride, Index rows, Index cols, Index k2)
Definition MatrixProduct.h:364
void operator()(std::complex< double > *blockB, const std::complex< double > *_rhs, Index rhsStride, Index rows, Index cols, Index k2)
Definition MatrixProduct.h:345
void operator()(std::complex< float > *blockB, const std::complex< float > *_rhs, Index rhsStride, Index rows, Index cols, Index k2)
Definition MatrixProduct.h:325
Definition SelfadjointMatrixMatrix.h:102
Definition ForwardDeclarations.h:17
Definition datatypes.h:12
Definition main.h:101
Definition main.h:100
std::ptrdiff_t j
Definition tut_arithmetic_redux_minmax.cpp:2
Definition PacketMath.h:47