TensorContractionSycl.h
1// This file is part of Eigen, a lightweight C++ template library for linear algebra.
2//
3// Mehdi Goli Codeplay Software Ltd.
4// Ralph Potter Codeplay Software Ltd.
5// Luke Iwanski Codeplay Software Ltd.
6// Contact: <eigen@codeplay.com>
7//
8// This Source Code Form is subject to the terms of the Mozilla Public License v. 2.0. If a copy of the MPL was not
9// distributed with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
10
11/*****************************************************************
12 * TensorContractionSycl.h
13 *
14 * \brief:
15 * TensorContractionSycl.h, provides various tensor contraction kernels for the SYCL backend
16 *
17 *****************************************************************/
18
19#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_SYCL_H
20#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_SYCL_H
21
22namespace Eigen {
23
24namespace TensorSycl {
25namespace internal {
26
27#ifndef EIGEN_SYCL_DISABLE_GEMV
42template <typename Scalar, typename StorageIndex, StorageIndex NCWindow, StorageIndex CFactor, StorageIndex NCFactor>
43struct TVPanelSize {
44 // LocalThreadSizeC: determines total number of threads per workgroup for the contracting dimension
45 static EIGEN_CONSTEXPR StorageIndex LocalThreadSizeC = EIGEN_SYCL_LOCAL_THREAD_DIM0;
46 // LocalThreadSizeNC: determines total number of threads per workgroup for the non-contracting dimension
47 static EIGEN_CONSTEXPR StorageIndex LocalThreadSizeNC = EIGEN_SYCL_LOCAL_THREAD_DIM1;
48 // TileSizeDimNC: determines the tile size for the non-contracting dimension
49 static EIGEN_CONSTEXPR StorageIndex TileSizeDimNC = NCWindow / NCFactor;
50 // TileSizeDimC: determines the tile size for the contracting dimension
52 // WorkLoadPerThreadNC : determines workload per thread for loading the non-contracting dimension
54 // WorkLoadPerThreadC: determines workload per thread for loading the contracting dimension
56 // BC : determines if supporting bank conflict is required
57 static EIGEN_CONSTEXPR bool BC = false;
58};
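// Worked example (illustrative values): with EIGEN_SYCL_LOCAL_THREAD_DIM0 == EIGEN_SYCL_LOCAL_THREAD_DIM1 == 16
// (an assumption, both macros are configurable), a panel instantiated with NCWindow = 64 and NCFactor = 4 gives
// LocalThreadSizeC = LocalThreadSizeNC = 16 and TileSizeDimNC = NCWindow / NCFactor = 16, i.e. each workgroup
// covers a 16-element window of the non-contracting dimension per tile.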
59#endif
60
78template <typename Scalar, typename StorageIndex, StorageIndex REG_SIZE_M, StorageIndex REG_SIZE_N, StorageIndex TSDK>
79struct TTPanelSize {
80 // TileSizeDimK: determines the tile size for dimension K. The packet size is taken into account.
81 static EIGEN_CONSTEXPR StorageIndex TileSizeDimK = TSDK;
82 // WorkLoadPerThreadM: determines workload per thread for loading the M dimension. This can be varied based on the
83 // available registers on a chosen device (it can be controlled by the EIGEN_SYCL_REG_M macro).
84#ifndef EIGEN_SYCL_REG_M
85 static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadM = REG_SIZE_M;
86#else
87 static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadM = EIGEN_SYCL_REG_M;
88#endif
89// WorkLoadPerThreadN: determines workload per thread for loading the N dimension. This can be varied based on the
90// available registers on a chosen device (it can be controlled by the EIGEN_SYCL_REG_N macro).
91#ifndef EIGEN_SYCL_REG_N
92 static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadN = REG_SIZE_N;
93#else
94 static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadN = EIGEN_SYCL_REG_N;
95#endif
96 // LocalThreadSizeM: determines total number of threads per workgroup for the m dimension
97 static EIGEN_CONSTEXPR StorageIndex LocalThreadSizeM = EIGEN_SYCL_LOCAL_THREAD_DIM0;
98 // LocalThreadSizeN: determines total number of threads per workgroup for the n dimension
99 static EIGEN_CONSTEXPR StorageIndex LocalThreadSizeN = EIGEN_SYCL_LOCAL_THREAD_DIM1;
100 // TileSizeDimM: determines the tile size for the m dimension
102 // TileSizeDimN: determines the tile size for the n dimension
104 // LoadPerThreadLhs: determines workload per thread for loading the Lhs tensor. This must be divisible by the packet size
105 static EIGEN_CONSTEXPR StorageIndex LoadPerThreadLhs =
107 // LoadPerThreadRhs: determines workload per thread for loading the Rhs tensor. This must be divisible by the packet size
108 static EIGEN_CONSTEXPR StorageIndex LoadPerThreadRhs =
110 // BC : determines if supporting bank conflict is required
111 static EIGEN_CONSTEXPR bool BC = true;
112 // DoubleBuffer: determines if the double buffering technique should be used (this can be disabled by the
113 // EIGEN_SYCL_DISABLE_DOUBLE_BUFFER macro when the device does not have sufficient local memory)
114 static EIGEN_CONSTEXPR bool DoubleBuffer =
115#ifdef EIGEN_SYCL_DISABLE_DOUBLE_BUFFER
116 false;
117#else
118 true;
119#endif
120};
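// Usage note (illustrative): if the panel is instantiated with REG_SIZE_M = 4 but the translation unit is built
// with -DEIGEN_SYCL_REG_M=8, WorkLoadPerThreadM becomes 8; without the macro it stays 4 (likewise for
// WorkLoadPerThreadN and EIGEN_SYCL_REG_N). Defining EIGEN_SYCL_DISABLE_DOUBLE_BUFFER sets DoubleBuffer to false,
// which removes the (DoubleBuffer + 1) doubling factor from the local-memory tile footprint used further below.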
121
122/*!
123 * \brief contraction_type: an enum class representing the Tensor Contraction implementation algorithm. This is used to
124 * specialize the contraction algorithm based on device support for dedicated local memory.
125 */
126enum class contraction_type { local, no_local };
127/*!
128 * \brief data_source: an enum class determining the location of the data in the memory hierarchy (global, local, private).
129 */
130enum class data_source { global_mem, local_mem, private_mem };
131
157template <bool PacketLoad, bool is_coalesced_layout, bool, typename PacketType, typename TensorMapper,
158 typename StorageIndex>
159static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ::Eigen::internal::enable_if<PacketLoad, PacketType>::type read(
160 const TensorMapper &tensorMapper, const StorageIndex &NCIndex, const StorageIndex &CIndex, const StorageIndex &ld) {
161 const StorageIndex row = (is_coalesced_layout) ? NCIndex : CIndex;
162 const StorageIndex col = (is_coalesced_layout) ? CIndex : NCIndex;
163 return tensorMapper.get_tensor().template packet<Unaligned>(row + (col * ld));
164}
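// Worked example (illustrative values): for a coalesced layout with NCIndex = 4, CIndex = 2 and ld = 128, the
// overload above selects row = 4, col = 2 and loads the packet starting at linear element 4 + 2 * 128 = 260 of
// the underlying tensor; for a non-coalesced layout the roles of NCIndex and CIndex are swapped.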
165
188template <bool PacketLoad, bool, bool IsRhs, typename PacketType, typename TensorMapper, typename StorageIndex>
189static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ::Eigen::internal::enable_if<!PacketLoad, PacketType>::type read(
190 const TensorMapper &tensorMapper, const StorageIndex &NCIndex, const StorageIndex &CIndex, const StorageIndex &) {
191 const StorageIndex row = (IsRhs) ? CIndex : NCIndex;
192 const StorageIndex col = (IsRhs) ? NCIndex : CIndex;
193 return tensorMapper(row, col);
194}
195
217template <typename StorageIndex, StorageIndex ld, data_source dt, typename PacketType, typename DataScalar>
219 typename ::Eigen::internal::enable_if<dt != data_source::global_mem, void>::type
220 write(PacketType &packet_data, DataScalar ptr) {
223 for (int i = 0; i < PacketSize; i++) {
225 ptr += ld;
226 }
227}
228
244template <data_source dt, typename PacketType, typename DataScalar>
245static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ::Eigen::internal::enable_if<
247write(PacketType &packet_data, DataScalar *ptr) {
249}
250
264template <data_source dt, typename PacketType, typename DataScalar>
265static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ::Eigen::internal::enable_if<
267write(PacketType &packet_data, DataScalar *ptr) {
268 *ptr = packet_data;
269}
270
276template <bool is_internal>
277static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool check_boundary(bool) {
278 return true;
279}
280
286template <>
287EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool check_boundary<false>(bool cond) {
288 return cond;
289}
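// Note: check_boundary<true>(...) ignores its argument and always returns true, so for internal (full) blocks the
// boundary test can be optimised away at compile time, while the <false> specialisation forwards the runtime
// condition so that edge blocks still guard their memory accesses.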
290
317template <bool is_transposed, bool is_rhs_, bool packet_load_, typename PacketType>
328
368template <typename StorageIndex>
369struct ThreadProperties {
370 const StorageIndex linearLocalThreadId;
371 const StorageIndex kGroupId;
372 const StorageIndex mGroupOffset;
373 const StorageIndex nGroupOffset;
374 const StorageIndex kGroupOffset;
375 const StorageIndex mLocalOffset;
376 const StorageIndex nLocalOffset;
377 const StorageIndex mGlobalOffset;
378 const StorageIndex nGlobalOffset;
379 StorageIndex kSize;
380 const bool is_internal;
381 // this is used to adjust the last block
382 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ThreadProperties(
383 const StorageIndex linearLocalThreadId_, const StorageIndex kGroupId_, const StorageIndex mGroupOffset_,
384 const StorageIndex nGroupOffset_, const StorageIndex kGroupOffset_, const StorageIndex mLocalOffset_,
385 const StorageIndex nLocalOffset_, const StorageIndex mGlobalOffset_, const StorageIndex nGlobalOffset_,
386 StorageIndex kSize_, const bool is_internal_)
387 : linearLocalThreadId(linearLocalThreadId_),
388 kGroupId(kGroupId_),
389 mGroupOffset(mGroupOffset_),
390 nGroupOffset(nGroupOffset_),
391 kGroupOffset(kGroupOffset_),
392 mLocalOffset(mLocalOffset_),
393 nLocalOffset(nLocalOffset_),
394 mGlobalOffset(mGlobalOffset_),
395 nGlobalOffset(nGlobalOffset_),
396 kSize(kSize_),
397 is_internal(is_internal_) {}
398};
399
450template <typename OutScalar, typename LhsScalar, typename RhsScalar, typename OutAccessor, typename LhsMapper,
451 typename RhsMapper, typename StorageIndex, typename Properties, typename TripleDim, bool Vectorizable,
452 typename input_mapper_properties, bool IsFinal, contraction_type contraction_tp>
453class TensorContractionKernel {
454 public:
455 typedef typename Eigen::TensorSycl::internal::Vectorise<OutScalar, Eigen::SyclDevice, Vectorizable>::PacketReturnType
458 Eigen::TensorSycl::internal::Vectorise<OutScalar, Eigen::SyclDevice, Vectorizable>::PacketSize;
463
464 typedef BlockProperties<is_lhs_transposed, false, input_mapper_properties::is_lhs_matrix && Vectorizable,
465 PacketReturnType>
466 LHSBlockProperties;
467
468 typedef BlockProperties<is_rhs_transposed, true, input_mapper_properties::is_rhs_matrix && Vectorizable,
469 PacketReturnType>
470 RHSBlockProperties;
471
472 static EIGEN_CONSTEXPR StorageIndex NStride =
473 contraction_tp == contraction_type::local ? Properties::WorkLoadPerThreadN : RHSBlockProperties::nc_stride;
474
475 typedef cl::sycl::accessor<OutScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local> Scratch;
476 typedef cl::sycl::multi_ptr<OutScalar, cl::sycl::access::address_space::local_space> local_ptr;
477 typedef OutScalar * /*cl::sycl::multi_ptr<OutScalar, cl::sycl::access::address_space::private_space>*/ private_ptr;
478 typedef
479 typename ::Eigen::internal::conditional<contraction_tp == contraction_type::local, local_ptr, private_ptr>::type
481 static EIGEN_CONSTEXPR StorageIndex LSDL = contraction_tp == contraction_type::local
482 ? Properties::TileSizeDimM + Properties::BC
483 : Properties::WorkLoadPerThreadM;
484 static EIGEN_CONSTEXPR StorageIndex LSDR = contraction_tp == contraction_type::local
485 ? Properties::TileSizeDimN + Properties::BC
486 : Properties::WorkLoadPerThreadN;
487 static EIGEN_CONSTEXPR StorageIndex LocalOffset = Properties::LocalThreadSizeM * Properties::LocalThreadSizeN;
488
501 template <contraction_type, StorageIndex>
502 struct MemHolder {
504 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE MemHolder(local_ptr block_start_ptr) : ptr(block_start_ptr) {}
505 };
509 template <StorageIndex MemSize>
510 struct MemHolder<contraction_type::no_local, MemSize> {
511 OutScalar ptr[MemSize] = {OutScalar{0}};
512 };
535 struct TiledMemory {
540 const std::pair<StorageIndex, StorageIndex> lhs_extract_index;
541 const std::pair<StorageIndex, StorageIndex> rhs_extract_index;
542 template <contraction_type tp = contraction_tp>
545 typename ::Eigen::internal::enable_if<tp == contraction_type::no_local>::type * = 0)
550 lhs_extract_index(std::pair<StorageIndex, StorageIndex>(StorageIndex{0}, StorageIndex{0})),
551 rhs_extract_index(std::pair<StorageIndex, StorageIndex>(StorageIndex{0}, StorageIndex{0})) {}
552
553 template <contraction_type tp = contraction_tp>
555 TiledMemory(const ThreadProperties<StorageIndex> &thread_properties, local_ptr block_start_ptr,
556 typename ::Eigen::internal::enable_if<tp == contraction_type::local>::type * = 0)
557 : lhs_scratch_extract{block_start_ptr},
559 ((Properties::DoubleBuffer + 1) * LSDL * Properties::TileSizeDimK)},
560 lhs_scratch_ptr_compute(lhs_scratch_extract.ptr + thread_properties.mLocalOffset),
561 rhs_scratch_ptr_compute(rhs_scratch_extract.ptr + thread_properties.nLocalOffset),
563 local_id_extract<LHSBlockProperties, Properties::TileSizeDimM>(thread_properties.linearLocalThreadId)),
565 local_id_extract<RHSBlockProperties, Properties::TileSizeDimN>(thread_properties.linearLocalThreadId)) {}
566 };
567
568 Scratch scratch;
569 const LhsMapper lhs;
570 const RhsMapper rhs;
571 OutAccessor out_res;
572 const StorageIndex groupSizeM;
573 const StorageIndex groupSizeN;
574 const StorageIndex numTiles;
575 const TripleDim triple_dim;
576
577 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorContractionKernel(Scratch scratch_, const LhsMapper lhs_,
578 const RhsMapper rhs_, OutAccessor out_res_,
579 const StorageIndex groupSizeM_,
580 const StorageIndex groupSizeN_,
581 const StorageIndex numTiles_,
582 const TripleDim triple_dim_)
583 : scratch(scratch_),
584 lhs(lhs_),
585 rhs(rhs_),
586 out_res(out_res_),
587 groupSizeM(groupSizeM_),
588 groupSizeN(groupSizeN_),
589 numTiles(numTiles_),
590 triple_dim(triple_dim_) {}
591
592 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorContractionKernel(Scratch scratch_, const LhsMapper lhs_,
593 const RhsMapper rhs_, OutAccessor out_res_,
594 const StorageIndex groupSizeM_,
595 const StorageIndex numTiles_,
596 const TripleDim triple_dim_)
597 : TensorContractionKernel(scratch_, lhs_, rhs_, out_res_, groupSizeM_, 1, numTiles_, triple_dim_) {}
598
599 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) {
600 const StorageIndex linearLocalThreadId = itemID.get_local_id(0);
601 const StorageIndex nLocalThreadId = linearLocalThreadId / Properties::LocalThreadSizeM;
602 const StorageIndex mLocalThreadId = linearLocalThreadId % Properties::LocalThreadSizeM;
603 const StorageIndex mGroupId = itemID.get_group(0) % groupSizeM;
604 const StorageIndex tmp = itemID.get_group(0) / groupSizeM;
605 const StorageIndex nGroupId = IsFinal ? tmp : tmp % groupSizeN;
606 const StorageIndex kGroupId = IsFinal ? 0 : tmp / groupSizeN;
607 const StorageIndex mGroupOffset = mGroupId * Properties::TileSizeDimM;
608 const StorageIndex nGroupOffset = nGroupId * Properties::TileSizeDimN;
609 const StorageIndex mLocalOffset = PacketSize * mLocalThreadId;
610 const StorageIndex nLocalOffset = NStride * nLocalThreadId;
611 const StorageIndex mGlobalOffset = mGroupOffset + mLocalOffset;
612 const StorageIndex nGlobalOffset = nGroupOffset + nLocalOffset;
613
614 const StorageIndex kSizePerWG = IsFinal ? triple_dim.K : numTiles * Properties::TileSizeDimK;
615 StorageIndex kGroupOffset = kGroupId * kSizePerWG;
616 const bool is_internal = triple_dim.M - mGroupOffset >= Properties::TileSizeDimM &&
617 triple_dim.N - nGroupOffset >= Properties::TileSizeDimN &&
618 triple_dim.K - kGroupOffset >= kSizePerWG;
619 // this is used to adjust the last block
620 StorageIndex kSize = IsFinal ? triple_dim.K : std::min(kSizePerWG, triple_dim.K - kGroupOffset);
621 // This is used to find out the last K offset so that kGroupOffset - kSize can compute the K offset for loading
622 // the tile
623 kGroupOffset += kSize;
624
625 auto thread_properties =
626 ThreadProperties<StorageIndex>(linearLocalThreadId, kGroupId, mGroupOffset, nGroupOffset, kGroupOffset,
627 mLocalOffset, nLocalOffset, mGlobalOffset, nGlobalOffset, kSize, is_internal);
628
629 auto out_ptr = out_res.get_pointer() + (IsFinal ? 0 : thread_properties.kGroupId * triple_dim.M * triple_dim.N);
630
631 (thread_properties.is_internal) ? compute_panel<true>(itemID, thread_properties, out_ptr)
632 : compute_panel<false>(itemID, thread_properties, out_ptr);
633 }
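// Worked example (illustrative values): with groupSizeM = 4, groupSizeN = 2 and IsFinal == false, a work-group
// with itemID.get_group(0) == 11 decomposes as mGroupId = 11 % 4 = 3, tmp = 11 / 4 = 2, nGroupId = 2 % 2 = 0 and
// kGroupId = 2 / 2 = 1, i.e. the linear group index enumerates M first, then N, then the K slices.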
634 // The compute block computes the contraction operation for each thread's private block and stores the result in
635 // the privateRes memory of each computation. The compute block function is independent of the local and no-local
636 // concepts as it only computes the block in each thread's private memory space.
637 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_block_per_tile(OutScalar *lhs_block_ptr, OutScalar *rhs_block_ptr,
638 PacketReturnType *privateRes) {
639 StorageIndex idx = 0;
640 EIGEN_CONSTEXPR StorageIndex lhs_stride =
641 contraction_tp == contraction_type::local ? (PacketSize * Properties::LocalThreadSizeM) : 1;
643 for (StorageIndex wLPTN = 0; wLPTN < Properties::WorkLoadPerThreadN; wLPTN++) {
644 auto rhsPacket = PacketReturnType{*(rhs_block_ptr + wLPTN)};
645 StorageIndex lhs_index = 0;
647 for (StorageIndex wLPTM = 0; wLPTM < Properties::WorkLoadPerThreadM / PacketSize; wLPTM++) {
648 PacketReturnType lhsPack{};
650 lhs_block_ptr + lhs_index);
651 privateRes[idx] = ::Eigen::internal::pmadd(lhsPack, rhsPacket, privateRes[idx]);
652
653 lhs_index += lhs_stride;
654 idx++;
655 }
656 }
657 }
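// Worked example (illustrative values): with WorkLoadPerThreadM = 8, WorkLoadPerThreadN = 4 and PacketSize = 4,
// the loops above run 4 * (8 / 4) = 8 iterations per k step, so each thread keeps 8 accumulator packets in
// privateRes and issues 8 pmadd operations per element of the K tile.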
658 // The store function writes the contraction result computed in the private memory of each thread to the global
659 // memory. The store function is independent of the local and no-local concepts so that it can be abstracted out in
660 // the base class.
661 template <bool is_internal_block, StorageIndex PrivateNStride, typename OutPtr>
663 StorageIndex mGlobalOffset, StorageIndex nGlobalOffset) {
664 auto chk_bound = [&](const StorageIndex &mIndex, const StorageIndex &nIndex) EIGEN_DEVICE_FUNC {
665 return (mIndex + PacketSize - 1 < triple_dim.M && nGlobalOffset + nIndex < triple_dim.N);
666 };
667 // When local memory is not used, M and N are both accessed in a coalesced way. However, when local memory is
668 // available, the K*N block is transposed in local memory to N*K; therefore, each block operates on a blockId *
669 // WorkLoadPerThreadN slice of N.
670 EIGEN_CONSTEXPR StorageIndex GlobalNStride =
671 contraction_tp == contraction_type::local ? 1 : Properties::LocalThreadSizeN;
673 for (StorageIndex wLPTN = 0; wLPTN < Properties::WorkLoadPerThreadN / PrivateNStride; wLPTN++) {
674 // output leading dimension
675 StorageIndex outputLD = 0;
676 // When local memory is used, PrivateNStride is always 1 because the coalesced access on N is loaded into local
677 // memory, and extracting from local to global is the same as in the non-transposed version. However, when local
678 // memory is not used and RHS is transposed, we packetize the load for RHS.
680 for (StorageIndex nId = 0; nId < PrivateNStride; nId++) {
681 StorageIndex globalRow = mGlobalOffset;
683 for (StorageIndex wLPTM = 0; wLPTM < Properties::WorkLoadPerThreadM / PacketSize; wLPTM++) {
684 PacketReturnType privetOut = privateRes[wLPTM];
685 if (check_boundary<is_internal_block>(chk_bound(globalRow, nId))) {
686 // Store the final results in C. The C matrix always has M as its first StorageIndex and N as its second
687 // StorageIndex; therefore, it always has a coalesced layout.
688 write<data_source::global_mem>(privetOut, out_ptr + outputLD + globalRow);
689 } else {
691 for (StorageIndex mId = 0; mId < PacketSize; mId++) {
692 StorageIndex mOffset = globalRow + mId;
693 if (mOffset < triple_dim.M && (nGlobalOffset + nId < triple_dim.N)) {
694 out_ptr[mOffset + outputLD] =
696 }
697 }
698 }
699 globalRow += (PacketSize * Properties::LocalThreadSizeM);
700 }
701 outputLD += triple_dim.M;
702 privateRes += Properties::WorkLoadPerThreadM / PacketSize;
703 }
704 out_ptr += (GlobalNStride * outputLD);
705
706 nGlobalOffset += (PrivateNStride * GlobalNStride);
707 }
708 }
709 // when no local memory is used the following extract_block will be enabled
710 template <typename InputBlockProperties, bool is_internal_block, typename Input, typename PrivateReg,
711 contraction_type contract_tp = contraction_tp>
713 typename ::Eigen::internal::enable_if<contract_tp == contraction_type::no_local>::type
714 extract_block(const Input &inpt, PrivateReg private_ptr, const std::pair<StorageIndex, StorageIndex> &,
715 const StorageIndex &ncOffset, const StorageIndex cOffset) {
716 EIGEN_CONSTEXPR StorageIndex LocalThreadSizeNC =
717 InputBlockProperties::is_rhs ? Properties::LocalThreadSizeN : Properties::LocalThreadSizeM;
718 EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadNC =
719 InputBlockProperties::is_rhs ? Properties::WorkLoadPerThreadN : Properties::WorkLoadPerThreadM;
720 const StorageIndex &NC = InputBlockProperties::is_rhs ? triple_dim.N : triple_dim.M;
721
722 auto chk_bound = [&](const StorageIndex &CIndex, const StorageIndex &NCIndex) EIGEN_DEVICE_FUNC {
723 return ((CIndex + InputBlockProperties::c_stride - 1 < triple_dim.K) &&
724 (NCIndex + InputBlockProperties::nc_stride - 1 < NC));
725 };
726 const StorageIndex ld = InputBlockProperties::is_coalesced_layout ? NC : triple_dim.K;
727 StorageIndex cIndex = cOffset;
728
730 for (StorageIndex cId = 0; cId < Properties::TileSizeDimK / InputBlockProperties::c_stride; cId++) {
731 StorageIndex ncIndex = ncOffset;
733 for (StorageIndex ncId = 0; ncId < WorkLoadPerThreadNC / InputBlockProperties::nc_stride; ncId++) {
734 if (check_boundary<is_internal_block>(chk_bound(cIndex, ncIndex))) {
735 auto val =
736 read<InputBlockProperties::packet_load, InputBlockProperties::is_coalesced_layout,
737 InputBlockProperties::is_rhs, typename InputBlockProperties::OutType>(inpt, ncIndex, cIndex, ld);
738
739 write<StorageIndex, (InputBlockProperties::is_coalesced_layout ? 1 : WorkLoadPerThreadNC),
741 } else {
743 for (StorageIndex i = 0; i < InputBlockProperties::elements_per_access; i++) {
744 const StorageIndex ncInd = ncIndex + (InputBlockProperties::is_coalesced_layout ? i : 0);
745 const StorageIndex cInd = cIndex + (InputBlockProperties::is_coalesced_layout ? 0 : i);
746 OutScalar val =
747 (ncInd < NC && cInd < triple_dim.K)
748 ? read<false, InputBlockProperties::is_coalesced_layout, InputBlockProperties::is_rhs, OutScalar>(
749 inpt, ncInd, cInd, ld)
750 : OutScalar(0);
751 write<StorageIndex, (InputBlockProperties::is_coalesced_layout ? 1 : WorkLoadPerThreadNC),
753 val, private_ptr + (InputBlockProperties::is_coalesced_layout ? i : 0) +
754 ((InputBlockProperties::is_coalesced_layout ? 0 : i) * WorkLoadPerThreadNC));
755 }
756 }
757
758 // If it is lhs, we have to load it packetised when the packet size is > 1, because the output is coalesced. So
759 // even if M is not accessed in a coalesced mode, we have to load packet_size m elements per thread.
760 ncIndex = (!InputBlockProperties::is_rhs && InputBlockProperties::nc_stride == 1 && PacketSize != 1)
761 ? ncOffset + (ncId + 1) % PacketSize + ((ncId + 1) / PacketSize) * LocalThreadSizeNC
762 : (ncIndex + InputBlockProperties::nc_stride * LocalThreadSizeNC);
763 private_ptr += InputBlockProperties::nc_stride;
764 }
765 // the previous for loop (private_ptr += (ncId * nc_stride)) has already advanced the pointer by one WorkLoadPerThreadNC
766 private_ptr += (InputBlockProperties::c_stride - 1) * WorkLoadPerThreadNC;
767 cIndex += InputBlockProperties::c_stride;
768 }
769 }
770 template <typename InputBlockProperties, StorageIndex TileSizeDimNC>
771 static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::pair<StorageIndex, StorageIndex> local_id_extract(
772 const StorageIndex &linearLocalThreadId) {
773 const StorageIndex localThreadNC =
774 (InputBlockProperties::is_coalesced_layout)
775 ? linearLocalThreadId % (TileSizeDimNC / InputBlockProperties::nc_stride)
776 : linearLocalThreadId / (Properties::TileSizeDimK / InputBlockProperties::c_stride);
777 const StorageIndex localThreadC =
778 (InputBlockProperties::is_coalesced_layout)
779 ? linearLocalThreadId / (TileSizeDimNC / InputBlockProperties::nc_stride)
780 : linearLocalThreadId % (Properties::TileSizeDimK / InputBlockProperties::c_stride);
781 return std::pair<StorageIndex, StorageIndex>(localThreadNC, localThreadC);
782 }
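// Worked example (illustrative values): for a coalesced layout with TileSizeDimNC / nc_stride = 16 and
// linearLocalThreadId = 37, local_id_extract returns localThreadNC = 37 % 16 = 5 and localThreadC = 37 / 16 = 2,
// so consecutive thread ids walk the non-contracting dimension first.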
783
784 template <bool db = Properties::DoubleBuffer, contraction_type ctp = contraction_tp>
786 typename ::Eigen::internal::enable_if<db && ctp == contraction_type::local>::type
787 sync_mem(const cl::sycl::nd_item<1> &, bool &db_offset) noexcept {
788 db_offset = !db_offset;
789 }
790
791 template <bool db = Properties::DoubleBuffer, contraction_type ctp = contraction_tp>
793 typename ::Eigen::internal::enable_if<!db && ctp == contraction_type::local>::type
794 sync_mem(const cl::sycl::nd_item<1> &itemID, bool &) noexcept {
795 itemID.barrier(cl::sycl::access::fence_space::local_space);
796 }
797
798 template <contraction_type ctp = contraction_tp>
800 typename ::Eigen::internal::enable_if<ctp == contraction_type::no_local>::type
801 sync_mem(const cl::sycl::nd_item<1> &, bool &) noexcept {
802 return;
803 }
804
805 template <bool need_sync, contraction_type ctp = contraction_tp>
807 typename ::Eigen::internal::enable_if<need_sync && ctp == contraction_type::no_local>::type
808 sync_thread(const cl::sycl::nd_item<1> &
809#ifdef EIGEN_SYCL_ARM_GPU_CACHE_OPTIMISATION
810 itemID
811#endif
812 ) noexcept {
813#ifdef EIGEN_SYCL_ARM_GPU_CACHE_OPTIMISATION
814 itemID.barrier(cl::sycl::access::fence_space::local_space);
815#else
816 return;
817#endif
818 }
819 template <bool need_sync, contraction_type ctp = contraction_tp>
821 typename ::Eigen::internal::enable_if<need_sync && ctp == contraction_type::local>::type
822 sync_thread(const cl::sycl::nd_item<1> &itemID) {
823 itemID.barrier(cl::sycl::access::fence_space::local_space);
824 }
825 template <bool need_sync>
826 static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ::Eigen::internal::enable_if<!need_sync>::type sync_thread(
827 const cl::sycl::nd_item<1> &) {
828 return;
829 }
830
831 template <bool is_internal_block>
832 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_tile_per_panel(const cl::sycl::nd_item<1> &itemID,
833 ThreadProperties<StorageIndex> &thread_properties,
834 TiledMemory &tiled_input_block,
835 PacketReturnType *privateRes, bool &db_offset) {
836 // Tiling the Rhs block from global to local memory
837 extract_block<RHSBlockProperties, is_internal_block>(
838 rhs, tiled_input_block.rhs_scratch_extract.ptr + (db_offset * Properties::TileSizeDimK * LSDR),
839 tiled_input_block.rhs_extract_index,
840 contraction_tp == contraction_type::local ? thread_properties.nGroupOffset : thread_properties.nGlobalOffset,
841 thread_properties.kGroupOffset - thread_properties.kSize);
842
843 sync_thread<contraction_tp == contraction_type::no_local>(itemID);
844
845 // Tiling the Lhs block from global to local memory
846 extract_block<LHSBlockProperties, is_internal_block>(
847 lhs, tiled_input_block.lhs_scratch_extract.ptr + (db_offset * LSDL * Properties::TileSizeDimK),
848 tiled_input_block.lhs_extract_index,
849 contraction_tp == contraction_type::local ? thread_properties.mGroupOffset : thread_properties.mGlobalOffset,
850 thread_properties.kGroupOffset - thread_properties.kSize);
851
852 // itemID.barrier(cl::sycl::access::fence_space::local_space);
853 sync_thread<contraction_tp == contraction_type::local>(itemID);
854 // switch to compute mode
855 StorageIndex lhs_offset = (db_offset * LSDL * Properties::TileSizeDimK);
856 StorageIndex rhs_offset = (db_offset * Properties::TileSizeDimK * LSDR);
857 // Loop over the values of a single tile
858 for (StorageIndex k = 0; k < Properties::TileSizeDimK; k++) {
859 compute_block_per_tile(tiled_input_block.lhs_scratch_ptr_compute + lhs_offset,
860 tiled_input_block.rhs_scratch_ptr_compute + rhs_offset, privateRes);
861 lhs_offset += LSDL;
862 rhs_offset += LSDR;
863 }
864 // computing the K index for the next tile
865 thread_properties.kSize -= Properties::TileSizeDimK;
866 sync_mem(itemID, db_offset);
867 }
868
869 // when local memory is available the following compute_panel will be enabled
870 template <bool is_internal_block, typename OutPtr>
871 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_panel(const cl::sycl::nd_item<1> &itemID,
872 ThreadProperties<StorageIndex> &thread_properties,
873 OutPtr out_ptr) {
874 auto tiled_input_block = TiledMemory{thread_properties, scratch.get_pointer()};
875 // Allocate register space
876 PacketReturnType privateRes[Properties::WorkLoadPerThreadM * Properties::WorkLoadPerThreadN / PacketSize] = {
877 PacketReturnType{0}};
878 bool db_offset = 0;
879
880 while (thread_properties.kSize >= Properties::TileSizeDimK) {
881 compute_tile_per_panel<is_internal_block>(itemID, thread_properties, tiled_input_block, privateRes, db_offset);
882 }
883 if (thread_properties.kSize > 0) {
884 compute_tile_per_panel<false>(itemID, thread_properties, tiled_input_block, privateRes, db_offset);
885 }
886
887 // Storing the final results in the output
888 store<is_internal_block,
889 contraction_tp == contraction_type::local ? static_cast<StorageIndex>(1) : RHSBlockProperties::nc_stride>(
890 out_ptr + thread_properties.nGlobalOffset * triple_dim.M, privateRes, thread_properties.mGlobalOffset,
891 thread_properties.nGlobalOffset);
892 }
893 // When local memory is available the following extract_block will be enabled
894 template <typename InputBlockProperties, bool is_internal_block, typename Input, typename Local,
895 contraction_type contract_tp = contraction_tp>
897 typename ::Eigen::internal::enable_if<contract_tp == contraction_type::local>::type
898 extract_block(const Input &inpt, Local local_ptr, const std::pair<StorageIndex, StorageIndex>& local_index,
899 const StorageIndex &ncOffset, const StorageIndex cOffset) {
900 EIGEN_CONSTEXPR StorageIndex TileSizeDimNC =
901 InputBlockProperties::is_rhs ? Properties::TileSizeDimN : Properties::TileSizeDimM;
902 EIGEN_CONSTEXPR StorageIndex LoadPerThread =
903 InputBlockProperties::is_rhs ? Properties::LoadPerThreadRhs : Properties::LoadPerThreadLhs;
904 EIGEN_CONSTEXPR StorageIndex LSD = InputBlockProperties::is_rhs ? LSDR : LSDL;
905 static_assert(((LocalOffset % (TileSizeDimNC / InputBlockProperties::nc_stride) == 0) &&
906 (LocalOffset % (Properties::TileSizeDimK / InputBlockProperties::c_stride) == 0)),
907 " LocalOffset must be divisable by stride");
908 const StorageIndex &NC = InputBlockProperties::is_rhs ? triple_dim.N : triple_dim.M;
909 StorageIndex localThreadNC = local_index.first;
910 StorageIndex localThreadC = local_index.second;
911 auto chk_bound = [&](const StorageIndex &CIndex, const StorageIndex &NCIndex) EIGEN_DEVICE_FUNC {
912 return ((CIndex + InputBlockProperties::c_stride - 1 < triple_dim.K) &&
913 (NCIndex + InputBlockProperties::nc_stride - 1 < NC));
914 };
916 for (StorageIndex lPT = 0; lPT < LoadPerThread / InputBlockProperties::elements_per_access; lPT++) {
917 const StorageIndex CIndex = cOffset + (InputBlockProperties::c_stride * localThreadC);
918 const StorageIndex NCIndex = ncOffset + (InputBlockProperties::nc_stride * localThreadNC);
919 const StorageIndex ld = InputBlockProperties::is_coalesced_layout ? NC : triple_dim.K;
920 if (check_boundary<is_internal_block>(chk_bound(CIndex, NCIndex))) {
921 auto val =
922 read<InputBlockProperties::packet_load, InputBlockProperties::is_coalesced_layout,
923 InputBlockProperties::is_rhs, typename InputBlockProperties::OutType>(inpt, NCIndex, CIndex, ld);
924 write<StorageIndex, (InputBlockProperties::is_coalesced_layout ? 1 : LSD), data_source::local_mem>(
925 val, local_ptr + (InputBlockProperties::nc_stride * localThreadNC) +
926 (InputBlockProperties::c_stride * localThreadC * LSD));
927 } else {
929 for (StorageIndex i = 0; i < InputBlockProperties::elements_per_access; i++) {
930 const StorageIndex nCInd = NCIndex + (InputBlockProperties::is_coalesced_layout ? i : 0);
931 const StorageIndex cInd = CIndex + (InputBlockProperties::is_coalesced_layout ? 0 : i);
932 OutScalar val =
933 (nCInd < NC && cInd < triple_dim.K)
934 ? read<false, InputBlockProperties::is_coalesced_layout, InputBlockProperties::is_rhs, OutScalar>(
935 inpt, nCInd, cInd, ld)
936 : OutScalar(0);
937
938 write<StorageIndex, (InputBlockProperties::is_coalesced_layout ? 1 : LSD), data_source::local_mem>(
939 val, local_ptr + (InputBlockProperties::nc_stride * localThreadNC) +
940 (InputBlockProperties::is_coalesced_layout ? i : 0) +
941 ((InputBlockProperties::c_stride * localThreadC +
942 (InputBlockProperties::is_coalesced_layout ? 0 : i)) *
943 LSD));
944 }
945 }
946 localThreadNC += (InputBlockProperties::is_coalesced_layout)
947 ? LocalOffset % (TileSizeDimNC / InputBlockProperties::nc_stride)
948 : LocalOffset / (Properties::TileSizeDimK / InputBlockProperties::c_stride);
949 localThreadC += (InputBlockProperties::is_coalesced_layout)
950 ? LocalOffset / (TileSizeDimNC / InputBlockProperties::nc_stride)
951 : LocalOffset % (Properties::TileSizeDimK / InputBlockProperties::c_stride);
952 }
953 }
954};
955
956#ifndef EIGEN_SYCL_DISABLE_GEMV
957
999template <typename OutScalar, typename OutAccessor, typename VectorMapper, typename TensorMapper, typename StorageIndex,
1000 typename Properties, StorageIndex KFactor, bool Vectorizable, bool is_lhs_vec, bool IsFinal>
1002 typedef typename Eigen::TensorSycl::internal::Vectorise<OutScalar, Eigen::SyclDevice, Vectorizable>::PacketReturnType
1005 Eigen::TensorSycl::internal::Vectorise<OutScalar, Eigen::SyclDevice, Vectorizable>::PacketSize;
1006 typedef cl::sycl::accessor<OutScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local> Scratch;
1007
1008 static EIGEN_CONSTEXPR StorageIndex OutScratchOffset =
1009 KFactor * Properties::LocalThreadSizeC * Properties::LocalThreadSizeNC;
1010
1011 // Since the access layout for a vector can always be coalesced, when LHS is a vector, we pass false and false to
1012 // make sure that the !^ is true. When RHS is a vector, we pass true and true to make sure that the !^ is true.
1015
1016 Scratch scratch;
1017 const VectorMapper vec;
1018 const TensorMapper mat;
1019 OutAccessor out_res;
1020 const StorageIndex nonContractGroupSize;
1021 const StorageIndex nonContractDim;
1022 const StorageIndex contractDim;
1023
1025 const TensorMapper mat_, OutAccessor out_res_,
1026 const StorageIndex nonContractGroupSize_,
1027 const StorageIndex nonContractDim_,
1028 const StorageIndex contractDim_)
1029 : scratch(scratch_),
1030 vec(vec_),
1031 mat(mat_),
1032 out_res(out_res_),
1033 nonContractGroupSize(nonContractGroupSize_),
1034 nonContractDim(nonContractDim_),
1035 contractDim(contractDim_) {}
1036
1037 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) {
1038 auto scratch_ptr = scratch.get_pointer();
1039 const StorageIndex linearLocalThreadId = itemID.get_local_id(0);
1040 StorageIndex nonContractId = is_lhs_vec ? linearLocalThreadId / Properties::LocalThreadSizeC
1041 : linearLocalThreadId % Properties::LocalThreadSizeNC;
1042 StorageIndex contractId = is_lhs_vec ? linearLocalThreadId % Properties::LocalThreadSizeC
1043 : linearLocalThreadId / Properties::LocalThreadSizeNC;
1044 const StorageIndex cGroupSize = itemID.get_group_range(0) / nonContractGroupSize;
1045 const StorageIndex nonContractGroupId =
1046 is_lhs_vec ? itemID.get_group(0) / cGroupSize : itemID.get_group(0) % nonContractGroupSize;
1047 const StorageIndex contractGroupId =
1048 is_lhs_vec ? itemID.get_group(0) % cGroupSize : itemID.get_group(0) / nonContractGroupSize;
1049 auto out_ptr = out_res.get_pointer() + (IsFinal ? 0 : contractGroupId * nonContractDim);
1050
1051 const StorageIndex nonContractGroupOffset = nonContractGroupId * Properties::TileSizeDimNC;
1052 const StorageIndex contractGroupOffset = contractGroupId * Properties::TileSizeDimC;
1053 auto outScratchIndex = nonContractId + contractId * Properties::LocalThreadSizeNC;
1054 const StorageIndex globalNonContractDimOffset = nonContractGroupOffset + nonContractId;
1055 const StorageIndex globalContractDimOffset = contractGroupOffset + contractId;
1056 auto local_output = scratch_ptr + OutScratchOffset;
1057 const bool is_internal = nonContractDim - nonContractGroupOffset >= Properties::TileSizeDimNC &&
1058 contractDim - contractGroupOffset >= Properties::TileSizeDimC;
1059 is_internal
1060 ? compute_panel<true>(itemID, vec, mat, local_output, out_ptr,
1062 scratch_ptr, contractGroupOffset,
1063#endif
1064 nonContractGroupOffset, linearLocalThreadId, contractDim, nonContractDim, contractId,
1065 nonContractId, globalContractDimOffset, globalNonContractDimOffset, outScratchIndex)
1066 : compute_panel<false>(itemID, vec, mat, local_output, out_ptr,
1068 scratch_ptr, contractGroupOffset,
1069#endif
1070 nonContractGroupOffset, linearLocalThreadId, contractDim, nonContractDim, contractId,
1071 nonContractId, globalContractDimOffset, globalNonContractDimOffset, outScratchIndex);
1072 }
1073 template <bool is_internal_block, typename OutPtr>
1075 const cl::sycl::nd_item<1> &itemID, const VectorMapper &vec, const TensorMapper &mat, OutScalar *local_output,
1076 OutPtr out_ptr,
1078 OutScalar *scratch_ptr, const StorageIndex contractGroupOffset,
1079#endif
1080 const StorageIndex nonContractGroupOffset, const StorageIndex linearLocalThreadId, StorageIndex contractDim,
1081 StorageIndex nonContractDim, StorageIndex contractId, StorageIndex nonContractId,
1082 StorageIndex globalContractDimOffset, StorageIndex globalNonContractDimOffset, StorageIndex outScratchIndex) {
1083 OutScalar outScalar[Properties::WorkLoadPerThreadNC] = {OutScalar(0)};
1084 // Reading the vector
1085#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON
1086 const StorageIndex vectorOffset = contractGroupOffset + linearLocalThreadId;
1087 extract_block<VecBlockProperties, is_internal_block, KFactor,
1088 Properties::LocalThreadSizeNC * Properties::LocalThreadSizeC>(vec, scratch_ptr, linearLocalThreadId,
1089 vectorOffset, contractDim);
1090
1091 itemID.barrier(cl::sycl::access::fence_space::local_space);
1092 auto in_scratch_ptr = scratch_ptr + contractId;
1093#endif
1094
1095 StorageIndex privateOffsetC = 0;
1097 for (StorageIndex i = 0; i < Properties::WorkLoadPerThreadC; i++) {
1098 StorageIndex privateOffsetNC = 0;
1099 bool contract_conds = ((globalContractDimOffset + privateOffsetC) < contractDim);
1100#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON
1101 auto vecScalar = *in_scratch_ptr;
1102#else
1103 auto vecScalar = (check_boundary<is_internal_block>(contract_conds))
1104 ? vec(is_lhs_vec ? StorageIndex(0) : globalContractDimOffset + privateOffsetC,
1105 is_lhs_vec ? globalContractDimOffset + privateOffsetC : StorageIndex(0))
1106 : OutScalar(0);
1107#endif
1109 for (StorageIndex j = 0; j < Properties::WorkLoadPerThreadNC; j++) {
1110 auto matScalar = (check_boundary<is_internal_block>(
1111 contract_conds && ((globalNonContractDimOffset + privateOffsetNC) < nonContractDim)))
1112 ? mat(is_lhs_vec ? globalContractDimOffset + privateOffsetC
1113 : globalNonContractDimOffset + privateOffsetNC,
1114 is_lhs_vec ? globalNonContractDimOffset + privateOffsetNC
1115 : globalContractDimOffset + privateOffsetC)
1116 : OutScalar(0);
1117
1118 outScalar[j] = cl::sycl::mad(matScalar, vecScalar, outScalar[j]);
1119 privateOffsetNC += Properties::LocalThreadSizeNC;
1120 }
1121 privateOffsetC += Properties::LocalThreadSizeC;
1122#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON
1123 in_scratch_ptr += Properties::LocalThreadSizeC;
1124#endif
1125 }
1126
1127 auto out_scratch_ptr = local_output + outScratchIndex;
1128 // Each block of 16*16 elements in shared memory should reduce to 16*1
1130 for (StorageIndex j = 0; j < Properties::WorkLoadPerThreadNC; j++) {
1131 *out_scratch_ptr = outScalar[j];
1132
1133 out_scratch_ptr += (Properties::LocalThreadSizeNC * Properties::LocalThreadSizeC);
1134 }
1135 if (is_lhs_vec) {
1136 nonContractId = linearLocalThreadId % Properties::LocalThreadSizeNC;
1137 contractId = linearLocalThreadId / Properties::LocalThreadSizeNC;
1138 outScratchIndex = nonContractId + contractId * Properties::LocalThreadSizeNC;
1139 }
1140
1141 out_scratch_ptr = local_output + outScratchIndex;
1143 for (StorageIndex j = 0; j < Properties::WorkLoadPerThreadNC; j++) {
1145 for (StorageIndex offset = Properties::LocalThreadSizeC >> 1; offset > 0; offset >>= 1) {
1146 itemID.barrier(cl::sycl::access::fence_space::local_space);
1147 if (contractId < offset) {
1148 StorageIndex myNeigbourId = (Properties::LocalThreadSizeNC * offset);
1149 *out_scratch_ptr += out_scratch_ptr[myNeigbourId];
1150 }
1151 }
1152 // moving to next 16 by 16 block
1153 out_scratch_ptr += (Properties::LocalThreadSizeNC * Properties::LocalThreadSizeC);
1154 }
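// Worked example (illustrative values): with LocalThreadSizeC = 16 the reduction offsets are 8, 4, 2 and 1; in the
// first pass the thread with contractId c < 8 adds the partial sum of the thread with contractId c + 8 (same
// nonContractId), and after four passes the contractId == 0 row holds the full sum for each non-contracting index.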
1155
1156 if (contractId == 0) {
1157 out_scratch_ptr = local_output + nonContractId;
1158 StorageIndex global_final_offset = nonContractGroupOffset + nonContractId;
1159 out_ptr += global_final_offset;
1161 for (StorageIndex j = 0; j < Properties::WorkLoadPerThreadNC; j++) {
1162 if (check_boundary<is_internal_block>(global_final_offset < nonContractDim)) {
1163 auto res = *out_scratch_ptr;
1164
1165 *out_ptr = res;
1166 out_ptr += Properties::LocalThreadSizeNC;
1167 }
1168 // moving to next 16 by 16 block to get the next 16 reduced elements
1169 out_scratch_ptr += (Properties::LocalThreadSizeNC * Properties::LocalThreadSizeC);
1170 if (!(is_internal_block)) global_final_offset += Properties::LocalThreadSizeNC;
1171 }
1172 }
1173 }
1174
1175 template <typename InputBlockProperties, bool is_internal_block, int CFactor, int GroupSize, typename Input,
1176 typename Local>
1177 static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void extract_block(const Input &inpt, Local *local_ptr,
1178 const StorageIndex &linearLocalThreadId,
1179 const StorageIndex &cOffset, const StorageIndex &C) {
1180 local_ptr += InputBlockProperties::c_stride * linearLocalThreadId;
1181 StorageIndex cIndex = cOffset;
1182 for (StorageIndex cId = 0; cId < CFactor / InputBlockProperties::c_stride; cId++) {
1183 if (check_boundary<is_internal_block>(cIndex + InputBlockProperties::c_stride - 1 < C)) {
1184 auto val = read<InputBlockProperties::packet_load, InputBlockProperties::is_coalesced_layout,
1185 InputBlockProperties::is_rhs, typename InputBlockProperties::OutType>(inpt, StorageIndex(0),
1186 cIndex, StorageIndex(1));
1187 write<StorageIndex, 1, data_source::local_mem>(val, local_ptr);
1188 } else {
1190 for (StorageIndex i = 0; i < InputBlockProperties::elements_per_access; i++) {
1191 OutScalar val =
1192 (cIndex + i < C)
1193 ? read<false, InputBlockProperties::is_coalesced_layout, InputBlockProperties::is_rhs, OutScalar>(
1194 inpt, StorageIndex(0), cIndex + i, StorageIndex(1))
1195 : OutScalar(0);
1196 write<StorageIndex, 1, data_source::local_mem>(val, local_ptr + i);
1197 }
1198 }
1199 local_ptr += InputBlockProperties::c_stride * GroupSize;
1200 cIndex += InputBlockProperties::c_stride * GroupSize;
1201 }
1202 }
1203};
1204#endif
1205
1206#ifndef EIGEN_SYCL_DISABLE_SCALAR
1207
1239template <typename OutScalar, typename LhsScalar, typename RhsScalar, typename OutAccessor, typename LhsMapper,
1240 typename RhsMapper, typename StorageIndex, bool Vectorizable>
1242 typedef cl::sycl::accessor<OutScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local> Scratch;
1244 const LhsMapper lhs;
1245 const RhsMapper rhs;
1246 OutAccessor out_res;
1247 const StorageIndex rng;
1248
1250 GeneralScalarContraction(Scratch scratch_, const LhsMapper lhs_, const RhsMapper rhs_, OutAccessor out_res_,
1251 const StorageIndex rng_)
1252 : scratch(scratch_), lhs(lhs_), rhs(rhs_), out_res(out_res_), rng(rng_) {}
1253
1254 EIGEN_DEVICE_FUNC void operator()(cl::sycl::nd_item<1> itemID) {
1255 auto out_ptr = out_res.get_pointer();
1256 auto scratch_ptr = scratch.get_pointer().get();
1257
1258 StorageIndex globalid = itemID.get_global_id(0);
1259 StorageIndex localid = itemID.get_local_id(0);
1260 OutScalar accumulator = OutScalar(0);
1261 for (StorageIndex i = globalid; i < rng; i += itemID.get_global_range(0)) {
1262 accumulator = cl::sycl::mad(lhs(0, i), rhs(i, 0), accumulator);
1263 }
1264 auto out_scratch_ptr = scratch_ptr + localid;
1265 *out_scratch_ptr = accumulator;
1266 for (StorageIndex offset = itemID.get_local_range(0) >> 1; offset > 0; offset >>= 1) {
1267 itemID.barrier(cl::sycl::access::fence_space::local_space);
1268 if (localid < offset) {
1269 *out_scratch_ptr = (accumulator += out_scratch_ptr[offset]);
1270 }
1271 }
1272 if (localid == 0) {
1273 out_ptr[itemID.get_group(0)] = accumulator;
1274 }
1275 }
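// Note: each work-item first accumulates a grid-stride slice of the K products (i = globalid, globalid +
// get_global_range(0), ...), the work-group then tree-reduces the per-thread partial sums in local memory, and
// thread 0 writes one partial result per work-group; when more than one work-group is launched these partials are
// reduced again by the follow-up kernel issued in launchSC below.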
1276};
1277#endif
1278
1279} // namespace internal
1280} // namespace TensorSycl
1281
1282template <typename Indices, typename LeftArgType, typename RightArgType, typename OutputKernelType>
1284 Eigen::SyclDevice>
1285 : public TensorContractionEvaluatorBase<TensorEvaluator<
1286 const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, Eigen::SyclDevice>> {
1287 static_assert(std::is_same<OutputKernelType, const NoOpOutputKernel>::value,
1288 "SYCL tensor contraction does not support output kernels.");
1289
1290 typedef Eigen::SyclDevice Device;
1291
1299 typedef typename Base::Storage Storage;
1301 struct TripleDim {
1305 TripleDim(const StorageIndex M_, const StorageIndex N_, const StorageIndex K_) : M(M_), N(N_), K(K_) {}
1306 };
1307 enum {
1311 };
1312
1313 static EIGEN_CONSTEXPR int LDims = Base::LDims;
1314 static EIGEN_CONSTEXPR int RDims = Base::RDims;
1315 static EIGEN_CONSTEXPR int ContractDims = Base::ContractDims;
1316
1319
1321 typedef array<StorageIndex, LDims - ContractDims> left_nocontract_t;
1322 typedef array<StorageIndex, RDims - ContractDims> right_nocontract_t;
1323
1324 static const int NumDims = LDims + RDims - 2 * ContractDims;
1325
1327
1332
1335
1336 template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered>
1337 struct input_mapper_propertis {
1338 static EIGEN_CONSTEXPR bool is_lhs_matrix = (LDims == 2 && ContractDims == 1) || lhs_inner_dim_contiguous;
1339 static EIGEN_CONSTEXPR bool is_rhs_matrix =
1340 (RDims == 2 && ContractDims == 1) || (rhs_inner_dim_contiguous && !rhs_inner_dim_reordered);
1341 };
1342
1343 TensorEvaluator(const XprType &op, const Device &device) : Base(op, device) {}
1344
1345 // We need to redefine this method to make nvcc happy
1347 this->m_leftImpl.evalSubExprsIfNeeded(NULL);
1348 this->m_rightImpl.evalSubExprsIfNeeded(NULL);
1349 if (!data) {
1350 this->m_result = this->m_device.get(
1351 static_cast<Scalar *>(this->m_device.allocate_temp(this->dimensions().TotalSize() * sizeof(Scalar))));
1352 data = this->m_result;
1353 }
1354 evalToSycl(data);
1355 return (this->m_result != NULL);
1356 }
1357 const Eigen::SyclDevice &device() const { return this->m_device; }
1358 void evalToSycl(typename Base::EvaluatorPointerType buffer) const {
1359 if (this->m_lhs_inner_dim_contiguous) {
1360 if (this->m_rhs_inner_dim_contiguous) {
1361 if (this->m_rhs_inner_dim_reordered) {
1362 evalTyped<true, true, true, Unaligned>(buffer);
1363 } else {
1364 evalTyped<true, true, false, Unaligned>(buffer);
1365 }
1366 } else {
1367 if (this->m_rhs_inner_dim_reordered) {
1368 evalTyped<true, false, true, Unaligned>(buffer);
1369 } else {
1370 evalTyped<true, false, false, Unaligned>(buffer);
1371 }
1372 }
1373 } else {
1374 if (this->m_rhs_inner_dim_contiguous) {
1375 if (this->m_rhs_inner_dim_reordered) {
1376 evalTyped<false, true, true, Unaligned>(buffer);
1377 } else {
1378 evalTyped<false, true, false, Unaligned>(buffer);
1379 }
1380 } else {
1381 if (this->m_rhs_inner_dim_reordered) {
1382 evalTyped<false, false, true, Unaligned>(buffer);
1383 } else {
1384 evalTyped<false, false, false, Unaligned>(buffer);
1385 }
1386 }
1387 }
1388 }
1389
1390 template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
1391 void evalTyped(typename Base::EvaluatorPointerType buffer) const {
1392 const auto triple_dim = TripleDim{this->m_i_size, this->m_j_size, this->m_k_size};
1395 PacketType<CoeffReturnType, Device>::size, lhs_inner_dim_contiguous, false, Unaligned, MakeSYCLPointer>
1396 LhsMapper;
1397
1400 PacketType<CoeffReturnType, Device>::size, rhs_inner_dim_contiguous,
1401 rhs_inner_dim_reordered, Unaligned, MakeSYCLPointer>
1402 RhsMapper;
1403
1404 // initialize data mappers
1405 LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides,
1406 this->m_left_contracting_strides, this->m_k_strides);
1407
1408 RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides,
1409 this->m_right_contracting_strides, this->m_k_strides);
1410
1411#ifndef EIGEN_SYCL_DISABLE_SCALAR
1412 if (triple_dim.M == 1 && triple_dim.N == 1) {
1413 launchSC(buffer, lhs, rhs, triple_dim.K);
1414 } else
1415#endif
1416#ifndef EIGEN_SYCL_DISABLE_GEMV
1417 if (triple_dim.M != 1 && triple_dim.N == 1) {
1418 LaunchVT<false>(buffer, rhs, lhs, triple_dim.M, triple_dim.K);
1419 } else if (triple_dim.M == 1 && triple_dim.N != 1) {
1420 LaunchVT<true>(buffer, lhs, rhs, triple_dim.N, triple_dim.K);
1421 } else // This is equivalent to if (m != 1 && n != 1)
1422#endif
1423 {
1424 typedef input_mapper_propertis<lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered>
1425 inpt_mapper_properties;
1426#ifndef EIGEN_SYCL_DISABLE_SKINNY
1427 bool skinny = false;
1428 auto platform_name = this->device().getPlatformName();
1429 // This is based on empirical calculation for AMD r9-nano and Fiji
1430 if (platform_name.find("AMD") == 0) {
1431 skinny = (triple_dim.M < triple_dim.K || triple_dim.N < triple_dim.K) &&
1432 ((triple_dim.M < 1024 && triple_dim.N < 1024) ||
1433 (uint64_t(triple_dim.M * triple_dim.N) < uint64_t(triple_dim.K)));
1434 } else {
1435 skinny = (((std::max(triple_dim.K, triple_dim.N) / std::min(triple_dim.K, triple_dim.N)) > 100) ||
1436 ((std::max(triple_dim.K, triple_dim.M) / std::min(triple_dim.K, triple_dim.M)) > 100) ||
1437 ((std::max(triple_dim.N, triple_dim.M) / std::min(triple_dim.N, triple_dim.M)) > 100));
1438 }
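// Worked example (illustrative values): on a non-AMD platform, M = 4096, N = 8, K = 4096 gives
// max(K, N) / min(K, N) = 512 > 100, so the contraction is treated as skinny; on an AMD platform, M = N = 512 and
// K = 4096 satisfies (M < K) with M and N both below 1024, so it is also treated as skinny.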
1439 if (skinny)
1440 adjustTT<true, inpt_mapper_properties>(buffer, lhs, rhs, triple_dim);
1441 else
1442#endif // EIGEN_SYCL_DISABLE_SKINNY
1443 adjustTT<false, inpt_mapper_properties>(buffer, lhs, rhs, triple_dim);
1444 }
1445 }
1446
1447 template <bool skinny, typename input_mapper_properties, typename LhsMapper, typename RhsMapper>
1448 void EIGEN_ALWAYS_INLINE adjustTT(EvaluatorPointerType buffer, const LhsMapper &lhs, const RhsMapper &rhs,
1449 const TripleDim &triple_dim) const {
1450#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON
1451 if (device().has_local_memory()) {
1453 launchTT<TensorSycl::internal::contraction_type::local, skinny, input_mapper_properties, PanelParameters>(
1454 buffer, lhs, rhs, triple_dim);
1455 }
1456#endif
1457#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_OFF
1458 if (!(device().has_local_memory())) {
1460 launchTT<TensorSycl::internal::contraction_type::no_local, skinny, input_mapper_properties, PanelParameters>(
1461 buffer, lhs, rhs, triple_dim);
1462 }
1463#endif
1464 }
1465
1466 template <TensorSycl::internal::contraction_type ct, bool skinny, typename input_mapper_properties,
1467 typename Properties, typename LhsMapper, typename RhsMapper>
1468 void launchTT(EvaluatorPointerType buffer, const LhsMapper &lhs, const RhsMapper &rhs,
1469 const TripleDim &triple_dim) const {
1470 const StorageIndex roundUpM = Eigen::TensorSycl::internal::roundUp(triple_dim.M, Properties::TileSizeDimM);
1471 const StorageIndex roundUpN = Eigen::TensorSycl::internal::roundUp(triple_dim.N, Properties::TileSizeDimN);
1472 const StorageIndex groupSizeM = roundUpM / Properties::TileSizeDimM;
1473 const StorageIndex groupSizeN = roundUpN / Properties::TileSizeDimN;
1474
1475 const StorageIndex roundUpK = Eigen::TensorSycl::internal::roundUp(triple_dim.K, Properties::TileSizeDimK);
1476 StorageIndex totalTilesK = roundUpK / Properties::TileSizeDimK;
1477 StorageIndex groupSizeK =
1478 skinny
1479 ? std::max(std::min(totalTilesK,
1480 (StorageIndex)(device().getPowerOfTwo(device().getNumSyclMultiProcessors(), true) * 4) /
1481 (groupSizeM * groupSizeN)),
1482 StorageIndex(1))
1483 : StorageIndex(1);
1484
1485 const StorageIndex numTilesPerGroup = Eigen::TensorSycl::internal::roundUp(totalTilesK, groupSizeK) / groupSizeK;
1486
1487 const StorageIndex totalGroupSize = groupSizeM * groupSizeN * groupSizeK;
1488
1489 const StorageIndex localRange = Properties::LocalThreadSizeM * Properties::LocalThreadSizeN;
1490 const StorageIndex globalRange = totalGroupSize * localRange;
1491
1493 ? ((Properties::DoubleBuffer + 1) *
1494 (Properties::TileSizeDimM + Properties::BC) * (Properties::TileSizeDimK)) +
1495 ((Properties::DoubleBuffer + 1) * (Properties::TileSizeDimK) *
1496 (Properties::TileSizeDimN + Properties::BC))
1497 : StorageIndex(1);
1498
1499 auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(globalRange), cl::sycl::range<1>(localRange));
1500 if (groupSizeK == 1) {
1502 LhsMapper, RhsMapper, StorageIndex, Properties, TripleDim,
1503 PacketAccess, input_mapper_properties, true, ct>
1504 ContractKernelName;
1505 device().template binary_kernel_launcher<CoeffReturnType, ContractKernelName>(
1506 lhs, rhs, buffer, thread_range, scratchSize, groupSizeM, groupSizeN, numTilesPerGroup, triple_dim);
1507 } else {
1509 LhsMapper, RhsMapper, StorageIndex, Properties, TripleDim,
1510 PacketAccess, input_mapper_properties, false, ct>
1511 ContractKernelName;
1512 CoeffReturnType *temp_pointer = static_cast<CoeffReturnType *>(
1513 device().allocate_temp(triple_dim.M * triple_dim.N * groupSizeK * sizeof(CoeffReturnType)));
1514 EvaluatorPointerType tmp_global_accessor = device().get(temp_pointer);
1515
1516 device().template binary_kernel_launcher<CoeffReturnType, ContractKernelName>(
1517 lhs, rhs, tmp_global_accessor, thread_range, scratchSize, groupSizeM, groupSizeN, numTilesPerGroup,
1518 triple_dim);
1519
1521 auto op = Op();
1524 ReductionKernel;
1525
1526 device().template unary_kernel_launcher<CoeffReturnType, ReductionKernel>(
1527 tmp_global_accessor, buffer,
1528 cl::sycl::nd_range<1>(cl::sycl::range<1>(StorageIndex(
1529 Eigen::TensorSycl::internal::roundUp(triple_dim.M * triple_dim.N, localRange))),
1530 cl::sycl::range<1>(localRange)),
1531 StorageIndex(1), op, StorageIndex(triple_dim.M * triple_dim.N), groupSizeK);
1532
1533 device().deallocate_temp(temp_pointer);
1534 }
1535 }
1536
1537#ifndef EIGEN_SYCL_DISABLE_GEMV
1538 template <bool is_lhs_vec, typename VectorMapper, typename TensorMapper, typename StorageIndex>
1539 void EIGEN_ALWAYS_INLINE LaunchVT(EvaluatorPointerType buffer, const VectorMapper &vec, const TensorMapper &mat,
1540 StorageIndex NC, StorageIndex C) const {
1541 const StorageIndex nonContractDim = NC;
1542 EIGEN_CONSTEXPR StorageIndex NCFactor = 1;
1543 EIGEN_CONSTEXPR StorageIndex CFactor = 1;
1544 EIGEN_CONSTEXPR StorageIndex NCWindow = 16;
1546 Properties;
1547 const StorageIndex roundUpC = Eigen::TensorSycl::internal::roundUp(C, Properties::TileSizeDimC);
1548 const StorageIndex cNumGroups = roundUpC / (Properties::LocalThreadSizeC * Properties::WorkLoadPerThreadC);
1549 const StorageIndex roundUpNC = Eigen::TensorSycl::internal::roundUp(nonContractDim, Properties::TileSizeDimNC);
1550 const StorageIndex nCNumGroups = roundUpNC / (Properties::LocalThreadSizeNC * Properties::WorkLoadPerThreadNC);
1551 const StorageIndex globalRange =
1552 (roundUpNC / (Properties::WorkLoadPerThreadNC)) * (roundUpC / (Properties::WorkLoadPerThreadC));
1553 const StorageIndex localRange = Properties::LocalThreadSizeNC * Properties::LocalThreadSizeC;
1554 const StorageIndex scratchSize =
1555 (Properties::WorkLoadPerThreadNC + CFactor) * Properties::LocalThreadSizeC * Properties::LocalThreadSizeNC;
1556 auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(globalRange), cl::sycl::range<1>(localRange));
1557 if (cNumGroups > 1) {
1559 TensorMapper, StorageIndex, Properties, CFactor, false,
1560 is_lhs_vec, false>
1561 ContractKernelName;
1562 CoeffReturnType *temp_pointer =
1563 static_cast<CoeffReturnType *>(device().allocate_temp(nonContractDim * cNumGroups * sizeof(CoeffReturnType)));
1564 EvaluatorPointerType tmp_global_accessor = device().get(temp_pointer);
1565
1566 device().template binary_kernel_launcher<CoeffReturnType, ContractKernelName>(
1567 vec, mat, tmp_global_accessor, thread_range, scratchSize, nCNumGroups, nonContractDim, C);
1568
1572 ReductionKernel;
1573
1574 device().template unary_kernel_launcher<CoeffReturnType, ReductionKernel>(
1575 tmp_global_accessor, buffer,
1576 cl::sycl::nd_range<1>(cl::sycl::range<1>(Eigen::TensorSycl::internal::roundUp(nonContractDim, localRange)),
1577 cl::sycl::range<1>(localRange)),
1578 StorageIndex(1), Op(), nonContractDim, cNumGroups);
1579
1580 device().deallocate_temp(temp_pointer);
1581 } else {
1583 TensorMapper, StorageIndex, Properties, CFactor, false,
1584 is_lhs_vec, true>
1585 ContractKernelName;
1586 device().template binary_kernel_launcher<CoeffReturnType, ContractKernelName>(
1587 vec, mat, buffer, thread_range, scratchSize, nCNumGroups, nonContractDim, C);
1588 }
1589 }
1590#endif
1591
1592#ifndef EIGEN_SYCL_DISABLE_SCALAR
1593 template <typename LhsMapper, typename RhsMapper>
1594 EIGEN_ALWAYS_INLINE void launchSC(EvaluatorPointerType buffer, const LhsMapper &lhs, const RhsMapper &rhs,
1595 StorageIndex K) const {
1596 EIGEN_STATIC_ASSERT(!((EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1) &
1597 (EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1 - 1)),
1598 "The Local thread size must be a power of 2 for the reduction "
1599 "operation");
1600 EIGEN_CONSTEXPR StorageIndex local_range = EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1;
1601
1602 // Here we force the code not to be more than a 2-step reduction: our empirical research shows that if each thread
1603 // reduces at least 512 elements individually, we get better performance.
1604 const StorageIndex num_work_group = ((K + (512 * local_range - 1)) / (512 * local_range) > 1 ? local_range : 1);
1605 const StorageIndex global_range = num_work_group * local_range;
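// Worked example (illustrative values): with EIGEN_SYCL_LOCAL_THREAD_DIM0 == EIGEN_SYCL_LOCAL_THREAD_DIM1 == 16
// (an assumption), local_range = 256 and 512 * local_range = 131072; for K <= 131072 a single work-group is used
// and the result is written directly, while for larger K, 256 work-groups (65536 work-items) each produce one
// partial sum that the follow-up reduction kernel below combines into the final scalar.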
1606
1608 CoeffReturnType, LhsScalar, RhsScalar, EvaluatorPointerType, LhsMapper, RhsMapper, StorageIndex, false>
1609 ContractKernelName;
1610 auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(global_range), cl::sycl::range<1>(local_range));
1611 if (num_work_group > 1) {
1612 CoeffReturnType *temp_pointer =
1613 static_cast<CoeffReturnType *>(device().allocate_temp(num_work_group * sizeof(CoeffReturnType)));
1614 EvaluatorPointerType tmp_global_accessor = device().get(temp_pointer);
1615 device().template binary_kernel_launcher<CoeffReturnType, ContractKernelName>(lhs, rhs, tmp_global_accessor,
1616 thread_range, local_range, K);
1617 typedef Eigen::internal::SumReducer<CoeffReturnType> Op;
1618 typedef Eigen::TensorSycl::internal::SecondStepFullReducer<CoeffReturnType, Op, EvaluatorPointerType,
1619 EvaluatorPointerType, StorageIndex, local_range>
1620 GenericRKernel;
1621 device().template unary_kernel_launcher<CoeffReturnType, GenericRKernel>(
1622 tmp_global_accessor, buffer,
1623 cl::sycl::nd_range<1>(cl::sycl::range<1>(local_range), cl::sycl::range<1>(local_range)), local_range, Op());
1624
1625 device().deallocate_temp(temp_pointer);
1626 } else {
1627 device().template binary_kernel_launcher<CoeffReturnType, ContractKernelName>(lhs, rhs, buffer, thread_range,
1628 local_range, K);
1629 }
1630 }
1631#endif
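The work-group rule in launchSC can be checked with an equally small standalone sketch (again not Eigen code; the local_range value is an assumption, in Eigen it is EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1). The formula only introduces the second reduction kernel when K exceeds 512 * local_range; otherwise a single work-group reduces everything in one pass.

#include <cstdio>

int main() {
  const long local_range = 256;   // assumption: a power of two, as the static assert requires
  const long per_thread = 512;    // empirical minimum number of elements reduced per thread
  const long Ks[] = {100000L, 50000000L};
  for (long K : Ks) {
    // ceil(K / (512 * local_range)) > 1  <=>  K > 512 * local_range
    const long num_work_group =
        ((K + (per_thread * local_range - 1)) / (per_thread * local_range) > 1) ? local_range : 1;
    const long global_range = num_work_group * local_range;
    std::printf("K=%ld -> num_work_group=%ld, global_range=%ld\n", K, num_work_group, global_range);
  }
  return 0;
}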
1632
1633 EIGEN_STRONG_INLINE void cleanup() {
1634 this->m_leftImpl.cleanup();
1635 this->m_rightImpl.cleanup();
1636
1637 if (this->m_result) {
1638 this->m_device.deallocate_temp(this->m_result);
1639 this->m_result = NULL;
1640 }
1641 }
1642 // The placeholder accessors must be bound to a SYCL command group handler
1643 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
1644 this->m_leftImpl.bind(cgh);
1645 this->m_rightImpl.bind(cgh);
1646 this->m_result.bind(cgh);
1647 }
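// Explanatory note (not in the original source): in SYCL 1.2.1, placeholder accessors are
// constructed without a command group handler and must be registered with one (via
// handler::require()) inside the command group that submits the kernel. bind() gives the
// SyclDevice kernel launchers a hook to do that for the left/right evaluators and the
// result buffer before each contraction or reduction kernel is enqueued.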
1648};
1649} // namespace Eigen
1650#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_SYCL_H
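For context, the following minimal usage sketch (not part of this header; the shapes, device selector, and explicit buffer handling are illustrative) shows the kind of expression whose evaluation on an Eigen::SyclDevice dispatches into the launchers defined above.

#define EIGEN_USE_SYCL
#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  // Illustrative device setup; any SYCL device selector works here.
  Eigen::QueueInterface queue_interface(cl::sycl::default_selector{});
  Eigen::SyclDevice sycl_device(&queue_interface);

  Eigen::Tensor<float, 2> a(64, 128), b(128, 32), c(64, 32);
  a.setRandom();
  b.setRandom();

  float *gpu_a = static_cast<float *>(sycl_device.allocate(a.size() * sizeof(float)));
  float *gpu_b = static_cast<float *>(sycl_device.allocate(b.size() * sizeof(float)));
  float *gpu_c = static_cast<float *>(sycl_device.allocate(c.size() * sizeof(float)));
  sycl_device.memcpyHostToDevice(gpu_a, a.data(), a.size() * sizeof(float));
  sycl_device.memcpyHostToDevice(gpu_b, b.data(), b.size() * sizeof(float));

  Eigen::TensorMap<Eigen::Tensor<float, 2>> ta(gpu_a, 64, 128), tb(gpu_b, 128, 32), tc(gpu_c, 64, 32);
  Eigen::array<Eigen::IndexPair<int>, 1> dims = {Eigen::IndexPair<int>(1, 0)};

  // Contract a's second dimension with b's first; this expression is evaluated by
  // TensorEvaluator<const TensorContractionOp<...>, Eigen::SyclDevice> from this file.
  tc.device(sycl_device) = ta.contract(tb, dims);

  sycl_device.memcpyDeviceToHost(c.data(), gpu_c, c.size() * sizeof(float));
  sycl_device.deallocate(gpu_a);
  sycl_device.deallocate(gpu_b);
  sycl_device.deallocate(gpu_c);
  return 0;
}

Roughly speaking, the evaluator picks the scalar path when both non-contracting extents collapse to one, the vector-tensor path when only one side does, and the general tensor-tensor path otherwise.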